From 7683cb2cb2eb5ea2b6a3a569e84cf0b3764ef713 Mon Sep 17 00:00:00 2001
From: Raushan Kumar
Date: Fri, 17 Jan 2025 11:03:54 -0800
Subject: [PATCH] Reverting the TGI image version for the LLAMA multiple-GPU
 GKE samples (#1591)

* The current image overrides HF_HOME to /tmp instead of /data. Even after
  changing the mountPath to /tmp, there is a regression in the newer TGI image
  that results in out-of-GPU-memory errors on L4 and requires at least an A2
  node. Rolling back the image version to get the sample working while the
  investigation happens in the background.

* Updating the images to GCR images that work for these models.

* Update ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml

Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>

* Update ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml

Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>

---------

Co-authored-by: Mofi Rahman
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---
 .../falcon-40b/text-generation-inference.yaml    | 5 ++++-
 .../llama2-70b/text-generation-inference.yaml    | 5 ++++-
 .../llama3-70b/text-generation-inference.yaml    | 7 +++++--
 .../mixtral-8x7b/text-generation-inference.yaml  | 5 ++++-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml
index 107eb36309..247196c030 100644
--- a/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml
@@ -29,7 +29,7 @@ spec:
     spec:
       containers:
       - name: llm
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.1-4.ubuntu2204.py310
         resources:
           requests:
             cpu: "10"
@@ -51,6 +51,9 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as that's the path the HUGGINGFACE_HUB_CACHE environment
+        # variable points to in the TGI container image, i.e. where the model downloaded from the
+        # Hub will be stored
         - mountPath: /data
           name: ephemeral-volume
       volumes:
diff --git a/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml
index 541f353e59..2642aadad3 100644
--- a/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml
@@ -56,7 +56,10 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp as that's the path the HF_HOME environment variable in the
+        # TGI DLCs points to, instead of the default /data used by the upstream TGI image,
+        # i.e. where the model downloaded from the Hub will be stored
+        - mountPath: /tmp
           name: ephemeral-volume
       volumes:
       - name: dshm
diff --git a/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml
index e34e297f6a..67ce8cc9b5 100644
--- a/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml
@@ -29,7 +29,7 @@ spec:
     spec:
       containers:
       - name: llm
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-1.ubuntu2204.py310
         resources:
           requests:
             cpu: "10"
@@ -58,7 +58,10 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp as that's the path the HUGGINGFACE_HUB_CACHE environment
+        # variable in the TGI DLCs points to, instead of the default /data used by the upstream
+        # TGI image, i.e. where the model downloaded from the Hub will be stored
+        - mountPath: /tmp
           name: ephemeral-volume
       volumes:
       - name: dshm
diff --git a/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml
index ba7c229a8c..ec26530099 100644
--- a/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml
@@ -56,7 +56,10 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp as that's the path the HF_HOME environment variable in the
+        # TGI DLCs points to, instead of the default /data used by the upstream TGI image,
+        # i.e. where the model downloaded from the Hub will be stored
+        - mountPath: /tmp
           name: ephemeral-volume
       volumes:
       - name: dshm
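
Note (editor's sketch, not part of the patch above): the mountPath in these
manifests has to track whichever cache path the image bakes in (HF_HOME=/tmp
in the TGI DLCs vs. /data in the upstream TGI image). Assuming the container
honors an explicitly set HF_HOME (standard Hugging Face behavior, though an
assumption here rather than something this patch does), the coupling could be
pinned in the manifest so the mount point no longer depends on the image
variant:

      containers:
      - name: llm
        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.1-4.ubuntu2204.py310
        env:
        # Hypothetical override: force the Hub cache to /data regardless of
        # the image's baked-in default, then mount the ephemeral volume there.
        - name: HF_HOME
          value: /data
        volumeMounts:
        - mountPath: /data
          name: ephemeral-volume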