
Commit

Reverting the TGI image version for LLAMA multiple GPUs in GKE samples (#1591)

* The current image overrides HF_HOME to /tmp instead of /data. Even after changing the mountPath to /tmp, there is a regression in the newer TGI image that results in running out of GPU memory on L4 GPUs and requires at least an A2 node. Rolling back the image version to get the sample working while the investigation happens in the background.

* Updating the images to the GCR ones that work with these models.

* Update ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml

Co-authored-by: Alvaro Bartolome <[email protected]>

* Update ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml

Co-authored-by: Alvaro Bartolome <[email protected]>

---------

Co-authored-by: Mofi Rahman <[email protected]>
Co-authored-by: Alvaro Bartolome <[email protected]>
3 people authored Jan 17, 2025
1 parent 48a4009 commit 7683cb2
Showing 4 changed files with 17 additions and 5 deletions.
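For readers unfamiliar with the failure mode described in the commit message, here is a minimal, hypothetical sketch of the constraint involved: the ephemeral volume must be mounted at whatever path the image's Hub cache variable points to (HUGGINGFACE_HUB_CACHE in upstream TGI, HF_HOME in the DLCs), otherwise model downloads land outside the provisioned volume. The image tag, pod name, and emptyDir size below are illustrative, not taken from this commit.

```yaml
# Hypothetical fragment: the cache path and the volume mount must agree.
apiVersion: v1
kind: Pod
metadata:
  name: tgi-cache-example      # illustrative name, not from this commit
spec:
  containers:
  - name: llm
    image: ghcr.io/huggingface/text-generation-inference:1.4  # upstream TGI caches under /data
    env:
    - name: HUGGINGFACE_HUB_CACHE
      value: /data             # where models downloaded from the Hub are written
    volumeMounts:
    - mountPath: /data         # must match the cache path above, or downloads
      name: ephemeral-volume   # spill onto the node's boot disk instead
  volumes:
  - name: ephemeral-volume
    emptyDir:
      sizeLimit: 175Gi         # illustrative size for a large model
```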
@@ -29,7 +29,7 @@ spec:
spec:
containers:
- name: llm
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.1-4.ubuntu2204.py310
resources:
requests:
cpu: "10"
@@ -51,6 +51,9 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
+          # mountPath is set to /data because that is where the HUGGINGFACE_HUB_CACHE
+          # environment variable points in the TGI container image, i.e. where the
+          # model downloaded from the Hub is stored
- mountPath: /data
name: ephemeral-volume
volumes:
@@ -56,7 +56,10 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp because that is where the HF_HOME environment
+        # variable points in the TGI DLCs, instead of the default /data used in the
+        # upstream TGI image, i.e. where the model downloaded from the Hub is stored
+        - mountPath: /tmp
name: ephemeral-volume
volumes:
- name: dshm
@@ -29,7 +29,7 @@ spec:
spec:
containers:
- name: llm
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-1.ubuntu2204.py310
resources:
requests:
cpu: "10"
@@ -58,7 +58,10 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp because that is where the HUGGINGFACE_HUB_CACHE
+        # environment variable points in the TGI DLCs, instead of the default /data used
+        # in the upstream TGI image, i.e. where the model downloaded from the Hub is stored
+        - mountPath: /tmp
name: ephemeral-volume
volumes:
- name: dshm
@@ -56,7 +56,10 @@ spec:
volumeMounts:
- mountPath: /dev/shm
name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp because that is where the HF_HOME environment
+        # variable points in the TGI DLCs, instead of the default /data used in the
+        # upstream TGI image, i.e. where the model downloaded from the Hub is stored
+        - mountPath: /tmp
name: ephemeral-volume
volumes:
- name: dshm
