From 7683cb2cb2eb5ea2b6a3a569e84cf0b3764ef713 Mon Sep 17 00:00:00 2001
From: Raushan Kumar
Date: Fri, 17 Jan 2025 11:03:54 -0800
Subject: [PATCH] Reverting the TGI image version for the LLAMA multiple-GPU
 GKE samples (#1591)

* The current image overrides HF_HOME to /tmp instead of /data. Even after
  changing the mountPath to /tmp, there is a regression in the newer TGI image
  that results in out-of-GPU-memory errors on L4 and requires at least an A2
  node. Rolling back the image version to get the sample working while the
  investigation happens in the background.

* Updating the images to GCR images that work for these models.

* Update ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml

Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>

* Update ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml

Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>

---------

Co-authored-by: Mofi Rahman
Co-authored-by: Alvaro Bartolome <36760800+alvarobartt@users.noreply.github.com>
---
 .../falcon-40b/text-generation-inference.yaml    | 5 ++++-
 .../llama2-70b/text-generation-inference.yaml    | 5 ++++-
 .../llama3-70b/text-generation-inference.yaml    | 7 +++++--
 .../mixtral-8x7b/text-generation-inference.yaml  | 5 ++++-
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml
index 107eb36309..247196c030 100644
--- a/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/falcon-40b/text-generation-inference.yaml
@@ -29,7 +29,7 @@ spec:
     spec:
       containers:
       - name: llm
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.1-4.ubuntu2204.py310
         resources:
           requests:
             cpu: "10"
@@ -51,6 +51,9 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
+        # mountPath is set to /data as that's the path the HUGGINGFACE_HUB_CACHE environment
+        # variable points to in the TGI container image, i.e. where the model downloaded from the
+        # Hub will be stored
         - mountPath: /data
           name: ephemeral-volume
       volumes:
diff --git a/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml
index 541f353e59..2642aadad3 100644
--- a/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/llama2-70b/text-generation-inference.yaml
@@ -56,7 +56,10 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp as that's the path the HF_HOME environment variable in the
+        # TGI DLCs points to, instead of the default /data used by the upstream TGI image,
+        # i.e. where the model downloaded from the Hub will be stored
+        - mountPath: /tmp
           name: ephemeral-volume
       volumes:
       - name: dshm
diff --git a/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml
index e34e297f6a..67ce8cc9b5 100644
--- a/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/llama3-70b/text-generation-inference.yaml
@@ -29,7 +29,7 @@ spec:
     spec:
       containers:
       - name: llm
-        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu124.2-3.ubuntu2204.py311
+        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.2-1.ubuntu2204.py310
         resources:
           requests:
             cpu: "10"
@@ -58,7 +58,10 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp as that's the path the HUGGINGFACE_HUB_CACHE environment
+        # variable in the TGI DLCs points to, instead of the default /data used by the upstream
+        # TGI image, i.e. where the model downloaded from the Hub will be stored
+        - mountPath: /tmp
           name: ephemeral-volume
       volumes:
       - name: dshm
diff --git a/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml b/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml
index ba7c229a8c..ec26530099 100644
--- a/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml
+++ b/ai-ml/llm-multiple-gpus/mixtral-8x7b/text-generation-inference.yaml
@@ -56,7 +56,10 @@ spec:
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
-        - mountPath: /data
+        # mountPath is set to /tmp as that's the path the HF_HOME environment variable in the
+        # TGI DLCs points to, instead of the default /data used by the upstream TGI image,
+        # i.e. where the model downloaded from the Hub will be stored
+        - mountPath: /tmp
           name: ephemeral-volume
       volumes:
       - name: dshm
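
Note (editor's sketch, not part of the patch above): the mountPath in these
manifests has to track whichever cache path the image bakes in (HF_HOME=/tmp
in the TGI DLCs vs. /data in the upstream TGI image). Assuming the container
honors an explicitly set HF_HOME (standard Hugging Face behavior, though an
assumption here rather than something this patch does), the coupling could be
pinned in the manifest so the mount point no longer depends on the image
variant:

      containers:
      - name: llm
        image: us-docker.pkg.dev/deeplearning-platform-release/gcr.io/huggingface-text-generation-inference-cu121.1-4.ubuntu2204.py310
        env:
        # Hypothetical override: force the Hub cache to /data regardless of
        # the image's baked-in default, then mount the ephemeral volume there.
        - name: HF_HOME
          value: /data
        volumeMounts:
        - mountPath: /data
          name: ephemeral-volume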