From db7167d57f518ad509e3248831ab56986e98bf24 Mon Sep 17 00:00:00 2001
From: alpayariyak
Date: Tue, 5 Mar 2024 19:14:35 +0000
Subject: [PATCH] 0.3.3

---
 .gitmodules                               |  3 ++
 Dockerfile                                |  2 +-
 README.md                                 |  4 +-
 src/config.py                             |  2 +-
 {vllm-base => vllm-base-image}/Dockerfile | 47 +++++++---------------
 {vllm-base => vllm-base-image}/README.md  |  0
 vllm-base/download_required_files.sh      | 12 ------
 7 files changed, 20 insertions(+), 50 deletions(-)
 create mode 100644 .gitmodules
 rename {vllm-base => vllm-base-image}/Dockerfile (57%)
 rename {vllm-base => vllm-base-image}/README.md (100%)
 delete mode 100644 vllm-base/download_required_files.sh

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e4283dc
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "vllm-base-image/vllm"]
+	path = vllm-base-image/vllm
+	url = /devdisk/inference/worker-vllm/vllm-base-image/vllm
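
Note: vLLM is now vendored as a git submodule at vllm-base-image/vllm instead of being cloned by a helper script. The recorded url is a local path on the author's machine, so a fresh clone will likely need to re-point it at a reachable remote before the base image can build. A minimal sketch, assuming the RunPod fork that the deleted download_required_files.sh below used to clone:

    # Re-point the submodule at a public remote, then fetch it
    git config -f .gitmodules submodule.vllm-base-image/vllm.url \
        https://github.com/runpod/vllm-fork-for-sls-worker.git
    git submodule sync
    git submodule update --init vllm-base-image/vllm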

diff --git a/Dockerfile b/Dockerfile
index 1ce2457..3f1bf2f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 ARG WORKER_CUDA_VERSION=11.8.0
-FROM runpod/worker-vllm:base-0.3.1-cuda${WORKER_CUDA_VERSION} AS vllm-base
+FROM runpod/worker-vllm:base-0.3.2-cuda${WORKER_CUDA_VERSION} AS vllm-base
 
 RUN apt-get update -y \
     && apt-get install -y python3-pip
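
Note: the worker still selects its CUDA flavor through the WORKER_CUDA_VERSION build argument (default 11.8.0); this hunk only bumps the pinned base image from 0.3.1 to 0.3.2. A hypothetical build invocation (the image tag is illustrative):

    # CUDA 11.8 (the default)
    docker build -t my-worker-vllm .
    # CUDA 12.1
    docker build --build-arg WORKER_CUDA_VERSION=12.1.0 -t my-worker-vllm .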

diff --git a/README.md b/README.md
index ea7a9d8..716e6ac 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 Deploy Blazing-fast LLMs powered by [vLLM](https://github.com/vllm-project/vllm) on RunPod Serverless in a few clicks.
 
-Worker Version: 0.3.1 | vLLM Version: 0.3.2
+Worker Version: 0.3.2 | vLLM Version: 0.3.3
 
 [![CD | Docker-Build-Release](https://github.com/runpod-workers/worker-vllm/actions/workflows/docker-build-release.yml/badge.svg)](https://github.com/runpod-workers/worker-vllm/actions/workflows/docker-build-release.yml)
 
@@ -88,7 +88,7 @@ This table provides a quick reference to the image tags you should use based on
 **LLM Settings**
 | `MODEL_NAME`**\*** | - | `str` | Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`). |
 | `MODEL_REVISION` | `None` | `str` |Model revision(branch) to load. |
-| `MAX_MODEL_LENGTH` | Model's maximum | `int` |Maximum number of tokens for the engine to handle per request. |
+| `MAX_MODEL_LEN` | Model's maximum | `int` |Maximum number of tokens for the engine to handle per request. |
 | `BASE_PATH` | `/runpod-volume` | `str` |Storage directory for Huggingface cache and model. Utilizes network storage if attached when pointed at `/runpod-volume`, which will have only one worker download the model once, which all workers will be able to load. If no network volume is present, creates a local directory within each worker. |
 | `LOAD_FORMAT` | `auto` | `str` |Format to load model in. |
 | `HF_TOKEN` | - | `str` |Hugging Face token for private and gated models. |
diff --git a/src/config.py b/src/config.py
index f9b1e03..d7bef44 100644
--- a/src/config.py
+++ b/src/config.py
@@ -39,7 +39,7 @@ def _initialize_config(self):
             "trust_remote_code": bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
             "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.95)),
             "max_parallel_loading_workers": None if device_count() > 1 or not os.getenv("MAX_PARALLEL_LOADING_WORKERS") else int(os.getenv("MAX_PARALLEL_LOADING_WORKERS")),
-            "max_model_len": int(os.getenv("MAX_MODEL_LENGTH")) if os.getenv("MAX_MODEL_LENGTH") else None,
+            "max_model_len": int(os.getenv("MAX_MODEL_LEN")) if os.getenv("MAX_MODEL_LEN") else None,
             "tensor_parallel_size": device_count(),
             "seed": int(os.getenv("SEED")) if os.getenv("SEED") else None,
             "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"),
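
Note: the environment variable is renamed from MAX_MODEL_LENGTH to MAX_MODEL_LEN, matching the max_model_len engine argument it feeds; a deployment that still sets the old name will now silently fall back to the model's maximum context length. For example (the value is illustrative):

    # Before (now ignored by config.py): MAX_MODEL_LENGTH=8192
    # After:
    export MAX_MODEL_LEN=8192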

diff --git a/vllm-base/Dockerfile b/vllm-base-image/Dockerfile
similarity index 57%
rename from vllm-base/Dockerfile
rename to vllm-base-image/Dockerfile
index 2ea7224..7ba7e73 100644
--- a/vllm-base/Dockerfile
+++ b/vllm-base-image/Dockerfile
@@ -17,25 +17,16 @@ ARG WORKER_CUDA_VERSION
 RUN apt-get update -y \
     && apt-get install -y python3-pip git
 
-RUN if [ "${WORKER_CUDA_VERSION}" = "12.1.0" ]; then \
-        ldconfig /usr/local/cuda-12.1/compat/; \
-    fi
-
 # Set working directory
 WORKDIR /vllm-installation
 
 # Install build and runtime dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements.txt requirements.txt
+COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
-        pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
-    fi
-
 # Install development dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements-dev.txt requirements-dev.txt
+COPY vllm/requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt
 
@@ -45,25 +36,15 @@ FROM dev AS build
 ARG WORKER_CUDA_VERSION
 
 # Install build dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements-build.txt requirements-build.txt
+COPY vllm/requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-build.txt
 
 # Copy necessary files
-COPY vllm-${WORKER_CUDA_VERSION}/csrc csrc
-COPY vllm-${WORKER_CUDA_VERSION}/setup.py setup.py
-COPY vllm-12.1.0/pyproject.toml pyproject.toml
-COPY vllm-${WORKER_CUDA_VERSION}/vllm/__init__.py vllm/__init__.py
-
-# Conditional installation based on CUDA version
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
-        pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
-        rm pyproject.toml; \
-    elif [ "${WORKER_CUDA_VERSION}" != "12.1.0" ]; then \
-        echo "WORKER_CUDA_VERSION not supported"; \
-        exit 1; \
-    fi
+COPY vllm/csrc csrc
+COPY vllm/setup.py setup.py
+COPY vllm/pyproject.toml pyproject.toml
+COPY vllm/vllm/__init__.py vllm/__init__.py
 
 # Set environment variables for building extensions
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
@@ -72,8 +53,10 @@ ARG max_jobs=48
 ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=1024
 ENV NVCC_THREADS=${nvcc_threads}
-
+ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION}
+ENV VLLM_INSTALL_PUNICA_KERNELS=0
 # Build extensions
+RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 RUN python3 setup.py build_ext --inplace
 
 FROM nvidia/cuda:${WORKER_CUDA_VERSION}-runtime-ubuntu22.04 AS vllm-base
@@ -88,19 +71,15 @@ RUN apt-get update -y \
 
 # Set working directory
 WORKDIR /vllm-installation
+
 # Install runtime dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements.txt requirements.txt
+COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
-        pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
-    fi
-
 # Copy built files from the build stage
 COPY --from=build /vllm-installation/vllm/*.so /vllm-installation/vllm/
-COPY vllm-${WORKER_CUDA_VERSION}/vllm vllm
+COPY vllm/vllm vllm
 
 # Set PYTHONPATH environment variable
 ENV PYTHONPATH="/"
diff --git a/vllm-base/README.md b/vllm-base-image/README.md
similarity index 100%
rename from vllm-base/README.md
rename to vllm-base-image/README.md
diff --git a/vllm-base/download_required_files.sh b/vllm-base/download_required_files.sh
deleted file mode 100644
index b6138d3..0000000
--- a/vllm-base/download_required_files.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-git clone https://github.com/runpod/vllm-fork-for-sls-worker.git
-
-cp -r vllm-fork-for-sls-worker vllm-12.1.0
-cp -r vllm-fork-for-sls-worker vllm-11.8.0
-rm -rf vllm-fork-for-sls-worker
-
-cd vllm-11.8.0
-git checkout cuda-11.8
-
-echo "vLLM Base Image Builder Setup Complete."
\ No newline at end of file
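
Note: with download_required_files.sh removed and the duplicated vllm-11.8.0/vllm-12.1.0 trees replaced by the single vllm submodule (CUDA-specific pins now live in requirements-${WORKER_CUDA_VERSION}.txt), building the base image reduces to something like the sketch below, assuming the submodule is initialized and with an illustrative tag name:

    # Build from vllm-base-image/ so the COPY vllm/... paths resolve
    cd vllm-base-image
    docker build --build-arg WORKER_CUDA_VERSION=12.1.0 \
        -t worker-vllm:base-0.3.2-cuda12.1.0 .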