From db7167d57f518ad509e3248831ab56986e98bf24 Mon Sep 17 00:00:00 2001
From: alpayariyak
Date: Tue, 5 Mar 2024 19:14:35 +0000
Subject: [PATCH] 0.3.3

---
 .gitmodules                               |  3 ++
 Dockerfile                                |  2 +-
 README.md                                 |  4 +-
 src/config.py                             |  2 +-
 {vllm-base => vllm-base-image}/Dockerfile | 47 +++++++---------------
 {vllm-base => vllm-base-image}/README.md  |  0
 vllm-base/download_required_files.sh      | 12 ------
 7 files changed, 20 insertions(+), 50 deletions(-)
 create mode 100644 .gitmodules
 rename {vllm-base => vllm-base-image}/Dockerfile (57%)
 rename {vllm-base => vllm-base-image}/README.md (100%)
 delete mode 100644 vllm-base/download_required_files.sh

diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..e4283dc
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "vllm-base-image/vllm"]
+	path = vllm-base-image/vllm
+	url = /devdisk/inference/worker-vllm/vllm-base-image/vllm
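
Note: vLLM is now vendored as a git submodule at vllm-base-image/vllm instead of being cloned by a helper script. The recorded url is a local path on the author's machine, so a fresh clone will likely need to re-point it at a reachable remote before the base image can build. A minimal sketch, assuming the RunPod fork that the deleted download_required_files.sh below used to clone:

    # Re-point the submodule at a public remote, then fetch it
    git config -f .gitmodules submodule.vllm-base-image/vllm.url \
        https://github.com/runpod/vllm-fork-for-sls-worker.git
    git submodule sync
    git submodule update --init vllm-base-image/vllm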

diff --git a/Dockerfile b/Dockerfile
index 1ce2457..3f1bf2f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 ARG WORKER_CUDA_VERSION=11.8.0
-FROM runpod/worker-vllm:base-0.3.1-cuda${WORKER_CUDA_VERSION} AS vllm-base
+FROM runpod/worker-vllm:base-0.3.2-cuda${WORKER_CUDA_VERSION} AS vllm-base
 
 RUN apt-get update -y \
     && apt-get install -y python3-pip
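
Note: the worker still selects its CUDA flavor through the WORKER_CUDA_VERSION build argument (default 11.8.0); this hunk only bumps the pinned base image from 0.3.1 to 0.3.2. A hypothetical build invocation (the image tag is illustrative):

    # CUDA 11.8 (the default)
    docker build -t my-worker-vllm .
    # CUDA 12.1
    docker build --build-arg WORKER_CUDA_VERSION=12.1.0 -t my-worker-vllm .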

diff --git a/README.md b/README.md
index ea7a9d8..716e6ac 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 Deploy Blazing-fast LLMs powered by [vLLM](https://github.com/vllm-project/vllm) on RunPod Serverless in a few clicks.
 
-Worker Version: 0.3.1 | vLLM Version: 0.3.2
+Worker Version: 0.3.2 | vLLM Version: 0.3.3
 
 [![CD | Docker-Build-Release](https://github.com/runpod-workers/worker-vllm/actions/workflows/docker-build-release.yml/badge.svg)](https://github.com/runpod-workers/worker-vllm/actions/workflows/docker-build-release.yml)
 
@@ -88,7 +88,7 @@ This table provides a quick reference to the image tags you should use based on
 **LLM Settings**
 | `MODEL_NAME`**\*** | - | `str` | Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`). |
 | `MODEL_REVISION` | `None` | `str` |Model revision(branch) to load. |
-| `MAX_MODEL_LENGTH` | Model's maximum | `int` |Maximum number of tokens for the engine to handle per request. |
+| `MAX_MODEL_LEN` | Model's maximum | `int` |Maximum number of tokens for the engine to handle per request. |
 | `BASE_PATH` | `/runpod-volume` | `str` |Storage directory for Huggingface cache and model. Utilizes network storage if attached when pointed at `/runpod-volume`, which will have only one worker download the model once, which all workers will be able to load. If no network volume is present, creates a local directory within each worker. |
 | `LOAD_FORMAT` | `auto` | `str` |Format to load model in. |
 | `HF_TOKEN` | - | `str` |Hugging Face token for private and gated models. |
diff --git a/src/config.py b/src/config.py
index f9b1e03..d7bef44 100644
--- a/src/config.py
+++ b/src/config.py
@@ -39,7 +39,7 @@ def _initialize_config(self):
             "trust_remote_code": bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
             "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.95)),
             "max_parallel_loading_workers": None if device_count() > 1 or not os.getenv("MAX_PARALLEL_LOADING_WORKERS") else int(os.getenv("MAX_PARALLEL_LOADING_WORKERS")),
-            "max_model_len": int(os.getenv("MAX_MODEL_LENGTH")) if os.getenv("MAX_MODEL_LENGTH") else None,
+            "max_model_len": int(os.getenv("MAX_MODEL_LEN")) if os.getenv("MAX_MODEL_LEN") else None,
             "tensor_parallel_size": device_count(),
             "seed": int(os.getenv("SEED")) if os.getenv("SEED") else None,
             "kv_cache_dtype": os.getenv("KV_CACHE_DTYPE"),
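
Note: the environment variable is renamed from MAX_MODEL_LENGTH to MAX_MODEL_LEN, matching the max_model_len engine argument it feeds; a deployment that still sets the old name will now silently fall back to the model's maximum context length. For example (the value is illustrative):

    # Before (now ignored by config.py): MAX_MODEL_LENGTH=8192
    # After:
    export MAX_MODEL_LEN=8192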

diff --git a/vllm-base/Dockerfile b/vllm-base-image/Dockerfile
similarity index 57%
rename from vllm-base/Dockerfile
rename to vllm-base-image/Dockerfile
index 2ea7224..7ba7e73 100644
--- a/vllm-base/Dockerfile
+++ b/vllm-base-image/Dockerfile
@@ -17,25 +17,16 @@ ARG WORKER_CUDA_VERSION
 RUN apt-get update -y \
     && apt-get install -y python3-pip git
 
-RUN if [ "${WORKER_CUDA_VERSION}" = "12.1.0" ]; then \
-        ldconfig /usr/local/cuda-12.1/compat/; \
-    fi
-
 # Set working directory
 WORKDIR /vllm-installation
 
 # Install build and runtime dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements.txt requirements.txt
+COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
-        pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
-    fi
-
 # Install development dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements-dev.txt requirements-dev.txt
+COPY vllm/requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt
 
@@ -45,25 +36,15 @@ FROM dev AS build
 ARG WORKER_CUDA_VERSION
 
 # Install build dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements-build.txt requirements-build.txt
+COPY vllm/requirements-build.txt requirements-build.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-build.txt
 
 # Copy necessary files
-COPY vllm-${WORKER_CUDA_VERSION}/csrc csrc
-COPY vllm-${WORKER_CUDA_VERSION}/setup.py setup.py
-COPY vllm-12.1.0/pyproject.toml pyproject.toml
-COPY vllm-${WORKER_CUDA_VERSION}/vllm/__init__.py vllm/__init__.py
-
-# Conditional installation based on CUDA version
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
-        pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
-        rm pyproject.toml; \
-    elif [ "${WORKER_CUDA_VERSION}" != "12.1.0" ]; then \
-        echo "WORKER_CUDA_VERSION not supported"; \
-        exit 1; \
-    fi
+COPY vllm/csrc csrc
+COPY vllm/setup.py setup.py
+COPY vllm/pyproject.toml pyproject.toml
+COPY vllm/vllm/__init__.py vllm/__init__.py
 
 # Set environment variables for building extensions
 ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
@@ -72,8 +53,10 @@ ARG max_jobs=48
 ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=1024
 ENV NVCC_THREADS=${nvcc_threads}
-
+ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION}
+ENV VLLM_INSTALL_PUNICA_KERNELS=0
 # Build extensions
+RUN ldconfig /usr/local/cuda-$(echo "$WORKER_CUDA_VERSION" | sed 's/\.0$//')/compat/
 RUN python3 setup.py build_ext --inplace
 
 FROM nvidia/cuda:${WORKER_CUDA_VERSION}-runtime-ubuntu22.04 AS vllm-base
@@ -88,19 +71,15 @@ RUN apt-get update -y \
 
 # Set working directory
 WORKDIR /vllm-installation
+
 # Install runtime dependencies
-COPY vllm-${WORKER_CUDA_VERSION}/requirements.txt requirements.txt
+COPY vllm/requirements-${WORKER_CUDA_VERSION}.txt requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements.txt
 
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if [ "${WORKER_CUDA_VERSION}" = "11.8.0" ]; then \
-        pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
-    fi
-
 # Copy built files from the build stage
 COPY --from=build /vllm-installation/vllm/*.so /vllm-installation/vllm/
-COPY vllm-${WORKER_CUDA_VERSION}/vllm vllm
+COPY vllm/vllm vllm
 
 # Set PYTHONPATH environment variable
 ENV PYTHONPATH="/"
diff --git a/vllm-base/README.md b/vllm-base-image/README.md
similarity index 100%
rename from vllm-base/README.md
rename to vllm-base-image/README.md
diff --git a/vllm-base/download_required_files.sh b/vllm-base/download_required_files.sh
deleted file mode 100644
index b6138d3..0000000
--- a/vllm-base/download_required_files.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-git clone https://github.com/runpod/vllm-fork-for-sls-worker.git
-
-cp -r vllm-fork-for-sls-worker vllm-12.1.0
-cp -r vllm-fork-for-sls-worker vllm-11.8.0
-rm -rf vllm-fork-for-sls-worker
-
-cd vllm-11.8.0
-git checkout cuda-11.8
-
-echo "vLLM Base Image Builder Setup Complete."
\ No newline at end of file
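
Note: with download_required_files.sh removed and the duplicated vllm-11.8.0/vllm-12.1.0 trees replaced by the single vllm submodule (CUDA-specific pins now live in requirements-${WORKER_CUDA_VERSION}.txt), building the base image reduces to something like the sketch below, assuming the submodule is initialized and with an illustrative tag name:

    # Build from vllm-base-image/ so the COPY vllm/... paths resolve
    cd vllm-base-image
    docker build --build-arg WORKER_CUDA_VERSION=12.1.0 \
        -t worker-vllm:base-0.3.2-cuda12.1.0 .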