Bump to vLLM 0.2.6, fixes and improvements
alpayariyak committed Dec 19, 2023
1 parent d6c145b commit f324bef
Showing 5 changed files with 44 additions and 44 deletions.
18 changes: 7 additions & 11 deletions Dockerfile
@@ -8,7 +8,8 @@ ARG WORKER_CUDA_VERSION=11.8 # Required duplicate to keep in scope
ENV WORKER_CUDA_VERSION=${WORKER_CUDA_VERSION} \
HF_DATASETS_CACHE="/runpod-volume/huggingface-cache/datasets" \
HUGGINGFACE_HUB_CACHE="/runpod-volume/huggingface-cache/hub" \
-TRANSFORMERS_CACHE="/runpod-volume/huggingface-cache/hub"
+TRANSFORMERS_CACHE="/runpod-volume/huggingface-cache/hub" \
+HF_TRANSFER=1


# Install Python dependencies
@@ -20,13 +21,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \

# Install torch and vllm based on CUDA version
RUN if [[ "${WORKER_CUDA_VERSION}" == 11.8* ]]; then \
-wget https://github.com/alpayariyak/vllm/releases/download/0.2.4-runpod-11.8/vllm-0.2.4+cu118-cp311-cp311-linux_x86_64.whl && \
-python3.11 -m pip install vllm-0.2.4+cu118-cp311-cp311-linux_x86_64.whl && \
-rm vllm-0.2.4+cu118-cp311-cp311-linux_x86_64.whl; \
-python3.11 -m pip uninstall torch -y; \
-python3.11 -m pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118; \
-python3.11 -m pip uninstall xformers -y; \
-python3.11 -m pip install --upgrade xformers --index-url https://download.pytorch.org/whl/cu118; \
+python3.11 -m pip install -e git+https://github.com/alpayariyak/[email protected]#egg=vllm; \
+python3.11 -m pip install -U --force-reinstall torch==2.1.2 xformers==0.0.23.post1 --index-url https://download.pytorch.org/whl/cu118; \
else \
python3.11 -m pip install -e git+https://github.com/alpayariyak/vllm.git#egg=vllm; \
fi && \
@@ -42,9 +38,9 @@ ARG MODEL_BASE_PATH="/runpod-volume/"
ARG HF_TOKEN=""
ARG QUANTIZATION=""
RUN if [ -n "$MODEL_NAME" ]; then \
-python3.11 /download_model.py --model $MODEL_NAME --download_dir $MODEL_BASE_PATH; \
-export MODEL_BASE_PATH=$MODEL_BASE_PATH; \
-export MODEL_NAME=$MODEL_NAME; \
+export MODEL_BASE_PATH=$MODEL_BASE_PATH && \
+export MODEL_NAME=$MODEL_NAME && \
+python3.11 /download_model.py --model $MODEL_NAME; \
fi && \
if [ -n "$QUANTIZATION" ]; then \
export QUANTIZATION=$QUANTIZATION; \
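For context on the baked-in model step above: the build now exports `MODEL_NAME` and `MODEL_BASE_PATH` and then runs `/download_model.py`. A minimal sketch of what such a pre-download step can look like with `hf_transfer` enabled (illustrative only; this is not the repository's actual `download_model.py`, and the argument names and cache layout are assumptions based on the ENV block above):

```python
# Hypothetical stand-in for a model pre-download step; not the repo's download_model.py.
import argparse
import os

# hf_transfer (added to builder/requirements.txt in this commit) is picked up by
# huggingface_hub when this environment variable is set before import.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from huggingface_hub import snapshot_download

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    args = parser.parse_args()

    # MODEL_BASE_PATH is exported by the Dockerfile before this script runs;
    # HF_TOKEN is an optional build arg.
    base_path = os.environ.get("MODEL_BASE_PATH", "/runpod-volume/")
    snapshot_download(
        repo_id=args.model,
        cache_dir=os.path.join(base_path, "huggingface-cache", "hub"),
        token=os.environ.get("HF_TOKEN") or None,
    )
```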
40 changes: 20 additions & 20 deletions README.md
@@ -46,26 +46,26 @@ To build an image with the model baked in, you must specify the following docker
`sudo docker build -t username/image:tag --build-arg MODEL_NAME="openchat/openchat_3.5" --build-arg MODEL_BASE_PATH="/models" .`

### Compatible Models
-- LLaMA & LLaMA-2
-- Mistral
-- Mixtral (Mistral MoE)
-- Yi
-- ChatGLM
-- Phi
-- MPT
-- OPT
-- Qwen
-- Aquila & Aquila2
-- Baichuan
-- BLOOM
-- Falcon
-- GPT-2
-- GPT BigCode
-- GPT-J
-- GPT-NeoX
-- InternLM

-And any other models supported by vLLM 0.2.4.
+- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
+- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
+- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
+- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
+- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
+- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
+- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
+- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
+- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
+- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
+- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
+- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
+- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
+- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+And any other models supported by vLLM 0.2.6.


Ensure that you have Docker installed and properly set up before running the docker build commands. Once built, you can deploy this serverless worker in your desired environment with confidence that it will automatically scale based on demand. For further inquiries or assistance, feel free to contact our support team.
1 change: 1 addition & 0 deletions builder/requirements.txt
@@ -1,3 +1,4 @@
+hf_transfer
runpod==1.4.2
huggingface-hub
packaging
4 changes: 2 additions & 2 deletions src/constants.py
@@ -14,8 +14,8 @@
'min_p': float,
'use_beam_search': bool,
'length_penalty': float,
-'early_stopping': [bool, str],
-'stop': [str, list],
+'early_stopping': (bool, str),
+'stop': (str, list),
'stop_token_ids': list,
'ignore_eos': bool,
'max_tokens': int,
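The list-to-tuple switch above matters because the sampling-parameter validator in `src/utils.py` (next file below) wraps a single type into a tuple and then tests values with `isinstance`, which accepts a type or a tuple of types but rejects a list. A small standalone illustration, not code from this repository:

```python
# isinstance() accepts a single type or a tuple of types:
print(isinstance("###", (bool, str)))  # True

# A list of types is rejected at runtime, which is why the
# sampling_param_types entries now use tuples instead of lists:
try:
    isinstance("###", [bool, str])
except TypeError as err:
    print(f"TypeError: {err}")
```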
25 changes: 14 additions & 11 deletions src/utils.py
@@ -51,28 +51,31 @@ def initialize_llm_engine() -> AsyncLLMEngine:
logging.error(f"Error initializing vLLM engine: {e}")
raise


-def validate_and_convert_sampling_params(params: Dict[str, Any]) -> SamplingParams:
+def validate_and_convert_sampling_params(params: Dict[str, Any]) -> Dict[str, Any]:
    validated_params = {}

    for key, value in params.items():
        expected_type = sampling_param_types.get(key)
        if value is None:
            validated_params[key] = None
            continue

        if expected_type is None:
            continue

        if not isinstance(expected_type, tuple):
            expected_type = (expected_type,)

-        try:
-            validated_params[key] = next(
-                casted_value for t in expected_type
-                if (casted_value := t(value)) or True
-            )
-        except (TypeError, ValueError):
-            continue
+        if any(isinstance(value, t) for t in expected_type):
+            validated_params[key] = value
+        else:
+            try:
+                casted_value = next(
+                    t(value) for t in expected_type
+                    if isinstance(value, t)
+                )
+                validated_params[key] = casted_value
+            except (TypeError, ValueError, StopIteration):
+                continue

-    return SamplingParams(**validated_params)
+    return validated_params
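
Note the return-type change: `validate_and_convert_sampling_params` now returns the validated dict instead of constructing `SamplingParams` itself, so the caller is responsible for building the `SamplingParams` object. A hedged sketch of how the dict might be consumed at a call site (the surrounding handler code is an assumption, not taken from this commit; the example keys are ones visible in `src/constants.py` above):

```python
# Illustrative call-site sketch; not the repository's actual handler code.
from vllm import SamplingParams

from src.utils import validate_and_convert_sampling_params  # the function changed above

raw_params = {
    "max_tokens": 256,        # int matches the declared type -> kept
    "stop": ["###", "\n\n"],  # list is one of the allowed types for 'stop' -> kept
    "unknown_knob": 42,       # no entry in sampling_param_types -> dropped
}

validated = validate_and_convert_sampling_params(raw_params)
sampling_params = SamplingParams(**validated)  # caller now constructs SamplingParams
```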
