Commit e7c8de4: Changes to worker
alpayariyak committed Nov 16, 2023 (1 parent: 24feadd)
Showing 4 changed files with 56 additions and 49 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -1,6 +1,6 @@
# Base image
# The following docker base image is recommended by VLLM:
FROM runpod/pytorch:2.0.1-py3.10-cuda11.8.0-devel
FROM runpod/base:0.4.1-cuda11.8.0

# Use bash shell with pipefail option
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
@@ -10,7 +10,7 @@ WORKDIR /

# Update and upgrade the system packages (Worker Template)
ARG DEBIAN_FRONTEND=noninteractive
RUN pip uninstall torch -y

RUN pip install torch==2.0.1 -f https://download.pytorch.org/whl/cu118
COPY builder/setup.sh /setup.sh
RUN chmod +x /setup.sh && \
46 changes: 35 additions & 11 deletions README.md
@@ -9,20 +9,44 @@
🚀 | This serverless worker runs vLLM, a high-throughput LLM inference and serving engine, behind the scenes and is integrated into RunPod's serverless environment. It supports dynamic auto-scaling using RunPod's built-in autoscaling feature.
</div>

#### Docker Arguments:
1. `HUGGING_FACE_HUB_TOKEN`: Your private Hugging Face token. This token is required for downloading models that necessitate agreement to an End User License Agreement (EULA), such as the llama2 family of models.
2. `MODEL_NAME`: The Hugging Face model to use. Please ensure that the chosen model is supported by vLLM. Refer to the list of supported models for compatibility.
3. `TOKENIZER`: (Optional) The specified tokenizer to use. If you want to use the default tokenizer for the model, do not provide this docker argument at all.
4. `STREAMING`: Whether to use HTTP Streaming or not. Specify True if you want to enable HTTP Streaming; otherwise, omit this argument.
5. `QUANTIZATION`: (Optional) `awq` to use AWQ Quantization. Base model must be in AWQ format.
## Setting up the Serverless Worker
### Docker Arguments
#### Required:
- `MODEL_NAME`: The Hugging Face model to use.
- `STREAMING`: Whether to use HTTP Streaming or not.
More information on receiving streaming responses from Serverless Endpoints can be found at [Endpoint URLs](https://docs.runpod.io/docs/serverless-endpoint-urls#streamjob_id), and a detailed example at [Llama2 7B Chat | Streaming Token Outputs](https://docs.runpod.io/reference/llama2-7b-chat#streaming-token-outputs). A short client-side sketch also follows this section.
#### Optional:
- `HUGGING_FACE_HUB_TOKEN`: Your Hugging Face token to access private or gated models. You can get your token [here](https://huggingface.co/settings/token).
- `TOKENIZER`: The specified tokenizer to use. If you want to use the default tokenizer for the model, do not provide this docker argument at all.
- `QUANTIZATION`: `awq` to use AWQ Quantization. Base model must be in AWQ format.
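
For the `STREAMING` option above, the sketch below shows roughly how a client might submit a job and read incremental output once an endpoint is deployed. It assumes the standard RunPod serverless REST routes (`/run` to submit, `/stream/{job_id}` to poll) and uses placeholder values for the endpoint ID and API key; the exact shape of the streamed chunks is not specified here and may differ.

```python
import os
import time

import requests

# Hypothetical placeholders: substitute your own endpoint ID and API key.
ENDPOINT_ID = "your_endpoint_id"
API_KEY = os.environ["RUNPOD_API_KEY"]
BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"
HEADERS = {"Authorization": f"Bearer {API_KEY}"}

# Submit a job; the worker reads the prompt from input.prompt.
job = requests.post(
    f"{BASE_URL}/run",
    headers=HEADERS,
    json={"input": {"prompt": "Explain what a serverless GPU worker is."}},
    timeout=30,
).json()
job_id = job["id"]

# Poll the /stream endpoint until the job reaches a terminal status,
# printing whatever partial output each poll returns.
while True:
    chunk = requests.get(f"{BASE_URL}/stream/{job_id}", headers=HEADERS, timeout=30).json()
    print(chunk)
    if chunk.get("status") in ("COMPLETED", "FAILED", "CANCELLED"):
        break
    time.sleep(1)
```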

#### llama2 7B Chat:
`docker build . --platform linux/amd64 --build-arg HUGGING_FACE_HUB_TOKEN=your_hugging_face_token_here --build-arg MODEL_NAME=meta-llama/Llama-2-7b-chat-hf --build-arg TOKENIZER=hf-internal-testing/llama-tokenizer --build-arg STREAMING=True`
### Compatible Models
- LLaMA & LLaMA-2
- Mistral
- MPT
- OPT
- Qwen
- Aquila & Aquila2
- Baichuan
- BLOOM
- Falcon
- GPT-2
- GPT BigCode
- GPT-J
- GPT-NeoX
- InternLM

#### llama2 13B Chat:
`docker build . --platform linux/amd64 --build-arg HUGGING_FACE_HUB_TOKEN=your_hugging_face_token_here --build-arg MODEL_NAME=meta-llama/Llama-2-13b-chat-hf --build-arg TOKENIZER=hf-internal-testing/llama-tokenizer --build-arg STREAMING=True`
> [!IMPORTANT]
> If you are using private models or ones that are gated, such as Llama 2, you must provide your Hugging Face token as a docker argument.

### Examples
#### llama2 7B Chat:
```bash
docker build . --platform linux/amd64 --build-arg MODEL_NAME=meta-llama/Llama-2-7b-chat-hf --build-arg STREAMING=True --build-arg HUGGING_FACE_HUB_TOKEN=your_hugging_face_token_here
```

Please make sure to replace `your_hugging_face_token_here` with your actual Hugging Face token to enable model downloads that require it.

Ensure that you have Docker installed and properly set up before running the docker build commands. Once built, you can deploy this serverless worker in your desired environment with confidence that it will automatically scale based on demand. For further inquiries or assistance, feel free to contact our support team.
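
As a rough sketch of what a request to the deployed worker looks like: the handler reads `prompt` and an optional `sampling_params` object from the job's `input`. The endpoint ID, API key, and the exact set of accepted sampling keys are placeholders and assumptions here; the worker's `validate_and_set_sampling_params` decides what is actually allowed.

```python
import os

import requests

# Hypothetical endpoint ID; the API key is read from the environment.
ENDPOINT_ID = "your_endpoint_id"
API_KEY = os.environ["RUNPOD_API_KEY"]

payload = {
    "input": {
        "prompt": "Write a haiku about GPUs.",
        # Common vLLM-style sampling options; which keys the worker accepts
        # depends on its validation logic and is an assumption here.
        "sampling_params": {"max_tokens": 128, "temperature": 0.7, "top_p": 0.9},
    }
}

# /runsync blocks until the job completes and returns the result directly.
response = requests.post(
    f"https://api.runpod.ai/v2/{ENDPOINT_ID}/runsync",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json=payload,
    timeout=300,
)
print(response.json())
```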

10 changes: 0 additions & 10 deletions builder/setup.sh

This file was deleted.

45 changes: 19 additions & 26 deletions src/handler.py
@@ -12,35 +12,43 @@

# Prepare the model and tokenizer
MODEL_NAME = os.environ.get('MODEL_NAME')
MODEL_BASE_PATH = os.environ.get('MODEL_BASE_PATH', '/runpod-volume/')
MODEL_BASE_PATH = os.environ.get('MODEfL_BASE_PATH', '/runpod-volume/')
STREAMING = os.environ.get('STREAMING', False) == 'True'
TOKENIZER = os.environ.get('TOKENIZER', None)
USE_FULL_METRICS = os.environ.get('USE_FULL_METRICS', True)
DTYPE = "auto"
USE_HF_CHAT_TEMPLATE = os.environ.get('USE_HF_CHAT_TEMPLATE', False) == 'True'

# Set up quantization-related parameters
QUANTIZATION = os.environ.get('QUANTIZATION', None)

if type(QUANTIZATION) is str and QUANTIZATION.lower() != "awq":
    QUANTIZATION = None
    print("Invalid quantization parameter. Using default value of None.")
else:
    DTYPE = "half"

if not MODEL_NAME:
    print("Error: The model has not been provided.")

if len(TOKENIZER) == 0:
    print("Error: The tokenizer has not been provided. Defaulting to MODEL_NAME.")

> @ashleykleynhans commented on Nov 22, 2023:
> This is incorrect: if it's not provided, it should default to None, and there is nothing here to default it to None.
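
A minimal sketch of the fallback the reviewer is describing (a hypothetical fix, not part of this commit): normalise an unset or empty `TOKENIZER` to `None`, which lets the engine fall back to the tokenizer bundled with `MODEL_NAME`.

```python
import os

# Hypothetical fix sketched from the review comment above, not from the commit:
# an unset or empty TOKENIZER becomes None, and passing tokenizer=None lets the
# engine fall back to the tokenizer that ships with MODEL_NAME.
TOKENIZER = os.environ.get('TOKENIZER') or None
if TOKENIZER is None:
    print("No tokenizer provided; using the model's own tokenizer.")
```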


# Tensor parallelism
try:
    NUM_GPU_SHARD = int(os.environ.get('NUM_GPU_SHARD', 1))
except ValueError:
    print("Error: NUM_GPU_SHARD should be an integer. Using default value of 1.")
    NUM_GPU_SHARD = 1

# Setup quantization parameter
if type(QUANTIZATION) is str and QUANTIZATION.lower() != "awq":
    QUANTIZATION = None
    print("Invalid quantization parameter. Using default value of None.")

# Prepare the engine's arguments
engine_args = AsyncEngineArgs(
model=f"{MODEL_BASE_PATH}{MODEL_NAME.split('/')[1]}",
tokenizer=TOKENIZER,
tokenizer_mode="auto",
tensor_parallel_size=NUM_GPU_SHARD,
dtype="auto" if QUANTIZATION is None else "half",
seed=0,
dtype=DTYPE,
disable_log_stats=False,
quantization=QUANTIZATION,
)
@@ -55,11 +63,8 @@

def concurrency_controller() -> bool:
    # Calculate pending sequences
    total_pending_sequences = len(llm.engine.scheduler.waiting) + len(llm.engine.scheduler.swapped)
    print("Total pending sequences in vLLM queue: {}".format(total_pending_sequences))

    # Enable auto-scaling if pending sequences exist
    return total_pending_sequences > 30
    total_queued_sequences = len(llm.engine.scheduler.waiting)
    return total_queued_sequences > 0


def prepare_metrics() -> dict:
@@ -144,13 +149,9 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None]

    # Utilize the built-in llama2 template if a llama2 base model is being employed.
    llama_models = ["llama-2-7b-chat-hf", "llama-2-13b-chat-hf", "llama-2-70b-chat-hf", "elinas/chronos-13b-v2"]
    if any(model_name.lower() in MODEL_NAME.lower() for model_name in llama_models):
        template = LLAMA2_TEMPLATE
    else:
        template = DEFAULT_TEMPLATE

    # Create the prompt using the template.
    prompt = template(job_input['prompt'])
    prompt = job_input['prompt']

    # Validate and set sampling parameters
    sampling_params = validate_and_set_sampling_params(job_input.get('sampling_params', None))
@@ -296,16 +297,8 @@ async def handler(job: dict) -> dict[str, list]:

    # Retrieve the job input.
    job_input = job['input']

    # Utilize the built-in llama2 template if a llama2 base model is being employed.
    llama_models = ["llama-2-7b-chat-hf", "llama-2-13b-chat-hf", "llama-2-70b-chat-hf", "elinas/chronos-13b-v2"]
    if any(model_name.lower() in MODEL_NAME.lower() for model_name in llama_models):
        template = LLAMA2_TEMPLATE
    else:
        template = DEFAULT_TEMPLATE

    # Create the prompt using the template.
    prompt = template(job_input['prompt'])
    prompt = job_input['prompt']

    # Validate and set sampling parameters
    sampling_params = validate_and_set_sampling_params(job_input.get('sampling_params', None))
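
Since both handlers now forward `job_input['prompt']` as-is instead of applying `LLAMA2_TEMPLATE` or `DEFAULT_TEMPLATE`, any chat formatting becomes the caller's responsibility. A minimal client-side sketch for a Llama-2 chat model follows; the helper name and the default system prompt are made up for illustration.

```python
# Hypothetical client-side helper: wrap a user message in the Llama-2 chat
# format before sending it as input.prompt, since the worker no longer applies
# a prompt template on its own.
def format_llama2_chat(user_message: str,
                       system_prompt: str = "You are a helpful assistant.") -> str:
    return f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{user_message} [/INST]"


prompt = format_llama2_chat("Summarize what this worker does.")
print(prompt)
```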
