From a5dc8b53b79107940d9ea9c29c3f198b0d187709 Mon Sep 17 00:00:00 2001
From: alpayariyak
Date: Tue, 16 Jan 2024 17:58:46 -0500
Subject: [PATCH] Fix Concurrency, Add Max Model Length

---
 README.md                | 10 +++++++---
 builder/requirements.txt |  2 +-
 src/engine.py            | 14 ++++++++++++--
 src/handler.py           |  4 ++--
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 304c36b..4b81e48 100644
--- a/README.md
+++ b/README.md
@@ -23,16 +23,20 @@ We now offer a pre-built Docker Image for the vLLM Worker that you can configure
 
 - `MODEL_NAME`: Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`).
 - **Optional**:
+  - `MAX_MODEL_LEN`: Maximum number of tokens the engine can handle. (default: maximum supported by the model)
   - `MODEL_BASE_PATH`: Model storage directory (default: `/runpod-volume`).
   - `HF_TOKEN`: Hugging Face token for private and gated models (e.g., Llama, Falcon).
-  - `NUM_GPU_SHARD`: Number of GPUs to split the model across (default: `1`).
-  - `QUANTIZATION`: AWQ (`awq`) or SqueezeLLM (`squeezellm`) quantization.
-  - `MAX_CONCURRENCY`: Max concurrent requests (default: `100`).
+  - `NUM_GPU_SHARD`: Number of GPUs to split the model across. (default: `1`)
+  - `QUANTIZATION`: AWQ (`awq`), SqueezeLLM (`squeezellm`), or GPTQ (`gptq`) quantization. The specified model repository must contain a quantized model. (default: `None`)
+  - `TRUST_REMOTE_CODE`: Whether to trust remote code from the Hugging Face model repository. (default: `0`)
+  - `MAX_CONCURRENCY`: Max concurrent requests. (default: `100`)
   - `DEFAULT_BATCH_SIZE`: Token streaming batch size (default: `30`). This reduces the number of HTTP calls, increasing speed 8-10x vs non-batching, matching non-streaming performance.
   - `DISABLE_LOG_STATS`: Enable (`0`) or disable (`1`) vLLM stats logging.
   - `DISABLE_LOG_REQUESTS`: Enable (`0`) or disable (`1`) request logging.
 
 ### Option 2: Build Docker Image with Model Inside
+[!WARNING] If you are getting errors while building the image, try adding `ENV MAX_JOBS` to the Dockerfile and increase the Docker memory limit to at least 25GB.
+
 To build an image with the model baked in, you must specify the following docker arguments when building the image:
 
 #### Arguments:

diff --git a/builder/requirements.txt b/builder/requirements.txt
index 5654ea1..19d116f 100644
--- a/builder/requirements.txt
+++ b/builder/requirements.txt
@@ -1,5 +1,5 @@
 hf_transfer
-runpod==1.5.1
+runpod==1.5.2
 huggingface-hub
 packaging
 typing-extensions==4.7.1

diff --git a/src/engine.py b/src/engine.py
index cbd6a26..dde7036 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -29,7 +29,7 @@ def apply_chat_template(self, input: Union[str, list[dict[str, str]]]) -> str:
         )
 
 
-class VLLMEngine:
+class vLLMEngine:
     def __init__(self):
         load_dotenv()  # For local development
         self.config = self._initialize_config()
@@ -41,11 +41,13 @@ def _initialize_config(self):
         return {
             "model": os.getenv("MODEL_NAME"),
             "download_dir": os.getenv("MODEL_BASE_PATH", "/runpod-volume/"),
-            "quantization": os.getenv("QUANTIZATION"),
+            "quantization": self._get_quantization(),
             "dtype": "auto" if os.getenv("QUANTIZATION") is None else "half",
             "disable_log_stats": bool(int(os.getenv("DISABLE_LOG_STATS", 1))),
             "disable_log_requests": bool(int(os.getenv("DISABLE_LOG_REQUESTS", 1))),
+            "trust_remote_code": bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
             "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.98)),
+            "max_model_len": self._get_max_model_len(),
             "tensor_parallel_size": self._get_num_gpu_shard(),
         }
 
@@ -65,9 +67,17 @@ def _get_num_gpu_shard(self):
         logging.info("Using %s GPU shards", final_num_gpu_shard)
         return final_num_gpu_shard
 
+    def _get_max_model_len(self):
+        max_model_len = os.getenv("MAX_MODEL_LEN")
+        return int(max_model_len) if max_model_len is not None else None
+
     def _get_n_current_jobs(self):
         total_sequences = len(self.llm.engine.scheduler.waiting) + len(self.llm.engine.scheduler.swapped) + len(self.llm.engine.scheduler.running)
         return total_sequences
+
+    def _get_quantization(self):
+        quantization = os.getenv("QUANTIZATION", "").lower()
+        return quantization if quantization in ["awq", "squeezellm", "gptq"] else None
 
     def concurrency_modifier(self, current_concurrency):
         n_current_jobs = self._get_n_current_jobs()

diff --git a/src/handler.py b/src/handler.py
index c388238..6fc10eb 100644
--- a/src/handler.py
+++ b/src/handler.py
@@ -2,9 +2,9 @@
 from typing import Generator
 import runpod
 from utils import validate_sampling_params, random_uuid
-from engine import VLLMEngine
+from engine import vLLMEngine
 
-vllm_engine = VLLMEngine()
+vllm_engine = vLLMEngine()
 async def handler(job: dict) -> Generator[dict, None, None]:
     job_input = job["input"]
     llm_input = job_input.get("messages", job_input.get("prompt"))
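
Note for reviewers (not part of the patch): the sketch below shows one way a config dict like the one assembled in `_initialize_config` above can be handed to vLLM. It is a minimal sketch, assuming a vLLM 0.2.x-style API where `AsyncEngineArgs`/`AsyncLLMEngine` accept these fields; the helper name `build_engine` is made up for illustration and the actual engine.py may wire this differently.

# Illustrative sketch, not part of the patch: resolving the new environment
# variables and passing them to vLLM's async engine. Assumes a vLLM 0.2.x-style
# API; `build_engine` is a hypothetical helper name.
import os

from vllm import AsyncEngineArgs, AsyncLLMEngine


def build_engine() -> AsyncLLMEngine:
    quantization = os.getenv("QUANTIZATION", "").lower()
    max_model_len = os.getenv("MAX_MODEL_LEN")
    engine_args = AsyncEngineArgs(
        model=os.getenv("MODEL_NAME"),
        download_dir=os.getenv("MODEL_BASE_PATH", "/runpod-volume/"),
        # Only pass a method vLLM supports; anything else falls back to None.
        quantization=quantization if quantization in ("awq", "squeezellm", "gptq") else None,
        dtype="auto" if not quantization else "half",
        trust_remote_code=bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
        gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", 0.98)),
        # None lets vLLM default to the model's own maximum context length.
        max_model_len=int(max_model_len) if max_model_len is not None else None,
        tensor_parallel_size=int(os.getenv("NUM_GPU_SHARD", 1)),
    )
    return AsyncLLMEngine.from_engine_args(engine_args)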
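
The concurrency change only takes effect once the serverless runtime is told to consult the modifier. A minimal, hypothetical wiring is sketched below; it assumes the pinned runpod SDK accepts a `concurrency_modifier` entry in `runpod.serverless.start`, and the real handler.py may register it differently.

# Hypothetical wiring sketch, not part of the patch: letting the RunPod
# serverless runtime ask the engine how many jobs it may accept.
import runpod

from engine import vLLMEngine

vllm_engine = vLLMEngine()


async def handler(job: dict):
    # Placeholder handler body; the real one streams batched vLLM output.
    yield {"job_id": job["id"], "status": "accepted"}


runpod.serverless.start(
    {
        "handler": handler,
        # Consulted before new work is pulled; vLLMEngine.concurrency_modifier
        # adjusts the allowed concurrency using the scheduler's
        # waiting/swapped/running queue sizes.
        "concurrency_modifier": vllm_engine.concurrency_modifier,
    }
)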