From a5dc8b53b79107940d9ea9c29c3f198b0d187709 Mon Sep 17 00:00:00 2001
From: alpayariyak
Date: Tue, 16 Jan 2024 17:58:46 -0500
Subject: [PATCH] Fix Concurrency, Add Max Model Length

---
 README.md                | 10 +++++++---
 builder/requirements.txt |  2 +-
 src/engine.py            | 14 ++++++++++++--
 src/handler.py           |  4 ++--
 4 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 304c36b..4b81e48 100644
--- a/README.md
+++ b/README.md
@@ -23,16 +23,20 @@ We now offer a pre-built Docker Image for the vLLM Worker that you can configure
 
 - `MODEL_NAME`: Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`).
 - **Optional**:
+  - `MAX_MODEL_LEN`: Maximum number of tokens the engine can handle. (default: maximum supported by the model)
   - `MODEL_BASE_PATH`: Model storage directory (default: `/runpod-volume`).
   - `HF_TOKEN`: Hugging Face token for private and gated models (e.g., Llama, Falcon).
-  - `NUM_GPU_SHARD`: Number of GPUs to split the model across (default: `1`).
-  - `QUANTIZATION`: AWQ (`awq`) or SqueezeLLM (`squeezellm`) quantization.
-  - `MAX_CONCURRENCY`: Max concurrent requests (default: `100`).
+  - `NUM_GPU_SHARD`: Number of GPUs to split the model across. (default: `1`)
+  - `QUANTIZATION`: AWQ (`awq`), SqueezeLLM (`squeezellm`), or GPTQ (`gptq`) quantization. The specified model repository must contain a quantized model. (default: `None`)
+  - `TRUST_REMOTE_CODE`: Whether to trust remote code from the Hugging Face model repository. (default: `0`)
+  - `MAX_CONCURRENCY`: Max concurrent requests. (default: `100`)
   - `DEFAULT_BATCH_SIZE`: Token streaming batch size (default: `30`). This reduces the number of HTTP calls, increasing speed 8-10x vs non-batching, matching non-streaming performance.
   - `DISABLE_LOG_STATS`: Enable (`0`) or disable (`1`) vLLM stats logging.
   - `DISABLE_LOG_REQUESTS`: Enable (`0`) or disable (`1`) request logging.
 
 ### Option 2: Build Docker Image with Model Inside
+[!WARNING] If you are getting errors while building the image, try adding `ENV MAX_JOBS` to the Dockerfile and increase the Docker memory limit to at least 25GB.
+
 To build an image with the model baked in, you must specify the following docker arguments when building the image:
 
 #### Arguments:

diff --git a/builder/requirements.txt b/builder/requirements.txt
index 5654ea1..19d116f 100644
--- a/builder/requirements.txt
+++ b/builder/requirements.txt
@@ -1,5 +1,5 @@
 hf_transfer
-runpod==1.5.1
+runpod==1.5.2
 huggingface-hub
 packaging
 typing-extensions==4.7.1

diff --git a/src/engine.py b/src/engine.py
index cbd6a26..dde7036 100644
--- a/src/engine.py
+++ b/src/engine.py
@@ -29,7 +29,7 @@ def apply_chat_template(self, input: Union[str, list[dict[str, str]]]) -> str:
         )
 
 
-class VLLMEngine:
+class vLLMEngine:
     def __init__(self):
         load_dotenv()  # For local development
         self.config = self._initialize_config()
@@ -41,11 +41,13 @@ def _initialize_config(self):
         return {
             "model": os.getenv("MODEL_NAME"),
             "download_dir": os.getenv("MODEL_BASE_PATH", "/runpod-volume/"),
-            "quantization": os.getenv("QUANTIZATION"),
+            "quantization": self._get_quantization(),
             "dtype": "auto" if os.getenv("QUANTIZATION") is None else "half",
             "disable_log_stats": bool(int(os.getenv("DISABLE_LOG_STATS", 1))),
             "disable_log_requests": bool(int(os.getenv("DISABLE_LOG_REQUESTS", 1))),
+            "trust_remote_code": bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
             "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.98)),
+            "max_model_len": self._get_max_model_len(),
             "tensor_parallel_size": self._get_num_gpu_shard(),
         }
 
@@ -65,9 +67,17 @@ def _get_num_gpu_shard(self):
         logging.info("Using %s GPU shards", final_num_gpu_shard)
         return final_num_gpu_shard
 
+    def _get_max_model_len(self):
+        max_model_len = os.getenv("MAX_MODEL_LEN")
+        return int(max_model_len) if max_model_len is not None else None
+
     def _get_n_current_jobs(self):
         total_sequences = len(self.llm.engine.scheduler.waiting) + len(self.llm.engine.scheduler.swapped) + len(self.llm.engine.scheduler.running)
         return total_sequences
+
+    def _get_quantization(self):
+        quantization = os.getenv("QUANTIZATION", "").lower()
+        return quantization if quantization in ["awq", "squeezellm", "gptq"] else None
 
     def concurrency_modifier(self, current_concurrency):
         n_current_jobs = self._get_n_current_jobs()

diff --git a/src/handler.py b/src/handler.py
index c388238..6fc10eb 100644
--- a/src/handler.py
+++ b/src/handler.py
@@ -2,9 +2,9 @@
 from typing import Generator
 import runpod
 from utils import validate_sampling_params, random_uuid
-from engine import VLLMEngine
+from engine import vLLMEngine
 
-vllm_engine = VLLMEngine()
+vllm_engine = vLLMEngine()
 async def handler(job: dict) -> Generator[dict, None, None]:
     job_input = job["input"]
     llm_input = job_input.get("messages", job_input.get("prompt"))
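
Note for reviewers (not part of the patch): the sketch below shows one way a config dict like the one assembled in `_initialize_config` above can be handed to vLLM. It is a minimal sketch, assuming a vLLM 0.2.x-style API where `AsyncEngineArgs`/`AsyncLLMEngine` accept these fields; the helper name `build_engine` is made up for illustration and the actual engine.py may wire this differently.

# Illustrative sketch, not part of the patch: resolving the new environment
# variables and passing them to vLLM's async engine. Assumes a vLLM 0.2.x-style
# API; `build_engine` is a hypothetical helper name.
import os

from vllm import AsyncEngineArgs, AsyncLLMEngine


def build_engine() -> AsyncLLMEngine:
    quantization = os.getenv("QUANTIZATION", "").lower()
    max_model_len = os.getenv("MAX_MODEL_LEN")
    engine_args = AsyncEngineArgs(
        model=os.getenv("MODEL_NAME"),
        download_dir=os.getenv("MODEL_BASE_PATH", "/runpod-volume/"),
        # Only pass a method vLLM supports; anything else falls back to None.
        quantization=quantization if quantization in ("awq", "squeezellm", "gptq") else None,
        dtype="auto" if not quantization else "half",
        trust_remote_code=bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
        gpu_memory_utilization=float(os.getenv("GPU_MEMORY_UTILIZATION", 0.98)),
        # None lets vLLM default to the model's own maximum context length.
        max_model_len=int(max_model_len) if max_model_len is not None else None,
        tensor_parallel_size=int(os.getenv("NUM_GPU_SHARD", 1)),
    )
    return AsyncLLMEngine.from_engine_args(engine_args)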
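
The concurrency change only takes effect once the serverless runtime is told to consult the modifier. A minimal, hypothetical wiring is sketched below; it assumes the pinned runpod SDK accepts a `concurrency_modifier` entry in `runpod.serverless.start`, and the real handler.py may register it differently.

# Hypothetical wiring sketch, not part of the patch: letting the RunPod
# serverless runtime ask the engine how many jobs it may accept.
import runpod

from engine import vLLMEngine

vllm_engine = vLLMEngine()


async def handler(job: dict):
    # Placeholder handler body; the real one streams batched vLLM output.
    yield {"job_id": job["id"], "status": "accepted"}


runpod.serverless.start(
    {
        "handler": handler,
        # Consulted before new work is pulled; vLLMEngine.concurrency_modifier
        # adjusts the allowed concurrency using the scheduler's
        # waiting/swapped/running queue sizes.
        "concurrency_modifier": vllm_engine.concurrency_modifier,
    }
)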