Commit

Fix Concurrency, Add Max Model Length
alpayariyak committed Jan 16, 2024
1 parent 6e7a195 commit a5dc8b5
Showing 4 changed files with 22 additions and 8 deletions.
10 changes: 7 additions & 3 deletions README.md
@@ -23,16 +23,20 @@ We now offer a pre-built Docker Image for the vLLM Worker that you can configure
- `MODEL_NAME`: Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`).

- **Optional**:
  - `MAX_MODEL_LEN`: Maximum number of tokens the engine can handle. (default: maximum length supported by the model)
- `MODEL_BASE_PATH`: Model storage directory (default: `/runpod-volume`).
- `HF_TOKEN`: Hugging Face token for private and gated models (e.g., Llama, Falcon).
- `NUM_GPU_SHARD`: Number of GPUs to split the model across (default: `1`).
- `QUANTIZATION`: AWQ (`awq`) or SqueezeLLM (`squeezellm`) quantization.
- `MAX_CONCURRENCY`: Max concurrent requests (default: `100`).
- `NUM_GPU_SHARD`: Number of GPUs to split the model across. (default: `1`)
  - `QUANTIZATION`: AWQ (`awq`), SqueezeLLM (`squeezellm`), or GPTQ (`gptq`) quantization. The specified model repository must contain a quantized model. (default: `None`)
  - `TRUST_REMOTE_CODE`: Whether to trust remote code from the Hugging Face model repository. (default: `0`)
- `MAX_CONCURRENCY`: Max concurrent requests. (default: `100`)
  - `DEFAULT_BATCH_SIZE`: Token streaming batch size (default: `30`). Batching reduces the number of HTTP calls, making streaming 8-10x faster than unbatched streaming while matching non-streaming performance.
- `DISABLE_LOG_STATS`: Enable (`0`) or disable (`1`) vLLM stats logging.
- `DISABLE_LOG_REQUESTS`: Enable (`0`) or disable (`1`) request logging.
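
For a quick local smoke test, the same variables can be set directly in Python before the worker starts (`src/engine.py` calls `load_dotenv()`, so a `.env` file works as well). A minimal sketch with placeholder values; only `MODEL_NAME` is required:

```python
# Minimal sketch: placeholder values for the variables documented above.
# Only MODEL_NAME is required; everything else falls back to its default.
import os

os.environ["MODEL_NAME"] = "openchat/openchat-3.5-1210"   # Hugging Face model repo
os.environ["MODEL_BASE_PATH"] = "/runpod-volume"          # model storage directory
os.environ["MAX_MODEL_LEN"] = "4096"                      # cap on engine context length
os.environ["NUM_GPU_SHARD"] = "1"                         # GPUs to split the model across
os.environ["QUANTIZATION"] = "awq"                        # repo must contain an AWQ model
os.environ["TRUST_REMOTE_CODE"] = "0"                     # "1" to allow custom HF code
os.environ["MAX_CONCURRENCY"] = "100"                     # max concurrent requests
os.environ["DISABLE_LOG_STATS"] = "1"                     # "1" disables vLLM stats logging
os.environ["DISABLE_LOG_REQUESTS"] = "1"                  # "1" disables request logging
```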

### Option 2: Build Docker Image with Model Inside
> [!WARNING]
> If you are getting errors while building the image, try adding `ENV MAX_JOBS` to the Dockerfile and increasing the Docker memory limit to at least 25GB.

To build an image with the model baked in, you must specify the following Docker build arguments:

#### Arguments:
2 changes: 1 addition & 1 deletion builder/requirements.txt
@@ -1,5 +1,5 @@
hf_transfer
runpod==1.5.1
runpod==1.5.2
huggingface-hub
packaging
typing-extensions==4.7.1
14 changes: 12 additions & 2 deletions src/engine.py
@@ -29,7 +29,7 @@ def apply_chat_template(self, input: Union[str, list[dict[str, str]]]) -> str:
        )


class VLLMEngine:
class vLLMEngine:
    def __init__(self):
        load_dotenv()  # For local development
        self.config = self._initialize_config()
@@ -41,11 +41,13 @@ def _initialize_config(self):
        return {
            "model": os.getenv("MODEL_NAME"),
            "download_dir": os.getenv("MODEL_BASE_PATH", "/runpod-volume/"),
            "quantization": os.getenv("QUANTIZATION"),
            "quantization": self._get_quantization(),
            "dtype": "auto" if os.getenv("QUANTIZATION") is None else "half",
            "disable_log_stats": bool(int(os.getenv("DISABLE_LOG_STATS", 1))),
            "disable_log_requests": bool(int(os.getenv("DISABLE_LOG_REQUESTS", 1))),
            "trust_remote_code": bool(int(os.getenv("TRUST_REMOTE_CODE", 0))),
            "gpu_memory_utilization": float(os.getenv("GPU_MEMORY_UTILIZATION", 0.98)),
            "max_model_len": self._get_max_model_len(),
            "tensor_parallel_size": self._get_num_gpu_shard(),
        }

@@ -65,9 +67,17 @@ def _get_num_gpu_shard(self):
        logging.info("Using %s GPU shards", final_num_gpu_shard)
        return final_num_gpu_shard

    def _get_max_model_len(self):
        max_model_len = os.getenv("MAX_MODEL_LEN")
        return int(max_model_len) if max_model_len is not None else None

    def _get_n_current_jobs(self):
        total_sequences = len(self.llm.engine.scheduler.waiting) + len(self.llm.engine.scheduler.swapped) + len(self.llm.engine.scheduler.running)
        return total_sequences

    def _get_quantization(self):
        quantization = os.getenv("QUANTIZATION", "").lower()
        return quantization if quantization in ["awq", "squeezellm", "gptq"] else None

    def concurrency_modifier(self, current_concurrency):
        n_current_jobs = self._get_n_current_jobs()
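
The `concurrency_modifier` above is the hook the RunPod serverless runtime uses to decide how many jobs this worker should accept at once. A rough sketch of how it might be registered, with the `runpod.serverless.start` options treated as assumptions for illustration:

```python
# Sketch of wiring the engine's concurrency_modifier into the serverless runtime.
import runpod
from engine import vLLMEngine

vllm_engine = vLLMEngine()

async def handler(job):
    # Placeholder body; the real handler streams tokens from the vLLM engine.
    yield {"echo": job["input"]}

runpod.serverless.start({
    "handler": handler,
    # Called with the current concurrency; the engine bases its answer on
    # _get_n_current_jobs(), i.e. the waiting, swapped, and running queues above.
    "concurrency_modifier": vllm_engine.concurrency_modifier,
    "return_aggregate_stream": True,  # assumed option for aggregating streamed output
})
```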
4 changes: 2 additions & 2 deletions src/handler.py
@@ -2,9 +2,9 @@
from typing import Generator
import runpod
from utils import validate_sampling_params, random_uuid
from engine import VLLMEngine
from engine import vLLMEngine

vllm_engine = VLLMEngine()
vllm_engine = vLLMEngine()
async def handler(job: dict) -> Generator[dict, None, None]:
    job_input = job["input"]
    llm_input = job_input.get("messages", job_input.get("prompt"))
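
For reference, a job payload for this handler might look like the sketch below. Only the `prompt`/`messages` keys come from the code above; the `sampling_params` field and its contents are assumptions suggested by the `validate_sampling_params` import:

```python
# Hypothetical job payload. Only "prompt"/"messages" are taken from the handler
# shown above; "sampling_params" and its fields are illustrative assumptions.
job = {
    "id": "local-test-123",
    "input": {
        "prompt": "Write a haiku about GPUs.",
        "sampling_params": {"max_tokens": 64, "temperature": 0.7},
    },
}

# handler() is an async generator, so its output would be consumed roughly as:
#
#     async for output in handler(job):
#         print(output)
```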
