Skip to content

Commit

Permalink
update: max num tokens for bei (#1357)
Browse files Browse the repository at this point in the history
* update: n-gram deployment

* add BEI_MINIMUM_MAX_NUM_TOKENS

* updated rc release version

* fmt serving image builder

* update sha of image

* fix: imports

* rename env variable
  • Loading branch information
michaelfeil authored Feb 3, 2025
1 parent 2b96fcb commit 53db85f
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "truss"
version = "0.9.60rc003"
version = "0.9.60rc004"
description = "A seamless bridge from model development to model delivery"
license = "MIT"
readme = "README.md"
Expand Down
3 changes: 2 additions & 1 deletion truss/base/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
TRTLLM_PREDICT_CONCURRENCY = 512
BEI_TRTLLM_CLIENT_BATCH_SIZE = 128
BEI_MAX_CONCURRENCY_TARGET_REQUESTS = 2048
BEI_REQUIRED_MAX_NUM_TOKENS = 16384

TRTLLM_MIN_MEMORY_REQUEST_GI = 24
HF_MODELS_API_URL = "https://huggingface.co/api/models"
Expand Down Expand Up @@ -104,7 +105,7 @@
TRTLLM_BASE_IMAGE = "baseten/briton-server:v0.16.0-5be7b58"
TRTLLM_PYTHON_EXECUTABLE = "/usr/local/briton/venv/bin/python"
BASE_TRTLLM_REQUIREMENTS = ["briton==0.4.2"]
BEI_TRTLLM_BASE_IMAGE = "baseten/bei:0.0.16@sha256:51e7ab169ffc2fa9e809a2e34d2f767277ba0c67e01c63fbca842992bb6402fc"
BEI_TRTLLM_BASE_IMAGE = "baseten/bei:0.0.17@sha256:9c3577f6ec672d6da5aca18e9c0ebdddd65ed80c8858e757fbde7e9cf48de01d"

BEI_TRTLLM_PYTHON_EXECUTABLE = "/usr/bin/python3"

Expand Down
9 changes: 8 additions & 1 deletion truss/contexts/image_builder/serving_image_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
BASE_SERVER_REQUIREMENTS_TXT_FILENAME,
BASE_TRTLLM_REQUIREMENTS,
BEI_MAX_CONCURRENCY_TARGET_REQUESTS,
BEI_REQUIRED_MAX_NUM_TOKENS,
BEI_TRTLLM_BASE_IMAGE,
BEI_TRTLLM_CLIENT_BATCH_SIZE,
BEI_TRTLLM_PYTHON_EXECUTABLE,
Expand Down Expand Up @@ -384,13 +385,19 @@ def prepare_trtllm_bei_encoder_build_dir(self, build_dir: Path):
# runtime batch size may not be higher than what the build settings of the model allow
# to 32 even if the engine.rank0 allows for higher batch_size
runtime_max_batch_size = min(config.trt_llm.build.max_batch_size, 32)

# make sure the user gets good performance, enforcing max_num_tokens here and in engine-builder
runtime_max_batch_tokens = max(
config.trt_llm.build.max_num_tokens, BEI_REQUIRED_MAX_NUM_TOKENS
)
port = 7997
start_command = " ".join(
[
"truss-transfer-cli && text-embeddings-router",
f"--port {port}",
# assert the max_batch_size is within trt-engine limits
f"--max-batch-requests {runtime_max_batch_size}",
# assert the max_num_tokens is within trt-engine limits
f"--max-batch-tokens {runtime_max_batch_tokens}",
# how many sentences can be in a single json payload.
# limited default to improve request based autoscaling.
f"--max-client-batch-size {BEI_TRTLLM_CLIENT_BATCH_SIZE}",
Expand Down

0 comments on commit 53db85f

Please sign in to comment.