This repository has been archived by the owner on Feb 15, 2025. It is now read-only.

feat(vllm)!: upgrade vllm backend and refactor deployment #854

Merged
350 commits merged on Oct 3, 2024
Changes from 49 commits

Commits (350)
301fd42
fix Dockerfile lint
justinthelaw Sep 16, 2024
2a2c7d6
re-added default tensor size
justinthelaw Sep 16, 2024
98227a6
fix README
justinthelaw Sep 16, 2024
c620efa
cleanup
justinthelaw Sep 16, 2024
6593fbb
3.11.9 python
justinthelaw Sep 16, 2024
79272d1
fix FinishReason, add vLLM E2E
justinthelaw Sep 17, 2024
927ad25
llama completion test, add CompleteStreamChoice
justinthelaw Sep 17, 2024
e9e434f
condense e2e to 1 file, add max_new_tokens
justinthelaw Sep 17, 2024
d8c6767
formatting fix
justinthelaw Sep 17, 2024
29a9785
max_tokens for OpenAI client
justinthelaw Sep 17, 2024
a166c93
fix singular model_name arg
justinthelaw Sep 17, 2024
1c63741
isolate model_name to single test
justinthelaw Sep 17, 2024
2e82a9f
fix e2e-llama-cpp-python.yaml
justinthelaw Sep 17, 2024
807128e
Update e2e-vllm.yaml
justinthelaw Sep 17, 2024
e48331f
model_name fixture
justinthelaw Sep 17, 2024
e88b29f
Merge remote-tracking branch 'origin/main' into 1037-testvllm-impleme…
justinthelaw Sep 17, 2024
b366c5f
Merge remote-tracking branch 'origin/main' into 835-upgrade-vllm-for-…
justinthelaw Sep 17, 2024
ecbd4f7
handle request queue possibly being None
justinthelaw Sep 17, 2024
8552ce0
workaround GPU runner issue
justinthelaw Sep 17, 2024
af4e4ca
workaround GPU runner issue, pt.2
justinthelaw Sep 17, 2024
5b1532a
workaround GPU runner issue, pt.3
justinthelaw Sep 17, 2024
a8551e5
workaround GPU runner issue, pt.4
justinthelaw Sep 17, 2024
5f1b3c1
temp turn on e2e vllm, add nvidia-smi
justinthelaw Sep 17, 2024
1e7e98c
add nvidia setp
justinthelaw Sep 17, 2024
c46731a
fix cluster cmd, play with prompt
justinthelaw Sep 17, 2024
161fb3a
k3d permissions
justinthelaw Sep 17, 2024
84a0388
Update e2e-vllm.yaml
justinthelaw Sep 17, 2024
cb905ff
Update e2e-llama-cpp-python.yaml
justinthelaw Sep 17, 2024
6afb992
e2e-vllm.yaml with lfai-core
justinthelaw Sep 17, 2024
094da70
vllm e2e missing cluster create
justinthelaw Sep 17, 2024
f5d9f82
fix llama e2e steps
justinthelaw Sep 17, 2024
9fb28fa
test GPU cluster health
justinthelaw Sep 17, 2024
c19cec2
test GPU runner deps, pt.1
justinthelaw Sep 17, 2024
8767649
test GPU runner deps, pt.2
justinthelaw Sep 17, 2024
52857c5
test GPU runner deps, pt.3
justinthelaw Sep 17, 2024
287b911
test GPU runner deps, pt.4
justinthelaw Sep 17, 2024
e0b7e18
test GPU runner deps, pt.5
justinthelaw Sep 17, 2024
042248d
add comments
justinthelaw Sep 17, 2024
64079aa
better comments, log test outputs
justinthelaw Sep 17, 2024
0148b92
add wait-for, more comments
justinthelaw Sep 17, 2024
04ab8b2
remove formatting
justinthelaw Sep 17, 2024
635bdaf
fix CUDA pod test
justinthelaw Sep 17, 2024
c85a00c
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 17, 2024
4b73ba9
Merge remote-tracking branch 'origin/main' into 1037-testvllm-impleme…
justinthelaw Sep 17, 2024
f7b2a50
reduced context window
justinthelaw Sep 17, 2024
1bef345
remove pytest cache Make target
justinthelaw Sep 17, 2024
8b2af46
vLLM deployment debugging
justinthelaw Sep 17, 2024
9dae852
revert formatting
justinthelaw Sep 17, 2024
d44a907
fix build, add better debugging steps
justinthelaw Sep 17, 2024
5af2d70
fix Kubectl commands
justinthelaw Sep 17, 2024
8befd3b
nvidia daemonset debug
justinthelaw Sep 17, 2024
32a1c31
set nvidia runtime as default
justinthelaw Sep 17, 2024
1e7aca1
check node issues
justinthelaw Sep 17, 2024
2464cc4
draft, node detailed describe
justinthelaw Sep 17, 2024
c7b4aa3
Update cuda-vector-add.yaml
justinthelaw Sep 17, 2024
2245c7c
Update cuda-vector-add.yaml
justinthelaw Sep 17, 2024
b1933c2
more cluster runner debugging
justinthelaw Sep 17, 2024
325f520
remove erroneous journal to command
justinthelaw Sep 18, 2024
87cc755
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 18, 2024
32ed63c
Merge remote-tracking branch 'origin/main' into 1037-testvllm-impleme…
justinthelaw Sep 18, 2024
5c13861
docker-level debug addition
justinthelaw Sep 18, 2024
e4e4611
downgrade CUDA version
justinthelaw Sep 18, 2024
32dad39
downgrade CUDA version, again
justinthelaw Sep 18, 2024
850100f
try root full
justinthelaw Sep 18, 2024
d1d6e48
try root, pt.2
justinthelaw Sep 18, 2024
df61e46
try root, pt.3
justinthelaw Sep 18, 2024
34926e9
different tests and logs
justinthelaw Sep 18, 2024
547a64b
typo
justinthelaw Sep 18, 2024
59ce6f6
revert to old daemonset version
justinthelaw Sep 18, 2024
284812d
typo
justinthelaw Sep 18, 2024
b222543
add config.toml to k3s image
justinthelaw Sep 18, 2024
76cccbc
get failure reason
justinthelaw Sep 18, 2024
d6aacf0
Merge branch 'main' into 1037-testvllm-implement-e2e-testing-for-vllm
justinthelaw Sep 18, 2024
c9e7840
just see if change in containerd config works
justinthelaw Sep 18, 2024
1514ead
Dockerfile changes, apply both tests
justinthelaw Sep 18, 2024
a437b7b
typo
justinthelaw Sep 18, 2024
66ef462
fix image tag, add NVIDIA capabilities all
justinthelaw Sep 18, 2024
c9d480c
align docker test, add node label
justinthelaw Sep 18, 2024
a32226a
add quotes, increase priv
justinthelaw Sep 18, 2024
db04bd0
Merge remote-tracking branch 'origin/main' into 1037-testvllm-impleme…
justinthelaw Sep 18, 2024
d199203
add nfd
justinthelaw Sep 18, 2024
bd89870
add nfd, pt.1
justinthelaw Sep 18, 2024
2ce805b
remove nfd
justinthelaw Sep 18, 2024
3be3648
remove set-as-default
justinthelaw Sep 18, 2024
9cf7d7f
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 18, 2024
5f777e0
Merge branch 'main' into 1037-testvllm-implement-e2e-testing-for-vllm
justinthelaw Sep 18, 2024
8d28084
refactor, unload drivers
justinthelaw Sep 18, 2024
c12aa82
script typo
justinthelaw Sep 18, 2024
6900dac
fix typos
justinthelaw Sep 18, 2024
3ab9228
slim k3d cluster, permission workaround
justinthelaw Sep 18, 2024
7dd8abf
k3d bootstrap match
justinthelaw Sep 18, 2024
79f8d30
k3d server name
justinthelaw Sep 18, 2024
2811359
nvidia wait-for
justinthelaw Sep 18, 2024
3cf42eb
remove extra stuff
justinthelaw Sep 18, 2024
331584e
pods out first
justinthelaw Sep 18, 2024
e7fdf7c
node out first, whoami
justinthelaw Sep 18, 2024
0662106
which k3d
justinthelaw Sep 18, 2024
6b04c55
sleep!
justinthelaw Sep 18, 2024
6110ec4
root user
justinthelaw Sep 18, 2024
9f9157c
root user, pt.2
justinthelaw Sep 18, 2024
664709b
revert vllm e2e GPU runner changes
justinthelaw Sep 18, 2024
f896e59
revert formatting changes
justinthelaw Sep 18, 2024
ef75a70
e2e tests made easier
justinthelaw Sep 18, 2024
2fcac88
Merge branch 'main' into 1037-testvllm-implement-e2e-testing-for-vllm
justinthelaw Sep 18, 2024
23c008e
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 18, 2024
d1d6540
e2e test Make target typo
justinthelaw Sep 18, 2024
2cfd164
Merge branch '1037-testvllm-implement-e2e-testing-for-vllm' of https:…
justinthelaw Sep 18, 2024
09510b7
zarf-config.yaml changes docs
justinthelaw Sep 18, 2024
1e89fac
add load_format
justinthelaw Sep 18, 2024
0568232
revert format e2e-llama-cpp-python.yaml
justinthelaw Sep 18, 2024
cc7ac6c
fixed Makefile typo
justinthelaw Sep 18, 2024
8a07080
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 18, 2024
f335be7
attempt merge with main
justinthelaw Sep 18, 2024
e0c0ac7
better clean-up
justinthelaw Sep 19, 2024
c90d820
add FinishReason enum back in
justinthelaw Sep 19, 2024
a1a03c1
passing unit tests
justinthelaw Sep 19, 2024
3da388f
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 19, 2024
3387974
Merge branch 'main' into 1037-testvllm-implement-e2e-testing-for-vllm
justinthelaw Sep 19, 2024
620e3b5
fixes GPU_LIMIT
justinthelaw Sep 20, 2024
09dd182
Merge remote-tracking branch 'origin/1037-testvllm-implement-e2e-test…
justinthelaw Sep 20, 2024
331a346
fixes load_format
justinthelaw Sep 20, 2024
6df5ebb
Merge branch 'main' into 1037-testvllm-implement-e2e-testing-for-vllm
justinthelaw Sep 20, 2024
304f659
Merge remote-tracking branch 'origin/1037-testvllm-implement-e2e-test…
justinthelaw Sep 20, 2024
cc46716
adds Docker container-only things
justinthelaw Sep 20, 2024
5ab0b99
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 20, 2024
da1399b
PR review fixes
justinthelaw Sep 20, 2024
59e1830
Merge remote-tracking branch 'origin/1037-testvllm-implement-e2e-test…
justinthelaw Sep 20, 2024
e963293
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 20, 2024
b9545b7
description for PROMPT_FORMAT*
justinthelaw Sep 20, 2024
5a6d59f
makefile clean improvements, add bundle configs
justinthelaw Sep 20, 2024
396370a
variabilize PYTHON_VERSION in vllm Dockerfile
justinthelaw Sep 20, 2024
b023dfa
missing download sub-cmd
justinthelaw Sep 20, 2024
f24180d
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 20, 2024
8ba6dcb
variabilize vllm directory
justinthelaw Sep 20, 2024
0186ad0
Merge branch '835-upgrade-vllm-for-gptq-bfloat16-inferencing' of http…
justinthelaw Sep 20, 2024
9791cb6
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 20, 2024
6e1ca0c
fix release.yaml
justinthelaw Sep 20, 2024
858b64f
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 20, 2024
ced0797
Update e2e-registry1-weekly.yaml
justinthelaw Sep 20, 2024
89d0d69
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 20, 2024
dd9b1bc
Update e2e-registry1-weekly.yaml
justinthelaw Sep 20, 2024
2bf474c
Update e2e-registry1-weekly.yaml
justinthelaw Sep 20, 2024
6effe8c
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 23, 2024
1641379
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 23, 2024
ca0ff03
update to 0.13.0, fix versioning
justinthelaw Sep 23, 2024
d365660
fix registry1 workflow, add prints
justinthelaw Sep 23, 2024
bdda602
merge with registry1 workflow
justinthelaw Sep 23, 2024
2e24a6b
chainguard login, fix registry1 uds setup
justinthelaw Sep 23, 2024
280927a
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 23, 2024
bd3d7ff
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 23, 2024
686e755
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 23, 2024
e109740
fix permissions
justinthelaw Sep 23, 2024
b4b767e
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 23, 2024
7948e33
fix permissions, pt.2
justinthelaw Sep 23, 2024
c468b2c
fix permissions
justinthelaw Sep 23, 2024
44d4a0e
centralize integration llm config, no-cache-dir
justinthelaw Sep 23, 2024
9c1811c
merge with testing branch, pt.1
justinthelaw Sep 23, 2024
c0af7c7
centralize integration llm config, pt.2
justinthelaw Sep 23, 2024
6c24d34
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 23, 2024
332d348
better make clean-all
justinthelaw Sep 23, 2024
14ab833
complete overhaul of registry1 weekly
justinthelaw Sep 23, 2024
3caed3a
revert formatting
justinthelaw Sep 23, 2024
c50e16a
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 23, 2024
cbbdc20
update yq command for zarf.yaml
justinthelaw Sep 23, 2024
8518d71
yq sub typo
justinthelaw Sep 23, 2024
f11dd73
go back to using latest bundle
justinthelaw Sep 23, 2024
4079620
package create modifications
justinthelaw Sep 23, 2024
dd52e03
typo UDS zarf package create
justinthelaw Sep 23, 2024
a4fb386
correct bundle pointers and mutation
justinthelaw Sep 23, 2024
7192692
different zarf package ref location
justinthelaw Sep 23, 2024
d465753
log level debug
justinthelaw Sep 23, 2024
58b67c6
confirm missing C lib, more dynamic API create
justinthelaw Sep 24, 2024
25a1223
README improvement
justinthelaw Sep 24, 2024
9185ebf
README improvement, pt.2
justinthelaw Sep 24, 2024
5ff7f1c
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 24, 2024
ee08217
0.13.0, merge with test branch
justinthelaw Sep 24, 2024
982533f
more FinishReason exception throwing
justinthelaw Sep 24, 2024
4c4b0b6
fix class method on FinishReason
justinthelaw Sep 24, 2024
78efedb
change method name
justinthelaw Sep 24, 2024
55546a7
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 25, 2024
c7ca585
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 25, 2024
072427a
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 25, 2024
5e545f6
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 25, 2024
91da0ce
modify release-please-config
justinthelaw Sep 25, 2024
240e2c1
weekly sunday 12AM pst
justinthelaw Sep 25, 2024
d673244
move install to JIT
justinthelaw Sep 25, 2024
81c598c
remove udsCliVersion
justinthelaw Sep 25, 2024
301e9dd
comment typo
justinthelaw Sep 25, 2024
340414f
add v to registry ref
justinthelaw Sep 25, 2024
8c4e194
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 25, 2024
beb643f
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 25, 2024
3defb55
better sub yq cmd
justinthelaw Sep 25, 2024
4fdec61
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 25, 2024
da1e466
add failure logging
justinthelaw Sep 25, 2024
b2b6905
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 25, 2024
3cfecf0
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 26, 2024
94d2385
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 26, 2024
ccd99e9
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 26, 2024
26932de
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 26, 2024
649406d
Update release-please-config.json
justinthelaw Sep 27, 2024
20a73b7
Update and rename e2e-registry1-weekly.yaml to weekly-registry1-e2e-t…
justinthelaw Sep 27, 2024
a4f4c0f
Update and rename weekly-registry1-e2e-testing.yaml to weekly-registr…
justinthelaw Sep 27, 2024
ab5871d
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 27, 2024
0928698
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 27, 2024
757166e
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Sep 27, 2024
5cca687
0.13.1
justinthelaw Sep 27, 2024
db7193e
Merge remote-tracking branch 'origin/main' into 835-upgrade-vllm-for-…
justinthelaw Sep 27, 2024
c878283
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 27, 2024
be13c59
filename typo
justinthelaw Sep 27, 2024
1264c4c
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 27, 2024
db7e27a
make target typo
justinthelaw Sep 27, 2024
ef2f559
env variabilized
justinthelaw Sep 27, 2024
bcc1287
make target just does not work
justinthelaw Sep 27, 2024
03837c9
image_versions explicit set
justinthelaw Sep 27, 2024
8e4faf3
image_versions explicit set, pt.2
justinthelaw Sep 27, 2024
7a3c365
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 27, 2024
f77bcfe
use version pattern from release.yaml
justinthelaw Sep 27, 2024
37093dd
merge and resolve release conflict
justinthelaw Sep 27, 2024
14351c1
remove the v
justinthelaw Sep 27, 2024
46174ed
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 27, 2024
ce4c30f
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Sep 30, 2024
f502e06
fix lint
justinthelaw Sep 30, 2024
af8c971
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Sep 30, 2024
fd2c153
cutover to utils.client.py
justinthelaw Oct 1, 2024
d22439e
Merge branch 'main' into chore-update-registry1-weekly-bundle-0.13.0
justinthelaw Oct 1, 2024
5c493ea
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
ae68868
cutover to utils.client.py, pt.2
justinthelaw Oct 1, 2024
2acb604
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
ca55f72
cutover to utils.client.py, pt.3
justinthelaw Oct 1, 2024
a42c320
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
807fbdc
fix text embeddings backend full
justinthelaw Oct 1, 2024
abff6bd
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
a9c34fb
remove extraneous env
justinthelaw Oct 1, 2024
8caf64f
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
a6f0af0
add get_supabase_url, default model warnings
justinthelaw Oct 1, 2024
0b291f4
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
d590268
supabase base url incorrect
justinthelaw Oct 1, 2024
54af6dc
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
17e20fa
supabase_url in wrong position
justinthelaw Oct 1, 2024
2c3b7f1
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
76efca3
Merge remote-tracking branch 'origin/main' into chore-update-registry…
justinthelaw Oct 1, 2024
1211e69
fastapi status code usage
justinthelaw Oct 1, 2024
7e6bdb2
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
a4c5ace
FinishReason _missing_ class method
justinthelaw Oct 1, 2024
5ee07cf
new missing JWT
justinthelaw Oct 1, 2024
df60811
Merge remote-tracking branch 'origin/chore-update-registry1-weekly-bu…
justinthelaw Oct 1, 2024
0c12449
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Oct 1, 2024
99c27c9
missing ZARF VAR passthrough to values
justinthelaw Oct 2, 2024
c106e10
more clarity in the README
justinthelaw Oct 3, 2024
d92b572
Merge branch 'main' into 835-upgrade-vllm-for-gptq-bfloat16-inferencing
justinthelaw Oct 3, 2024
20 changes: 12 additions & 8 deletions packages/vllm/.env.example
@@ -1,13 +1,17 @@
 export LAI_HF_HUB_ENABLE_HF_TRANSFER="1"
-export LAI_REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"
-export LAI_REVISION="gptq-4bit-32g-actorder_True"
-export LAI_QUANTIZATION="gptq"
+export LAI_REPO_ID="justinthelaw/Hermes-2-Pro-Mistral-7B-4bit-32g"
+export LAI_REVISION="main"
 export LAI_TENSOR_PARALLEL_SIZE=1
+export LAI_TRUST_REMOTE_CODE=True
 export LAI_MODEL_SOURCE=".model/"
 export LAI_MAX_CONTEXT_LENGTH=32768
-export LAI_STOP_TOKENS='["</s>","<|endoftext|>","<|im_end|>"]'
-export LAI_PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n"
-export LAI_PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n"
-export LAI_PROMPT_FORMAT_CHAT_USER="USER: {}\n"
+export LAI_STOP_TOKENS='["</s>"]'
+export LAI_PROMPT_FORMAT_CHAT_SYSTEM="<|system|>\n{}<|end|>\n"
+export LAI_PROMPT_FORMAT_CHAT_USER="<|user|>\n{}<|end|>\n"
+export LAI_PROMPT_FORMAT_CHAT_ASSISTANT="<|assistant|>\n{}<|end|>\n"
 export LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
-export LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0
+export LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=0
+export LAI_ENFORCE_EAGER=False
+export LAI_GPU_MEMORY_UTILIZATION=0.90
+export LAI_WORKER_USE_RAY=True
+export LAI_ENGINE_USE_RAY=True
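For local development, the same variables can be exported into the shell before launching the backend. A minimal sketch, assuming the example file is copied to `.env` and the repo's virtual environment is already active; the copy step is an assumption, while the run command mirrors the Dockerfile's ENTRYPOINT:

```bash
# export every LAI_* variable from the example file, then start the backend;
# confz picks the values up from the environment at startup
cp packages/vllm/.env.example packages/vllm/.env
set -a && source packages/vllm/.env && set +a
python -m leapfrogai_sdk.cli --app-dir=packages/vllm/src/ main:Model
```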
184 changes: 96 additions & 88 deletions packages/vllm/Dockerfile
@@ -2,125 +2,133 @@ ARG LOCAL_VERSION
FROM ghcr.io/defenseunicorns/leapfrogai/leapfrogai-sdk:${LOCAL_VERSION} AS sdk

FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS builder
ARG SDK_DEST=src/leapfrogai_sdk/build

# Set the config file defaults
ARG PYTHON_VERSION=3.11.6
ARG HF_HUB_ENABLE_HF_TRANSFER="1"
ARG REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"
ARG REVISION="gptq-4bit-32g-actorder_True"
ARG QUANTIZATION="gptq"
ARG MODEL_SOURCE="/data/.model/"
ARG MAX_CONTEXT_LENGTH=32768
ARG STOP_TOKENS='["</s>","<|endoftext|>","<|im_end|>"]'
ARG PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n"
ARG PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n"
ARG PROMPT_FORMAT_CHAT_USER="USER: {}\n"
ARG PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
ARG PROMPT_FORMAT_DEFAULTS_TOP_K=0
ARG TENSOR_PARALLEL_SIZE=1

ENV DEBIAN_FRONTEND=noninteractive
# set SDK location
# set the pyenv and Python versions
# set model download args
ARG SDK_DEST=src/leapfrogai_sdk/build \
PYTHON_VERSION=3.11.6 \
PYENV_GIT_TAG=v2.4.8

# use root user for deps installation and nonroot user creation
USER root

# get deps for vllm compilation, pyenv, python and model downloading
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get -y install \
git \
make \
build-essential \
libssl-dev \
zlib1g-dev \
libbz2-dev \
libreadline-dev \
libsqlite3-dev \
wget \
curl \
llvm \
libncurses5-dev \
libncursesw5-dev \
tk-dev \
libffi-dev \
liblzma-dev

# setup nonroot user and permissions
RUN groupadd -g 65532 vglusers && \
useradd -ms /bin/bash nonroot -u 65532 -g 65532 && \
usermod -a -G video,sudo nonroot

# grab necessary python dependencies
# TODO @JPERRY: Get context as to why we are doing this for this Dockerfile but not our other ones
RUN apt-get -y update \
&& apt-get install -y software-properties-common \
&& add-apt-repository universe \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get -y update

# get deps for vllm compilation, model download, and pyenv
RUN apt-get -y install git python3-venv make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev tk-dev libffi-dev

USER nonroot
WORKDIR /home/leapfrogai

# copy-in SDK from sdk stage and vllm source code from host
WORKDIR /home/leapfrogai
COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST}
COPY --chown=nonroot:nonroot packages/vllm packages/vllm

# # create virtual environment for light-weight portability and minimal libraries
RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv
ENV PYENV_ROOT="/home/leapfrogai/.pyenv"
ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH"
RUN pyenv install ${PYTHON_VERSION}
RUN pyenv global ${PYTHON_VERSION}
RUN python3 -m venv .venv
# create virtual environment for light-weight portability and minimal libraries
RUN curl https://pyenv.run | bash && \
echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \
echo 'export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \
echo 'eval "$(pyenv init -)"' >> ~/.bashrc && \
echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.bashrc

# Set environment variables
ENV PYENV_ROOT="/home/nonroot/.pyenv" \
PATH="/home/nonroot/.pyenv/bin:$PATH"

# Install Python 3.11.6, set it as global, and create a venv
RUN . ~/.bashrc && \
PYTHON_CONFIGURE_OPTS="--enable-shared" pyenv install 3.11.6 && \
pyenv global 3.11.6 && \
pyenv exec python -m venv .venv

# set path to venv python
ENV PATH="/home/leapfrogai/.venv/bin:$PATH"

RUN rm -f packages/vllm/build/*.whl
RUN python -m pip wheel packages/vllm -w packages/vllm/build --find-links=${SDK_DEST}
RUN pip install packages/vllm/build/lfai_vllm*.whl --no-index --find-links=packages/vllm/build/
RUN rm -f packages/vllm/build/*.whl && \
python -m pip wheel packages/vllm -w packages/vllm/build --find-links=${SDK_DEST} && \
pip install packages/vllm/build/lfai_vllm*.whl --no-index --find-links=packages/vllm/build/

#################
# FINAL CONTAINER
#################

FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04
## COPIED FROM ABOVE ##
ARG SDK_DEST=src/leapfrogai_sdk/build
# Set the config file defaults
ARG PYTHON_VERSION=3.11.6
ARG HF_HUB_ENABLE_HF_TRANSFER="1"
ARG REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"
ARG REVISION="gptq-4bit-32g-actorder_True"
ARG QUANTIZATION="gptq"
ARG MODEL_SOURCE="/data/.model/"
ARG MAX_CONTEXT_LENGTH=32768
ARG STOP_TOKENS='["</s>","<|endoftext|>","<|im_end|>"]'
ARG PROMPT_FORMAT_CHAT_SYSTEM="SYSTEM: {}\n"
ARG PROMPT_FORMAT_CHAT_ASSISTANT="ASSISTANT: {}\n"
ARG PROMPT_FORMAT_CHAT_USER="USER: {}\n"
ARG PROMPT_FORMAT_DEFAULTS_TOP_P=1.0
ARG PROMPT_FORMAT_DEFAULTS_TOP_K=0
ARG TENSOR_PARALLEL_SIZE=1

ENV DEBIAN_FRONTEND=noninteractive
# set SDK location
ARG SDK_DEST=src/leapfrogai_sdk/build

# model-specific arguments
ARG TRUST_REMOTE_CODE="True" \
MODEL_SOURCE=".model/" \
MAX_CONTEXT_LENGTH=32768 \
STOP_TOKENS='["</s>"]' \
PROMPT_FORMAT_CHAT_SYSTEM="<|system|>\n{}<|end|>\n" \
PROMPT_FORMAT_CHAT_USER="<|user|>\n{}<|end|>\n" \
PROMPT_FORMAT_CHAT_ASSISTANT="<|assistant|>\n{}<|end|>\n" \
PROMPT_FORMAT_DEFAULTS_TOP_P=1.0 \
PROMPT_FORMAT_DEFAULTS_TOP_K=0 \
TENSOR_PARALLEL_SIZE=1 \
ENFORCE_EAGER=False \
GPU_MEMORY_UTILIZATION=0.99 \
WORKER_USE_RAY=True \
ENGINE_USE_RAY=True

# setup nonroot user and permissions
USER root

RUN groupadd -g 65532 vglusers && \
useradd -ms /bin/bash nonroot -u 65532 -g 65532 && \
usermod -a -G video,sudo nonroot

RUN apt-get -y update
RUN apt-get -y install git wget build-essential libssl-dev zlib1g-dev libffi-dev

USER nonroot

WORKDIR /home/leapfrogai

# copy-in SDK from sdk stage, model and vllm source code from builder
COPY --from=sdk --chown=nonroot:nonroot /leapfrogai/${SDK_DEST} ./${SDK_DEST}
COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/.venv /home/leapfrogai/.venv
COPY --from=builder --chown=nonroot:nonroot /home/leapfrogai/packages/vllm/src /home/leapfrogai/packages/vllm/src
# copy-in python binaries
COPY --from=builder --chown=nonroot:nonroot /home/nonroot/.pyenv/versions/3.11.6/ /home/nonroot/.pyenv/versions/3.11.6/

# load ARG values into env variables for pickup by confz
ENV LAI_TRUST_REMOTE_CODE=${TRUST_REMOTE_CODE} \
LAI_MODEL_SOURCE=${MODEL_SOURCE} \
LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH} \
LAI_STOP_TOKENS=${STOP_TOKENS} \
LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM} \
LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER} \
LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT} \
LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P} \
LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K} \
LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE} \
LAI_ENFORCE_EAGER=${ENFORCE_EAGER} \
LAI_GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION} \
LAI_WORKER_USE_RAY=${WORKER_USE_RAY} \
LAI_ENGINE_USE_RAY=${ENGINE_USE_RAY} \
# remove vLLM callback to stats server
VLLM_NO_USAGE_STATS=1

# # create virtual environment for light-weight portability and minimal libraries
RUN git clone --depth=1 https://github.com/pyenv/pyenv.git .pyenv
ENV PYENV_ROOT="/home/leapfrogai/.pyenv"
ENV PATH="$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH"
RUN pyenv install ${PYTHON_VERSION}
ENV PATH="/home/leapfrogai/.venv/bin:$PATH"

# download model
ENV HF_HOME=/home/leapfrogai/.cache/huggingface

# Load ARG values into env variables for pickup by confz
ENV LAI_HF_HUB_ENABLE_HF_TRANSFER=${HF_HUB_ENABLE_HF_TRANSFER}
ENV LAI_REPO_ID=${REPO_ID}
ENV LAI_REVISION=${REVISION}
ENV LAI_QUANTIZATION=${QUANTIZATION}
ENV LAI_TENSOR_PARALLEL_SIZE=${TENSOR_PARALLEL_SIZE}
ENV LAI_MODEL_SOURCE=${MODEL_SOURCE}
ENV LAI_MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH}
ENV LAI_STOP_TOKENS=${STOP_TOKENS}
ENV LAI_PROMPT_FORMAT_CHAT_SYSTEM=${PROMPT_FORMAT_CHAT_SYSTEM}
ENV LAI_PROMPT_FORMAT_CHAT_ASSISTANT=${PROMPT_FORMAT_CHAT_ASSISTANT}
ENV LAI_PROMPT_FORMAT_CHAT_USER=${PROMPT_FORMAT_CHAT_USER}
ENV LAI_PROMPT_FORMAT_DEFAULTS_TOP_P=${PROMPT_FORMAT_DEFAULTS_TOP_P}
ENV LAI_PROMPT_FORMAT_DEFAULTS_TOP_K=${PROMPT_FORMAT_DEFAULTS_TOP_K}

EXPOSE 50051:50051

ENTRYPOINT ["python", "-m", "leapfrogai_sdk.cli", "--app-dir=packages/vllm/src/", "main:Model"]
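Every ARG in the final stage can be overridden at build time to re-tune the backend without editing the Dockerfile. A sketch, assuming the build runs from the repository root; the image tag and the ARG values shown are placeholders:

```bash
# rebuild the backend with a smaller context window and a lower vRAM ceiling;
# the ARG names come from the Dockerfile above, the values are illustrative
docker build \
  --build-arg LOCAL_VERSION=dev \
  --build-arg MAX_CONTEXT_LENGTH=16384 \
  --build-arg GPU_MEMORY_UTILIZATION=0.90 \
  --build-arg ENFORCE_EAGER=False \
  --build-arg TENSOR_PARALLEL_SIZE=1 \
  -t leapfrogai/vllm:dev \
  -f packages/vllm/Dockerfile .
```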
14 changes: 8 additions & 6 deletions packages/vllm/README.md
@@ -2,7 +2,6 @@

A LeapfrogAI API-compatible [vLLM](https://github.com/vllm-project/vllm) wrapper for quantized and un-quantized model inferencing across GPU infrastructures.


## Usage

See [instructions](#instructions) to get the backend up and running. Then, use the [LeapfrogAI API server](https://github.com/defenseunicorns/leapfrogai-api) to interact with the backend.
@@ -21,15 +20,17 @@ The following are additional assumptions for GPU inferencing:

### Model Selection

The default model that comes with this backend in this repository's officially released images is a [4-bit quantization of the Synthia-7b model](https://huggingface.co/TheBloke/SynthIA-7B-v2.0-GPTQ).
The default model that comes with this backend in this repository's officially released images is a [4-bit quantization of the Phi-3-Mini-128k-Instruct model](https://huggingface.co/bsmit1659/Phi-3-mini-128k-instruct-0.2-awq).

You can optionally specify different models or quantization types using the following Docker build arguments:

- `--build-arg HF_HUB_ENABLE_HF_TRANSFER="1"`: Enable or disable HuggingFace Hub transfer (default: 1)
- `--build-arg REPO_ID="TheBloke/Synthia-7B-v2.0-GPTQ"`: HuggingFace repository ID for the model
- `--build-arg REVISION="gptq-4bit-32g-actorder_True"`: Revision or commit hash for the model
- `--build-arg QUANTIZATION="gptq"`: Quantization type (e.g., gptq, awq, or empty for un-quantized)
- `--build-arg MAX_CONTEXT_LENGTH="32768"`: Max context length; cannot exceed the model's max length. The greater the length, the greater the vRAM requirements
- `--build-arg TENSOR_PARALLEL_SIZE="1"`: The number of gpus to spread the tensor processing across
- `--build-arg TRUST_REMOTE_CODE="True"`: Whether to trust inferencing code downloaded as part of the model download
- `--build-arg ENGINE_USE_RAY="False"`: Distributed, multi-node inferencing mode for the engine
- `--build-arg WORKER_USE_RAY="False"`: Distributed, multi-node inferencing mode for the worker(s)
- `--build-arg GPU_MEMORY_UTILIZATION="0.99"`: Max memory utilization (fraction, out of 1.0) for the vLLM process
- `--build-arg ENFORCE_EAGER="False"`: Disable CUDA graphs for faster first-token inferencing at the cost of more GPU memory (set to False for production)

## Zarf Package Deployment

@@ -46,6 +47,7 @@ uds zarf package deploy packages/vllm/zarf-package-vllm-*-dev.tar.zst --confirm
## Run Locally

To run the vllm backend locally (starting from the root directory of the repository):

```bash
# Setup Virtual Environment if you haven't done so already
python -m venv .venv
2 changes: 1 addition & 1 deletion packages/vllm/pyproject.toml
@@ -8,7 +8,7 @@ version = "0.9.2"

 dependencies = [
     "pydantic >= 2.3.0",
-    "vllm==0.4.2",
+    "vllm==0.5.3.post1",
     "python-dotenv>=1.0.1",
     "aiostream>=0.5.2",
     "leapfrogai-sdk",
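Since the engine pin is the heart of this upgrade, it is worth confirming which wheel actually lands in the environment. A one-line sanity check; generic Python, not repo-specific tooling:

```bash
# verify the upgraded engine version after installing the package
python -c "import vllm; print(vllm.__version__)"  # expected: 0.5.3.post1
```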
60 changes: 52 additions & 8 deletions packages/vllm/src/config.py
@@ -5,39 +5,85 @@


 class ConfigOptions(BaseConfig):
-    quantization: Literal[None, "awq", "gptq", "squeezellm"] = Field(
-        default=None,
-        description="Type of quantization, for un-quantized models omit this field",
-    )
     tensor_parallel_size: int = Field(
         default=1,
         title="GPU Utilization Count",
         description="The number of gpus to spread the tensor processing across."
         "This must be divisible to the number of attention heads in the model",
         examples=[1, 2, 3],
     )
+    enforce_eager: bool = Field(
+        default=True,
+        title="Enable Eager Mode",
+        description="Enable eager mode to start token generation immediately after prompt processing."
+        "Potentially reduces initial latency at the cost of slightly higher memory usage."
+        "Should be set to False in production environments with higher GPU memory.",
+        examples=[True, False],
+    )
+    gpu_memory_utilization: float = Field(
+        default=0.99,
+        title="GPU Memory Limit",
+        description="Maximum amount of GPU vRAM allocated to the vLLM engine and worker(s)",
+        examples=[0.50, 0.90, 0.99],
+    )
+    engine_use_ray: bool = Field(
+        default=True,
+        title="Use Ray for Engine",
+        description="Enable distributed inferencing for multi-node situations.",
+        examples=[True, False],
+    )
+    worker_use_ray: bool = Field(
+        default=True,
+        title="Use Ray for Worker",
+        description="Enable distributed inferencing for multi-node situations.",
+        examples=[True, False],
+    )
+    trust_remote_code: bool = Field(
+        default=True,
+        title="Trust Downloaded Model Code",
+        description="Whether to trust inferencing code downloaded as part of the model download."
+        "Please review the Python code in the .model/ directory before trusting custom model code.",
+        examples=[True, False],
+    )
 
 
 class DownloadOptions(BaseConfig):
     hf_hub_enable_hf_transfer: Literal["0", "1"] = Field(
         description="Option (0 - Disable, 1 - Enable) for faster transfers, tradeoff stability for faster speeds"
     )
     repo_id: str = Field(
-        description="HuggingFace repo id",
+        description="The HuggingFace git repository ID",
         examples=[
             "TheBloke/Synthia-7B-v2.0-GPTQ",
             "migtissera/Synthia-MoE-v3-Mixtral-8x7B",
             "microsoft/phi-2",
         ],
     )
     revision: str = Field(
-        description="The model branch to use",
+        description="The HuggingFace repository git branch to use",
         examples=["main", "gptq-4bit-64g-actorder_True"],
     )
 
 
 class AppConfig(BaseConfig):
     backend_options: ConfigOptions
+    CONFIG_SOURCES = [
+        EnvSource(
+            allow_all=True,
+            prefix="LAI_",
+            remap={
+                "tensor_parallel_size": "backend_options.tensor_parallel_size",
+                "trust_remote_code": "backend_options.trust_remote_code",
+                "enforce_eager": "backend_options.enforce_eager",
+                "gpu_memory_utilization": "backend_options.gpu_memory_utilization",
+                "worker_use_ray": "backend_options.worker_use_ray",
+                "engine_use_ray": "backend_options.engine_use_ray",
+            },
+        )
+    ]
+
+
+class DownloadConfig(BaseConfig):
     download_options: Optional[DownloadOptions]
     CONFIG_SOURCES = [
         EnvSource(
@@ -47,8 +93,6 @@ class AppConfig(BaseConfig):
                 "hf_hub_enable_hf_transfer": "download_options.hf_hub_enable_hf_transfer",
                 "repo_id": "download_options.repo_id",
                 "revision": "download_options.revision",
-                "quantization": "backend_options.quantization",
-                "tensor_parallel_size": "backend_options.tensor_parallel_size",
             },
         )
     ]
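With the split into `AppConfig` and `DownloadConfig`, confz hydrates backend and download options independently from `LAI_`-prefixed environment variables via the `EnvSource` remaps above. A minimal usage sketch; the environment values are illustrative, and it assumes `packages/vllm/src` is on `PYTHONPATH`:

```python
# illustrative only: exercises the AppConfig/DownloadConfig split shown above
import os

from config import AppConfig, DownloadConfig

os.environ["LAI_TENSOR_PARALLEL_SIZE"] = "2"
os.environ["LAI_GPU_MEMORY_UTILIZATION"] = "0.90"
os.environ["LAI_HF_HUB_ENABLE_HF_TRANSFER"] = "1"
os.environ["LAI_REPO_ID"] = "microsoft/phi-2"
os.environ["LAI_REVISION"] = "main"

app = AppConfig()            # confz reads LAI_* via the EnvSource remap
download = DownloadConfig()

print(app.backend_options.tensor_parallel_size)  # -> 2
print(download.download_options.repo_id)         # -> microsoft/phi-2
```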