Merge pull request #403 from ROCm/upstream_merge_25_02_03
Upstream merge 25 02 03
gshtras authored Feb 3, 2025
2 parents fdb06c3 + c887bc9 commit b43c8d1
Showing 1,209 changed files with 23,627 additions and 4,264 deletions.
6 changes: 4 additions & 2 deletions .buildkite/check-wheel-size.py
@@ -1,12 +1,14 @@
# SPDX-License-Identifier: Apache-2.0

import os
import sys
import zipfile

# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def print_top_10_largest_files(zip_file):
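For orientation, here is a minimal sketch of the kind of check this script performs: read the size limit from VLLM_MAX_SIZE_MB (now defaulting to 400) and report the largest wheel members when the limit is exceeded. The check_wheel_size helper and command-line handling are assumptions for illustration, not the actual script contents.

# Illustrative sketch only -- not part of this diff.
import os
import sys
import zipfile

VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))

def check_wheel_size(wheel_path: str) -> int:
    # Compare the on-disk wheel size against the configured limit.
    wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if wheel_size_mb > VLLM_MAX_SIZE_MB:
        print(f"Wheel {wheel_path} is {wheel_size_mb:.2f} MB, "
              f"exceeding the {VLLM_MAX_SIZE_MB} MB limit.")
        with zipfile.ZipFile(wheel_path) as z:
            # List the largest members to hint at what to trim.
            largest = sorted(z.infolist(), key=lambda i: i.file_size, reverse=True)[:10]
            for info in largest:
                print(f"{info.filename}: {info.file_size / (1024 * 1024):.2f} MB")
        return 1
    print(f"Wheel {wheel_path} is within the {VLLM_MAX_SIZE_MB} MB limit.")
    return 0

if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))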
2 changes: 2 additions & 0 deletions .buildkite/generate_index.py
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import os

1 change: 1 addition & 0 deletions .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
from pathlib import Path
2 changes: 2 additions & 0 deletions .buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse

from transformers import AutoTokenizer
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
from pathlib import Path
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import datetime
import json
import os
9 changes: 7 additions & 2 deletions .buildkite/release-pipeline.yaml
@@ -56,6 +56,11 @@ steps:
env:
DOCKER_BUILDKIT: "1"

- input: "Provide Release version here"
fields:
- text: "What is the release version?"
key: "release-version"

- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
@@ -66,7 +71,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
4 changes: 2 additions & 2 deletions .buildkite/run-gh200-test.sh
@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic.py
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
'
2 changes: 1 addition & 1 deletion .buildkite/run-neuron-test.sh
@@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
Empty file modified .buildkite/run-tpu-test.sh (mode changed 100644 → 100755)
6 changes: 4 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -50,9 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
- tests/standalone_tests/lazy_torch_compile.py
- tests/standalone_tests/lazy_imports.py
commands:
- python3 standalone_tests/lazy_torch_compile.py
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
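The standalone test here is renamed from lazy_torch_compile.py to lazy_imports.py, suggesting the check now covers lazy imports generally rather than only torch.compile. A hypothetical sketch of how such a standalone check could be structured (the module names and assertion are assumptions, not taken from the actual test):

# Hypothetical sketch -- illustrates the idea of a standalone lazy-import test,
# not the contents of tests/standalone_tests/lazy_imports.py itself.
import sys

import vllm  # noqa: F401  -- importing the package should stay lightweight

# Heavy, optional modules that importing vllm should not pull in eagerly.
HEAVY_MODULES = ["torch._inductor", "torchvision"]

leaked = [name for name in HEAVY_MODULES if name in sys.modules]
assert not leaked, f"Importing vllm eagerly loaded: {leaked}"
print("Lazy import check passed.")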
@@ -349,6 +349,7 @@ steps:
- vllm/
- tests/models
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py

@@ -485,6 +486,7 @@ steps:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
37 changes: 37 additions & 0 deletions .github/mergify.yml
@@ -35,6 +35,43 @@ pull_request_rules:
add:
- frontend

- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- or:
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files=benchmarks/benchmark_serving_guided.py
- files=benchmarks/benchmark_guided.py
actions:
label:
add:
- structured-output

- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- or:
- files~=^vllm/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/
actions:
label:
add:
- speculative-decoding

- name: label-v1
description: Automatically apply v1 label
conditions:
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
actions:
label:
add:
- v1

- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
28 changes: 23 additions & 5 deletions .pre-commit-config.yaml
@@ -3,18 +3,18 @@ default_stages:
- manual # Run in CI
repos:
- repo: https://github.com/google/yapf
rev: v0.32.0
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.5
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
@@ -23,7 +23,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.5
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
- id: pymarkdown
files: docs/.*
- repo: https://github.com/rhysd/actionlint
rev: v1.7.6
rev: v1.7.7
hooks:
- id: actionlint
- repo: local
@@ -85,8 +85,26 @@ repos:
entry: tools/png-lint.sh
language: script
types: [png]
- id: signoff-commit
name: Sign-off Commit
entry: bash
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
fi
language: system
verbose: true
stages: [commit-msg]
- id: check-spdx-header
name: Check SPDX headers
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false
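Most of the Python files touched in this merge gain the "# SPDX-License-Identifier: Apache-2.0" header that the new check-spdx-header hook enforces. A rough sketch of what such a checker could do follows; it is illustrative only and may differ from the real tools/check_spdx_header.py.

# Rough sketch of an SPDX header check -- illustrative only.
import sys

SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0"

def file_has_header(path: str) -> bool:
    with open(path, encoding="utf-8") as f:
        # Allow a shebang line before the SPDX header (e.g. benchmarks/P3L.py).
        first_lines = [f.readline().strip(), f.readline().strip()]
    return SPDX_HEADER in first_lines

def main(paths: list[str]) -> int:
    missing = [p for p in paths if not file_has_header(p)]
    for p in missing:
        print(f"Missing SPDX header: {p}")
    return 1 if missing else 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))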
13 changes: 7 additions & 6 deletions CMakeLists.txt
@@ -279,7 +279,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.6.0
GIT_TAG v3.7.0
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -333,7 +333,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -485,9 +490,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
endif()

message(STATUS "Enabling C extension.")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_C_LIBS cuda)
endif()
define_gpu_extension_target(
_C
DESTINATION vllm
@@ -496,7 +498,6 @@ define_gpu_extension_target(
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
LIBRARIES ${VLLM_C_LIBS}
USE_SABI 3
WITH_SOABI)

2 changes: 1 addition & 1 deletion CODE_OF_CONDUCT.md
@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
channel in the [vLLM Slack](https://slack.vllm.ai).
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
2 changes: 1 addition & 1 deletion Dockerfile
@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=300
ARG VLLM_MAX_SIZE_MB=400
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
5 changes: 2 additions & 3 deletions Dockerfile.ppc64le
@@ -4,12 +4,12 @@ USER root

ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes

COPY ./ /workspace/vllm

@@ -21,7 +21,6 @@ RUN --mount=type=bind,source=.git,target=.git \
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0

8 changes: 4 additions & 4 deletions README.md
@@ -10,12 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
</h3>

<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>

---

*Latest News* 🔥
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
@@ -35,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.

vLLM is fast with:

@@ -138,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us

* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
* For coordinating contributions and development, please use Slack.
* For discussing with fellow users and coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

1 change: 1 addition & 0 deletions benchmarks/P3L.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Patch-Perplexity (P3L)
1 change: 1 addition & 0 deletions benchmarks/P3L_mling.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
*MULTILINGUAL* Patch-Perplexity (P3L)
2 changes: 2 additions & 0 deletions benchmarks/backend_request_func.py
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
import sys
1 change: 1 addition & 0 deletions benchmarks/benchmark_guided.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses
1 change: 1 addition & 0 deletions benchmarks/benchmark_latency.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
1 change: 1 addition & 0 deletions benchmarks/benchmark_long_document_qa_throughput.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
Offline benchmark to test the long document QA throughput.
1 change: 1 addition & 0 deletions benchmarks/benchmark_prefix_caching.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
Benchmark the efficiency of prefix caching.
1 change: 1 addition & 0 deletions benchmarks/benchmark_prioritization.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline prioritization."""
import argparse
import dataclasses