Merge pull request #403 from ROCm/upstream_merge_25_02_03
Upstream merge 25 02 03
gshtras authored Feb 3, 2025
2 parents fdb06c3 + c887bc9 commit b43c8d1
Showing 1,209 changed files with 23,627 additions and 4,264 deletions.
6 changes: 4 additions & 2 deletions .buildkite/check-wheel-size.py
@@ -1,12 +1,14 @@
# SPDX-License-Identifier: Apache-2.0

import os
import sys
import zipfile

# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 300 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
# Note that we have 400 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/3792 .
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 300))
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def print_top_10_largest_files(zip_file):
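For orientation, here is a minimal sketch of the kind of check this script performs: read the size limit from VLLM_MAX_SIZE_MB (now defaulting to 400) and report the largest wheel members when the limit is exceeded. The check_wheel_size helper and command-line handling are assumptions for illustration, not the actual script contents.

# Illustrative sketch only -- not part of this diff.
import os
import sys
import zipfile

VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))

def check_wheel_size(wheel_path: str) -> int:
    # Compare the on-disk wheel size against the configured limit.
    wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if wheel_size_mb > VLLM_MAX_SIZE_MB:
        print(f"Wheel {wheel_path} is {wheel_size_mb:.2f} MB, "
              f"exceeding the {VLLM_MAX_SIZE_MB} MB limit.")
        with zipfile.ZipFile(wheel_path) as z:
            # List the largest members to hint at what to trim.
            largest = sorted(z.infolist(), key=lambda i: i.file_size, reverse=True)[:10]
            for info in largest:
                print(f"{info.filename}: {info.file_size / (1024 * 1024):.2f} MB")
        return 1
    print(f"Wheel {wheel_path} is within the {VLLM_MAX_SIZE_MB} MB limit.")
    return 0

if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))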
2 changes: 2 additions & 0 deletions .buildkite/generate_index.py
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import os

1 change: 1 addition & 0 deletions .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
LM eval harness on model to compare vs HF baseline computed offline.
Configs are found in configs/$MODEL.yaml
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
from pathlib import Path
2 changes: 2 additions & 0 deletions .buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse

from transformers import AutoTokenizer
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
from pathlib import Path
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

from lmdeploy.serve.openai.api_client import APIClient

api_client = APIClient("http://localhost:8000")
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import datetime
import json
import os
9 changes: 7 additions & 2 deletions .buildkite/release-pipeline.yaml
@@ -56,6 +56,11 @@ steps:
env:
DOCKER_BUILDKIT: "1"

- input: "Provide Release version here"
fields:
- text: "What is the release version?"
key: "release-version"

- block: "Build CPU release image"
key: block-cpu-release-image-build
depends_on: ~
@@ -66,7 +71,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --progress plain -f Dockerfile.cpu ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
env:
DOCKER_BUILDKIT: "1"
4 changes: 2 additions & 2 deletions .buildkite/run-gh200-test.sh
@@ -23,6 +23,6 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic.py
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
'
2 changes: 1 addition & 1 deletion .buildkite/run-neuron-test.sh
@@ -54,4 +54,4 @@ docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
Empty file modified .buildkite/run-tpu-test.sh (mode changed 100644 → 100755)
6 changes: 4 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -50,9 +50,9 @@ steps:
- tests/multimodal
- tests/test_utils
- tests/worker
- tests/standalone_tests/lazy_torch_compile.py
- tests/standalone_tests/lazy_imports.py
commands:
- python3 standalone_tests/lazy_torch_compile.py
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
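The standalone test here is renamed from lazy_torch_compile.py to lazy_imports.py, suggesting the check now covers lazy imports generally rather than only torch.compile. A hypothetical sketch of how such a standalone check could be structured (the module names and assertion are assumptions, not taken from the actual test):

# Hypothetical sketch -- illustrates the idea of a standalone lazy-import test,
# not the contents of tests/standalone_tests/lazy_imports.py itself.
import sys

import vllm  # noqa: F401  -- importing the package should stay lightweight

# Heavy, optional modules that importing vllm should not pull in eagerly.
HEAVY_MODULES = ["torch._inductor", "torchvision"]

leaked = [name for name in HEAVY_MODULES if name in sys.modules]
assert not leaked, f"Importing vllm eagerly loaded: {leaked}"
print("Lazy import check passed.")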
@@ -349,6 +349,7 @@ steps:
- vllm/
- tests/models
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py

@@ -485,6 +486,7 @@ steps:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
37 changes: 37 additions & 0 deletions .github/mergify.yml
@@ -35,6 +35,43 @@ pull_request_rules:
add:
- frontend

- name: label-structured-output
description: Automatically apply structured-output label
conditions:
- or:
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files=benchmarks/benchmark_serving_guided.py
- files=benchmarks/benchmark_guided.py
actions:
label:
add:
- structured-output

- name: label-speculative-decoding
description: Automatically apply speculative-decoding label
conditions:
- or:
- files~=^vllm/spec_decode/
- files=vllm/model_executor/layers/spec_decode_base_sampler.py
- files~=^tests/spec_decode/
actions:
label:
add:
- speculative-decoding

- name: label-v1
description: Automatically apply v1 label
conditions:
- or:
- files~=^vllm/v1/
- files~=^tests/v1/
actions:
label:
add:
- v1

- name: ping author on conflicts and add 'needs-rebase' label
conditions:
- conflict
28 changes: 23 additions & 5 deletions .pre-commit-config.yaml
@@ -3,18 +3,18 @@ default_stages:
- manual # Run in CI
repos:
- repo: https://github.com/google/yapf
rev: v0.32.0
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.6.5
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github]
- repo: https://github.com/codespell-project/codespell
rev: v2.3.0
rev: v2.4.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*|csrc/rocm/.*|csrc/gradlib/.*'
@@ -23,7 +23,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v18.1.5
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
@@ -35,7 +35,7 @@ repos:
- id: pymarkdown
files: docs/.*
- repo: https://github.com/rhysd/actionlint
rev: v1.7.6
rev: v1.7.7
hooks:
- id: actionlint
- repo: local
@@ -85,8 +85,26 @@ repos:
entry: tools/png-lint.sh
language: script
types: [png]
- id: signoff-commit
name: Sign-off Commit
entry: bash
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
fi
language: system
verbose: true
stages: [commit-msg]
- id: check-spdx-header
name: Check SPDX headers
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false
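Most of the Python files touched in this merge gain the "# SPDX-License-Identifier: Apache-2.0" header that the new check-spdx-header hook enforces. A rough sketch of what such a checker could do follows; it is illustrative only and may differ from the real tools/check_spdx_header.py.

# Rough sketch of an SPDX header check -- illustrative only.
import sys

SPDX_HEADER = "# SPDX-License-Identifier: Apache-2.0"

def file_has_header(path: str) -> bool:
    with open(path, encoding="utf-8") as f:
        # Allow a shebang line before the SPDX header (e.g. benchmarks/P3L.py).
        first_lines = [f.readline().strip(), f.readline().strip()]
    return SPDX_HEADER in first_lines

def main(paths: list[str]) -> int:
    missing = [p for p in paths if not file_has_header(p)]
    for p in missing:
        print(f"Missing SPDX header: {p}")
    return 1 if missing else 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))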
13 changes: 7 additions & 6 deletions CMakeLists.txt
@@ -279,7 +279,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.6.0
GIT_TAG v3.7.0
GIT_PROGRESS TRUE

# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@@ -333,7 +333,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
@@ -485,9 +490,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
endif()

message(STATUS "Enabling C extension.")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_C_LIBS cuda)
endif()
define_gpu_extension_target(
_C
DESTINATION vllm
@@ -496,7 +498,6 @@ define_gpu_extension_target(
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
LIBRARIES ${VLLM_C_LIBS}
USE_SABI 3
WITH_SOABI)

2 changes: 1 addition & 1 deletion CODE_OF_CONDUCT.md
@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement in the #code-of-conduct
channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
channel in the [vLLM Slack](https://slack.vllm.ai).
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
2 changes: 1 addition & 1 deletion Dockerfile
@@ -127,7 +127,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=300
ARG VLLM_MAX_SIZE_MB=400
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
5 changes: 2 additions & 3 deletions Dockerfile.ppc64le
@@ -4,12 +4,12 @@ USER root

ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes

COPY ./ /workspace/vllm

@@ -21,7 +21,6 @@ RUN --mount=type=bind,source=.git,target=.git \
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
torch==2.3.1 \
-r requirements-cpu.txt \
xformers uvloop==0.20.0

8 changes: 4 additions & 4 deletions README.md
@@ -10,12 +10,13 @@ Easy, fast, and cheap LLM serving for everyone
</h3>

<p align="center">
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>

---

*Latest News* 🔥
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
@@ -35,7 +36,7 @@ Easy, fast, and cheap LLM serving for everyone
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.

Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evloved into a community-driven project with contributions from both academia and industry.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.

vLLM is fast with:

@@ -138,8 +139,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us

* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users, please use Discord.
* For coordinating contributions and development, please use Slack.
* For discussing with fellow users and coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.

1 change: 1 addition & 0 deletions benchmarks/P3L.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
Patch-Perplexity (P3L)
1 change: 1 addition & 0 deletions benchmarks/P3L_mling.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
"""
*MULTILINGUAL* Patch-Perplexity (P3L)
2 changes: 2 additions & 0 deletions benchmarks/backend_request_func.py
@@ -1,3 +1,5 @@
# SPDX-License-Identifier: Apache-2.0

import json
import os
import sys
1 change: 1 addition & 0 deletions benchmarks/benchmark_guided.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses
1 change: 1 addition & 0 deletions benchmarks/benchmark_latency.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
1 change: 1 addition & 0 deletions benchmarks/benchmark_long_document_qa_throughput.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
Offline benchmark to test the long document QA throughput.
1 change: 1 addition & 0 deletions benchmarks/benchmark_prefix_caching.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""
Benchmark the efficiency of prefix caching.
1 change: 1 addition & 0 deletions benchmarks/benchmark_prioritization.py
@@ -1,3 +1,4 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark offline prioritization."""
import argparse
import dataclasses