diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..fa6ea236ef04f --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.892 + - name: "exact_match,flexible-extract" + value: 0.892 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml new file mode 100644 index 0000000000000..02668702b83af --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..fb4b4915ab955 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1 +model_name: "meta-llama/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.756 + - name: "exact_match,flexible-extract" + value: 0.752 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml new file mode 100644 index 0000000000000..dec9164d1b84e --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/Mixtral-8x7B-Instruct-v0.1.yaml @@ -0,0 +1,11 @@ +# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.632 +limit: 250 +num_fewshot: 5 diff --git a/.buildkite/lm-eval-harness/configs/models-large.txt b/.buildkite/lm-eval-harness/configs/models-large.txt new file mode 100644 index 0000000000000..127ec5d97bcff --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-large.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x7B-Instruct-v0.1.yaml diff --git a/.buildkite/lm-eval-harness/configs/models-small.txt b/.buildkite/lm-eval-harness/configs/models-small.txt new file mode 100644 index 0000000000000..273c5482db264 --- /dev/null +++ b/.buildkite/lm-eval-harness/configs/models-small.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-8B-Instruct.yaml +Meta-Llama-3-8B-Instruct-FP8.yaml diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh new file mode 100644 index 0000000000000..fdb8ec5393b36 --- /dev/null +++ 
b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 + +usage() { + echo + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo +} + +while getopts "m:b:l:f:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model hf \ + --model_args pretrained=$MODEL,parallelize=True \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..a2876bade8893 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install lm-eval==0.4.2 + +usage() { + echo + echo "Runs lm eval harness on GSM8k using vllm." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh new file mode 100644 index 0000000000000..b4fdde6dab425 --- /dev/null +++ b/.buildkite/lm-eval-harness/run-tests.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +usage() { + echo + echo "Runs lm eval harness on GSM8k using vllm and compares to " + echo "precomputed baseline (measured by HF transformers)." + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. configs/models-small.txt)" + echo " -t - tensor parallel size" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE ===" + + export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG} + export LM_EVAL_TP_SIZE=$TP_SIZE + pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$? + + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" + else + echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" + fi + + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + +done + +if [ "${SUCCESS}" -eq "0" ]; then + exit 0 +else + exit 1 +fi diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py new file mode 100644 index 0000000000000..975841dad1c29 --- /dev/null +++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py @@ -0,0 +1,54 @@ +""" +LM eval harness on model to compare vs HF baseline computed offline. +Configs are found in configs/$MODEL.yaml + +* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml +* export LM_EVAL_TP_SIZE=4 +* pytest -s test_lm_eval_correctness.py +""" + +import os +from pathlib import Path + +import lm_eval +import numpy +import yaml + +RTOL = 0.02 +TEST_DATA_FILE = os.environ.get( + "LM_EVAL_TEST_DATA_FILE", + ".buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml") + +TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1) + + +def launch_lm_eval(eval_config): + model_args = f"pretrained={eval_config['model_name']}," \ + f"tensor_parallel_size={TP_SIZE}" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks=[task["name"] for task in eval_config["tasks"]], + num_fewshot=eval_config["num_fewshot"], + limit=eval_config["limit"], + batch_size="auto") + + return results + + +def test_lm_eval_correctness(): + eval_config = yaml.safe_load( + Path(TEST_DATA_FILE).read_text(encoding="utf-8")) + + # Launch eval requests. + results = launch_lm_eval(eval_config) + + # Confirm scores match ground truth. + for task in eval_config["tasks"]: + for metric in task["metrics"]: + ground_truth = metric["value"] + measured_value = results["results"][task["name"]][metric["name"]] + print(f'{task["name"]} | {metric["name"]}: ' + f'ground_truth={ground_truth} | measured={measured_value}') + assert numpy.isclose(ground_truth, measured_value, rtol=RTOL) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md new file mode 100644 index 0000000000000..4036b32a46bf7 --- /dev/null +++ b/.buildkite/nightly-benchmarks/README.md @@ -0,0 +1,103 @@ +# vLLM benchmark suite + +## Introduction + +This directory contains the performance benchmarking CI for vllm. +The goal is to help developers understand the impact of their PRs on the performance of vllm. + +This benchmark will be *triggered* upon: +- A PR being merged into vllm. +- Every commit for PRs with the `perf-benchmarks` label. + +**Benchmarking Coverage**: latency, throughput and fixed-QPS serving on A100 (support for more GPUs is coming later), with different models. + +**Benchmarking Duration**: about 1 hr. + +**For benchmarking developers**: please try your best to constrain the duration of benchmarking to less than 1.5 hr so that it won't take forever to run. + + +## Configuring the workload + +The benchmarking workload contains three parts: +- Latency tests in `latency-tests.json`. +- Throughput tests in `throughput-tests.json`. +- Serving tests in `serving-tests.json`.
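Parameter names in these JSON files use underscores; `run-benchmarks-suite.sh` converts them to dashed command-line flags via its `json2args` helper before invoking the benchmark scripts (see the Latency test section below). As a rough illustration only — this Python helper is not part of the PR, which uses a bash/jq implementation — the mapping is equivalent to:

```python
# Illustrative sketch of the underscore-to-dash conversion performed by the
# json2args helper in run-benchmarks-suite.sh (hypothetical Python equivalent).
def params_to_cli_args(params: dict) -> str:
    """Turn {"tensor_parallel_size": 1} into "--tensor-parallel-size 1"."""
    return " ".join(f"--{key.replace('_', '-')} {value}".rstrip()
                    for key, value in params.items())


example = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print(params_to_cli_args(example))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```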
+ +See [descriptions.md](tests/descriptions.md) for detailed descriptions. + +### Latency test + +Here is an example of one test inside `latency-tests.json`: + +```json +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, +] +``` + +In this example: +- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`. +- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Note: please use an underscore `_` instead of a dash `-` when specifying the command line arguments; `run-benchmarks-suite.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`. + +Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly. + +WARNING: The benchmarking script will save json results by itself, so please do not configure the `--output-json` parameter in the json file. + + +### Throughput test +The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters will be forwarded to `benchmark_throughput.py`. + +The number reported by this test is also stable -- even a slight change in this number can indicate a real change in performance. + +### Serving test +We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example: + +```json +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, +] +``` + +Inside this example: +- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`. +- The `server_parameters` attribute includes the command line arguments for the vLLM server. +- The `client_parameters` attribute includes the command line arguments for `benchmark_serving.py`. +- The `qps_list` attribute controls the list of QPS values to test. It is used to configure the `--request-rate` parameter of `benchmark_serving.py`. + +The numbers from this test are less stable than the latency and throughput benchmarks (due to randomized ShareGPT dataset sampling inside `benchmark_serving.py`), but a large change in these numbers (e.g. a 5% change) still indicates a real difference in performance. + +WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`. + +## Visualizing the results +The `convert-results-json-to-markdown.py` script puts the benchmarking results into a markdown table by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results.
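Under the hood this is a `str.format` call over the `tests/descriptions.md` template, with the tables rendered by `tabulate`; a minimal sketch of that flow follows (the dataframe row below is made up, and only the latency placeholder is filled for brevity):

```python
# Simplified sketch of what convert-results-json-to-markdown.py does: render
# each results dataframe as a pipe-style markdown table and substitute it into
# the placeholders of tests/descriptions.md.
from pathlib import Path

import pandas as pd
from tabulate import tabulate

# A made-up latency result row, standing in for the real benchmark output.
latency_results = pd.DataFrame([{
    "Test name": "latency_llama8B_tp1",
    "GPU": "A100",
    "Mean latency (ms)": 1234.5,
}])

template = Path(".buildkite/nightly-benchmarks/tests/descriptions.md").read_text()
report = template.format(
    latency_tests_markdown_table=tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False),
    throughput_tests_markdown_table="(omitted in this sketch)",
    serving_tests_markdown_table="(omitted in this sketch)",
    benchmarking_results_in_json_string="{}",
)
Path("benchmark_results.md").write_text(report)
```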
+You can find the result presented as a table inside the `buildkite/performance-benchmark` job page. +If you do not see the table, please wait until the benchmark finishes running. +The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file. +The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job. diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml new file mode 100644 index 0000000000000..2b25c954b5c5c --- /dev/null +++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml @@ -0,0 +1,62 @@ +steps: + - label: "Wait for container to be ready" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: badouralix/curl-jq + command: + - sh + - .buildkite/nightly-benchmarks/scripts/wait-for-image.sh + - wait + - label: "A100 Benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + command: + - bash .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + # - label: "H100: NVIDIA SMI" + # agents: + # queue: H100 + # plugins: + # - docker#v5.11.0: + # image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + # command: + # - bash + # - .buildkite/nightly-benchmarks/run-benchmarks-suite.sh + # mount-buildkite-agent: true + # propagate-environment: true + # propagate-uid-gid: false + # ipc: host + # gpus: all + # environment: + # - VLLM_USAGE_SOURCE + # - HF_TOKEN + diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index d3bf3b72980a6..15d411febcee1 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash +# NOTE(simon): this script runs inside a buildkite agent with CPU only access. set -euo pipefail # Install system packages @@ -23,4 +24,4 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then fi # Upload sample.yaml -buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml +buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml diff --git a/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh new file mode 100644 index 0000000000000..021473f76d0e5 --- /dev/null +++ b/.buildkite/nightly-benchmarks/run-benchmarks-suite.sh @@ -0,0 +1,358 @@ +#!/bin/bash + +# This script should be run inside the CI process +# This script assumes that we are already inside the vllm/ directory +# Benchmarking results will be available inside vllm/benchmarks/results/ + +# Do not set -e, as the mixtral 8x22B model tends to crash occasionally +# and we still want to see other benchmarking results even when mixtral crashes. +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found."
+ else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + +kill_gpu_processes() { + # kill all processes on GPU. + pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) + if [ -z "$pids" ]; then + echo "No GPU processes found." + else + for pid in $pids; do + kill -9 "$pid" + echo "Killed process with PID: $pid" + done + + echo "All GPU processes have been killed." + fi + + # waiting for GPU processes to be fully killed + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/benchmark_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + +run_latency_tests() { + # run latency tests using `benchmark_latency.py` + # $1: a json file specifying latency test cases + + local latency_test_file + latency_test_file=$1 + + # Iterate over latency tests + jq -c '.[]' "$latency_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^latency_ ]]; then + echo "In latency-test.json, test_name must start with \"latency_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + latency_params=$(echo "$params" | jq -r '.parameters') + latency_args=$(json2args "$latency_params") + + # check if there is enough GPU to run the test + tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." 
+ continue + fi + + latency_command="python3 benchmark_latency.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $latency_args" + + echo "Running test case $test_name" + echo "Latency command: $latency_command" + + # recoding benchmarking command ang GPU command + jq_output=$(jq -n \ + --arg latency "$latency_command" \ + --arg gpu "$gpu_type" \ + '{ + latency_command: $latency, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$latency_command" + + kill_gpu_processes + + done +} + + +run_throughput_tests() { + # run throughput tests using `benchmark_throughput.py` + # $1: a json file specifying throughput test cases + + local throughput_test_file + throughput_test_file=$1 + + # Iterate over throughput tests + jq -c '.[]' "$throughput_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^throughput_ ]]; then + echo "In throughput-test.json, test_name must start with \"throughput_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get arguments + throughput_params=$(echo "$params" | jq -r '.parameters') + throughput_args=$(json2args "$throughput_params") + + # check if there is enough GPU to run the test + tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + continue + fi + + throughput_command="python3 benchmark_throughput.py \ + --output-json $RESULTS_FOLDER/${test_name}.json \ + $throughput_args" + + echo "Running test case $test_name" + echo "Throughput command: $throughput_command" + # recoding benchmarking command ang GPU command + jq_output=$(jq -n \ + --arg command "$throughput_command" \ + --arg gpu "$gpu_type" \ + '{ + throughput_command: $command, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/$test_name.commands" + + # run the benchmark + eval "$throughput_command" + + kill_gpu_processes + + done +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + if [[ ! "$test_name" =~ ^serving_ ]]; then + echo "In serving-test.json, test_name must start with \"serving_\"." + exit 1 + fi + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." 
+ continue + fi + + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.server_parameters') + client_params=$(echo "$params" | jq -r '.client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + continue + fi + + # check if server model and client model is aligned + server_model=$(echo "$server_params" | jq -r '.model') + client_model=$(echo "$client_params" | jq -r '.model') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $testname." + continue + fi + + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu + }') + echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + done +} + +main() { + check_gpus + check_hf_token + + # dependencies + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + + # get the current IP address, required by benchmark_serving.py + export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') + # turn of the reporting of the status of each request, to clean up the terminal output + export VLLM_LOG_LEVEL="WARNING" + + # prepare for benchmarking + cd benchmarks || exit 1 + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + # benchmarking + run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json + run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json + run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json + + + # postprocess benchmarking results + pip install tabulate pandas + python3 $QUICK_BENCHMARK_ROOT/scripts/convert-results-json-to-markdown.py + + upload_to_buildkite +} + +main "$@" diff --git 
a/.buildkite/nightly-benchmarks/sample.yaml b/.buildkite/nightly-benchmarks/sample.yaml deleted file mode 100644 index 50e6e82072186..0000000000000 --- a/.buildkite/nightly-benchmarks/sample.yaml +++ /dev/null @@ -1,39 +0,0 @@ -steps: - # NOTE(simon): You can create separate blocks for different jobs - - label: "A100: NVIDIA SMI" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - containers: - # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT - # TODO(simon): check latest main branch or use the PR image. - - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6 - command: - - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls' - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - # TODO(simon): bring H100 online - # - label: "H100: NVIDIA SMI" - # agents: - # queue: H100 - # plugins: - # - docker#v5.11.0: - # image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6 - # command: - # - bash -c 'nvidia-smi && nvidia-smi topo -m' - # propagate-environment: true - # ipc: host - # gpus: all - diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py new file mode 100644 index 0000000000000..534ecf17930e9 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py @@ -0,0 +1,192 @@ +import json +import os +from pathlib import Path + +import pandas as pd +from tabulate import tabulate + +results_folder = Path("results/") + +# latency results and the keys that will be printed into markdown +latency_results = [] +latency_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "avg_latency": "Mean latency (ms)", + # "P10": "P10 (s)", + # "P25": "P25 (s)", + "P50": "Median latency (ms)", + # "P75": "P75 (s)", + # "P90": "P90 (s)", + "P99": "P99 latency (ms)", +} + +# throughput tests and the keys that will be printed into markdown +throughput_results = [] +throughput_results_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + # "num_requests": "# of req.", + # "total_num_tokens": "Total # of tokens", + # "elapsed_time": "Elapsed time (s)", + "requests_per_second": "Tput (req/s)", + # "tokens_per_second": "Tput (tok/s)", +} + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + # "completed": "# of req.", + "request_throughput": "Tput (req/s)", + # "input_throughput": "Input Tput (tok/s)", + # "output_throughput": "Output Tput (tok/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", + "p99_ttft_ms": "P99 TTFT (ms)", + # "mean_tpot_ms": "Mean TPOT (ms)", + # "median_tpot_ms": "Median", + # "p99_tpot_ms": "P99", + "mean_itl_ms": "Mean ITL (ms)", + "median_itl_ms": "Median ITL (ms)", + "p99_itl_ms": "P99 ITL (ms)", +} + + +def read_markdown(file): + if os.path.exists(file): + with open(file, "r") as f: + return f.read() + "\n" + else: + return f"{file} not found.\n" + + +def results_to_json(latency, throughput, serving): + return json.dumps({ + 'latency': latency.to_dict(), + 'throughput': throughput.to_dict(), + 'serving': serving.to_dict() + }) + + +if 
__name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file, "r") as f: + raw_result = json.loads(f.read()) + + if "serving" in str(test_file): + # this result is generated via `benchmark_serving.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + elif "latency" in f.name: + # this result is generated via `benchmark_latency.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # get different percentiles + for perc in [10, 25, 50, 75, 90, 99]: + # Multiply 1000 to convert the time unit from s to ms + raw_result.update( + {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}) + raw_result["avg_latency"] = raw_result["avg_latency"] * 1000 + + # add the result to raw_result + latency_results.append(raw_result) + continue + + elif "throughput" in f.name: + # this result is generated via `benchmark_throughput.py` + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + throughput_results.append(raw_result) + continue + + print(f"Skipping {test_file}") + + latency_results = pd.DataFrame.from_dict(latency_results) + serving_results = pd.DataFrame.from_dict(serving_results) + throughput_results = pd.DataFrame.from_dict(throughput_results) + + raw_results_json = results_to_json(latency_results, throughput_results, + serving_results) + + # remapping the key, for visualization purpose + if not latency_results.empty: + latency_results = latency_results[list( + latency_column_mapping.keys())].rename( + columns=latency_column_mapping) + if not serving_results.empty: + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + if not throughput_results.empty: + throughput_results = throughput_results[list( + throughput_results_column_mapping.keys())].rename( + columns=throughput_results_column_mapping) + + processed_results_json = results_to_json(latency_results, + throughput_results, + serving_results) + + # get markdown tables + latency_md_table = tabulate(latency_results, + headers='keys', + tablefmt='pipe', + showindex=False) + serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + throughput_md_table = tabulate(throughput_results, + headers='keys', + tablefmt='pipe', + showindex=False) + + # document the result + with open(results_folder / "benchmark_results.md", "w") as f: + + results = read_markdown( + "../.buildkite/nightly-benchmarks/tests/descriptions.md") + results = results.format( + latency_tests_markdown_table=latency_md_table, + throughput_tests_markdown_table=throughput_md_table, + serving_tests_markdown_table=serving_md_table, + benchmarking_results_in_json_string=processed_results_json) + f.write(results) + + # document benchmarking results in json + with 
open(results_folder / "benchmark_results.json", "w") as f: + + results = latency_results.to_dict( + orient='records') + throughput_results.to_dict( + orient='records') + serving_results.to_dict(orient='records') + f.write(json.dumps(results)) diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh new file mode 100644 index 0000000000000..c785e6a0da628 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -0,0 +1,17 @@ +#!/bin/sh +TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) +URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" + +retries=0 +while [ $retries -lt 1000 ]; do + if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then + exit 0 + fi + + echo "Waiting for image to be available..." + + retries=$((retries + 1)) + sleep 5 +done + +exit 1 \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/tests/descriptions.md b/.buildkite/nightly-benchmarks/tests/descriptions.md new file mode 100644 index 0000000000000..891e4917070d9 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/descriptions.md @@ -0,0 +1,67 @@ + +## Latency tests + +This test suite aims to test vllm's end-to-end latency under a controlled setup. + +- Input length: 32 tokens. +- Output length: 128 tokens. +- Batch size: fixed (8). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: end-to-end latency (mean, median, p99). + +### Latency benchmarking results + +{latency_tests_markdown_table} + +## Throughput tests + +This test suite aims to test vllm's throughput. + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm to achieve maximum throughput. +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput. + +### Throughput benchmarking results + +{throughput_tests_markdown_table} + +## Serving tests + +This test suite aims to test vllm's real serving metrics. + +- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 200 prompts. +- Batch size: dynamically determined by vllm and the arrival pattern of the requests. +- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). + +### Serving benchmarking results + +{serving_tests_markdown_table} + +## json version of the benchmarking tables + +This section contains the data of the markdown tables above in JSON format. 
+You can load the benchmarking tables into pandas dataframes as follows: + +```python +import json +import pandas as pd + +benchmarking_results_json = """The json string""" +benchmarking_results = json.loads(benchmarking_results_json) +latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"]) +throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"]) +serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"]) +``` + +The json string for all benchmarking tables: +```json +{benchmarking_results_in_json_string} +``` + +You can also check the raw experiment data in the Artifact tab of the Buildkite page. + diff --git a/.buildkite/nightly-benchmarks/tests/latency-tests.json b/.buildkite/nightly-benchmarks/tests/latency-tests.json new file mode 100644 index 0000000000000..06488cd79110a --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + } +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/tests/serving-tests.json b/.buildkite/nightly-benchmarks/tests/serving-tests.json new file mode 100644 index 0000000000000..86a0fefa339f7 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json @@ -0,0 +1,59 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/tests/throughput-tests.json 
b/.buildkite/nightly-benchmarks/tests/throughput-tests.json new file mode 100644 index 0000000000000..41ac135748704 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json @@ -0,0 +1,35 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] \ No newline at end of file diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml new file mode 100644 index 0000000000000..1959f9752069f --- /dev/null +++ b/.buildkite/release-pipeline.yaml @@ -0,0 +1,21 @@ +steps: + - block: "Build wheels" + + - label: "Build wheel - Python {{matrix.python_version}}, CUDA {{matrix.cuda_version}}" + agents: + queue: cpu_queue + commands: + - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --build-arg PYTHON_VERSION={{matrix.python_version}} --tag vllm-ci:build-image --target build --progress plain ." + - "mkdir artifacts" + - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image cp -r dist /artifacts_host" + - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" + matrix: + setup: + cuda_version: + - "11.8.0" + - "12.1.0" + python_version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 6a86bc0ebfb66..f4fa24be1f20f 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -4,21 +4,23 @@ set -ex # Try building the docker image docker build -t cpu-test -f Dockerfile.cpu . +docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . 
# Setup cleanup -remove_docker_container() { docker rm -f cpu-test || true; } +remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } trap remove_docker_container EXIT remove_docker_container # Run the image docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test +docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test-avx2 cpu-test-avx2 # offline inference docker exec cpu-test bash -c "python3 examples/offline_inference.py" +docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c "cd tests; pip install pytest Pillow protobuf - bash ../.buildkite/download-images.sh cd ../ - pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py" diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh new file mode 100755 index 0000000000000..70e56596c4a86 --- /dev/null +++ b/.buildkite/run-openvino-test.sh @@ -0,0 +1,14 @@ +# This script builds the OpenVINO docker image and runs the offline inference inside the container. +# It serves as a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t openvino-test -f Dockerfile.openvino . + +# Setup cleanup +remove_docker_container() { docker rm -f openvino-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh new file mode 100644 index 0000000000000..22a7e76937a76 --- /dev/null +++ b/.buildkite/run-xpu-test.sh @@ -0,0 +1,14 @@ +# This script builds the XPU docker image and runs the offline inference inside the container. +# It serves as a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t xpu-test -f Dockerfile.xpu . + +# Setup cleanup +remove_docker_container() { docker rm -f xpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 6b12d19ba611f..d96e3c6d192e2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1,7 +1,10 @@ # In this file, you can add more tests to run either by adding a new step or # adding a new command to an existing step. See different options here for examples. -# This script will be feed into Jinja template in `test-template.j2` to generate -# the final pipeline yaml file. + +# This script will be fed into the Jinja template in `test-template-aws.j2` at +# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 +# to generate the final pipeline yaml file.
+ steps: - label: Regression Test @@ -24,37 +27,52 @@ steps: - label: Core Test mirror_hardwares: [amd] - command: pytest -v -s core + commands: + - pytest -v -s core + - pytest -v -s distributed/test_parallel_state.py - label: Distributed Comm Ops Test #mirror_hardwares: [amd] - command: pytest -v -s distributed/test_comm_ops.py working_dir: "/vllm-workspace/tests" num_gpus: 2 + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py -- label: Distributed Tests +- label: Distributed Tests (2 GPUs) mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: + - bash ../.buildkite/download-images.sh - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - - pytest -v -s spec_decode/e2e/test_integration_dist.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py -- label: Distributed Tests (Multiple Groups) +- label: Distributed Tests (4 GPUs) #mirror_hardwares: [amd] working_dir: "/vllm-workspace/tests" num_gpus: 4 commands: - pytest -v -s distributed/test_pynccl.py + # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here. + # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context. 
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - label: Engine Test mirror_hardwares: [amd] @@ -64,8 +82,8 @@ steps: mirror_hardwares: [amd] commands: - - pytest -v -s entrypoints -m llm - - pytest -v -s entrypoints -m openai + - pytest -v -s entrypoints/llm + - pytest -v -s entrypoints/openai - label: Examples Test working_dir: "/vllm-workspace/examples" @@ -95,13 +113,13 @@ steps: - label: Models Test #mirror_hardwares: [amd] commands: - - pytest -v -s models -m \"not llava\" + - pytest -v -s models -m \"not vlm\" -- label: Llava Test +- label: Vision Language Models Test mirror_hardwares: [amd] commands: - bash ../.buildkite/download-images.sh - - pytest -v -s models -m llava + - pytest -v -s models -m vlm - label: Prefix Caching Test mirror_hardwares: [amd] @@ -140,6 +158,9 @@ steps: num_gpus: 4 # This test runs llama 13B, so it is required to run on 4 GPUs. commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -v -s -x lora/test_long_context.py - label: Tensorizer Test @@ -154,6 +175,15 @@ steps: #mirror_hardwares: [amd] command: pytest -v -s quantization +- label: Tracing Test + commands: + - "pip install \ + opentelemetry-sdk \ + opentelemetry-api \ + opentelemetry-exporter-otlp \ + opentelemetry-semantic-conventions-ai" + - pytest -v -s tracing + - label: Benchmarks working_dir: "/vllm-workspace/.buildkite" mirror_hardwares: [amd] @@ -161,9 +191,39 @@ steps: - pip install aiohttp - bash run-benchmarks.sh +- label: LM Eval Small Models + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 + +- label: LM Eval Large Models + gpu: a100 + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + commands: + - pip install lm-eval + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4 + - label: Documentation Build working_dir: "/vllm-workspace/test_docs/docs" no_gpu: True commands: - pip install -r requirements-docs.txt - SPHINXOPTS=\"-W\" make html + +- label: Distributed Tests (A100) + gpu: a100 + num_gpus: 4 + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py + - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.5/flashinfer-0.0.5+cu121torch2.3-cp310-cp310-linux_x86_64.whl + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py + - pytest -v -s 
-x lora/test_mixtral.py diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 deleted file mode 100644 index 3b5d36b246673..0000000000000 --- a/.buildkite/test-template-aws.j2 +++ /dev/null @@ -1,64 +0,0 @@ -{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - agents: - queue: cpu_queue - commands: - - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" - - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." - - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - {% for step in steps %} - - label: "{{ step.label }}" - agents: - {% if step.label == "Documentation Build" %} - queue: small_cpu_queue - {% elif step.no_gpu %} - queue: cpu_queue - {% elif step.num_gpus == 2 or step.num_gpus == 4 %} - queue: gpu_4_queue - {% else %} - queue: gpu_1_queue - {% endif %} - soft_fail: true - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - docker#v5.2.0: - image: {{ docker_image }} - always-pull: true - propagate-environment: true - {% if not step.no_gpu %} - gpus: all - {% endif %} - {% if step.label == "Benchmarks" %} - mount-buildkite-agent: true - {% endif %} - command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"] - environment: - - VLLM_USAGE_SOURCE=ci-test - - HF_TOKEN - {% if step.label == "Speculative decoding tests" %} - - VLLM_ATTENTION_BACKEND=XFORMERS - {% endif %} - volumes: - - /dev/shm:/dev/shm - {% endfor %} diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 deleted file mode 100644 index 4a20a462b98ec..0000000000000 --- a/.buildkite/test-template.j2 +++ /dev/null @@ -1,96 +0,0 @@ -{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %} -{% set default_num_gpu = 1 %} -{% set default_working_dir = "/vllm-workspace/tests" %} - -steps: - - label: ":docker: build image" - commands: - - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ." 
- - "docker push {{ docker_image }}" - env: - DOCKER_BUILDKIT: "1" - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - - wait - - - group: "AMD Tests" - depends_on: ~ - steps: - {% for step in steps %} - {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %} - - label: "AMD: {{ step.label }}" - agents: - queue: amd - command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}" - env: - DOCKER_BUILDKIT: "1" - soft_fail: true - {% endif %} - {% endfor %} - - - label: "Neuron Test" - depends_on: ~ - agents: - queue: neuron - command: bash .buildkite/run-neuron-test.sh - soft_fail: false - - - label: "Intel Test" - depends_on: ~ - agents: - queue: intel - command: bash .buildkite/run-cpu-test.sh - - {% for step in steps %} - - label: "{{ step.label }}" - agents: - queue: kubernetes - soft_fail: {{ step.soft_fail or false }} - {% if step.parallelism %} - parallelism: {{ step.parallelism }} - {% endif %} - retry: - automatic: - - exit_status: -1 # Agent was lost - limit: 5 - - exit_status: -10 # Agent was lost - limit: 5 - plugins: - - kubernetes: - podSpec: - {% if step.num_gpus %} - priorityClassName: gpu-priority-cls-{{ step.num_gpus }} - {% endif %} - volumes: - - name: dshm - emptyDir: - medium: Memory - containers: - - image: "{{ docker_image }}" - command: ["bash"] - args: - - '-c' - - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" - {% if not step.no_gpu %} - resources: - requests: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - limits: - nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}" - {% endif %} - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - volumeMounts: - - mountPath: /dev/shm - name: dshm - {% endfor %} diff --git a/.github/actions/nm-build-docker/action.yml b/.github/actions/nm-build-docker/action.yml index 44394943ff1f5..dd005542fcfbd 100644 --- a/.github/actions/nm-build-docker/action.yml +++ b/.github/actions/nm-build-docker/action.yml @@ -3,20 +3,16 @@ description: 'build docker image for nm-vllm' inputs: docker_tag: description: "tag to be used for the docker image" - type: string required: true extra_tag: description: "additional tag for the docker image" - type: string required: true - build_type: + wf_category: description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE" - type: string - default: 'NIGHTLY' + required: true build_version: description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531" - type: string - default: 'latest' + required: true runs: using: composite steps: @@ -29,11 +25,11 @@ runs: # build status=0 docker build --tag ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} \ - --build-arg build_type=${{ inputs.build_type }} \ + --build-arg build_type=${{ inputs.wf_category }} \ --build-arg build_version=${{ inputs.build_version }} \ --target vllm-openai . || status=$? 
if [ ${status} -eq 0 ]; then - echo "Add tag ${{ inputs.extra_tag }} for "${{ inputs.build_type }}" build too" + echo "Add tag ${{ inputs.extra_tag }} for "${{ inputs.wf_category }}" build too" docker image tag ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.extra_tag }} || ((status+=$?)) fi docker image ls -a diff --git a/.github/actions/nm-build-vllm/action.yml b/.github/actions/nm-build-vllm/action.yml index c78a9a8b27d65..fc629242248fb 100644 --- a/.github/actions/nm-build-vllm/action.yml +++ b/.github/actions/nm-build-vllm/action.yml @@ -7,15 +7,9 @@ inputs: venv: description: 'name for python virtual environment' required: true - pypi: - description: 'ip address for pypi server' - required: true outputs: - build_status: - description: "final status from 'pip install -e'" - value: ${{ steps.build.outputs.build_status }} whl_status: - description: "final status from 'pip3 wheel --no-deps -w dist'" + description: "final status from constructing the whl" value: ${{ steps.build.outputs.whl_status }} whl: description: 'basename for generated whl' @@ -31,35 +25,16 @@ runs: COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate - # TODO: adjust when we need a proper release. use nightly now. pip3 install -r requirements-cuda.txt -r requirements-build.txt - # build - SUCCESS=0 - pip3 install -e . || SUCCESS=$? - echo "build_status=${SUCCESS}" >> "$GITHUB_OUTPUT" - if [ ${SUCCESS} -ne 0 ]; then - exit 1 - fi - # strip binaries - if [ ! $(command -v strip) ]; then - sudo apt install -y binutils - fi - if [ ! $(command -v file) ]; then - sudo apt install -y file - fi - for eachso in $(find . -type f -name '*.so') - do - strip $eachso - file $eachso - done # whl SUCCESS=0 - pip3 wheel --no-deps -w dist . || SUCCESS=$? + python setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 || SUCCESS=$? echo "whl_status=${SUCCESS}" >> "$GITHUB_OUTPUT" - BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) ls -alh dist - WHL_FILEPATH=$(find dist -iname "*${BASE}*.whl") + WHL_FILEPATH=$(find dist -type f -iname "*linux_x86_64.whl") + echo "whl: ${WHL_FILEPATH}" RENAME=$(echo ${WHL_FILEPATH} | sed -e 's/linux_x86_64/manylinux_2_17_x86_64/') + echo "rename: ${RENAME}" mv ${WHL_FILEPATH} ${RENAME} WHL=$(basename ${RENAME}) echo "whl=${WHL}" >> "$GITHUB_OUTPUT" @@ -67,7 +42,7 @@ runs: exit 1 fi # sdist - python3 setup.py sdist || SUCCESS=$? + python setup.py sdist || SUCCESS=$? 
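# Illustrative sketch, not taken from this patch: the wheel rename above only rewrites the
# platform tag in the filename. With a hypothetical wheel name (the real version and python
# tags depend on the build), the transformation and the version field it preserves look like:
WHL_FILEPATH="dist/nm_vllm-0.5.0-cp38-abi3-linux_x86_64.whl"   # hypothetical example filename
RENAME=$(echo "${WHL_FILEPATH}" | sed -e 's/linux_x86_64/manylinux_2_17_x86_64/')
echo "${RENAME}"        # dist/nm_vllm-0.5.0-cp38-abi3-manylinux_2_17_x86_64.whl
BUILD_VERSION=$(basename "${RENAME}" | cut -d'-' -f2)
echo "${BUILD_VERSION}" # 0.5.0 -- the same field nm-get-docker-tags (added below) extracts with cut -d'-' -f2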
pyenv uninstall --force ${{ inputs.python}}/envs/${VENV} ls -alh dist TAR_FILEPATH=$(find dist -type f -iname "*.tar.gz") diff --git a/.github/actions/nm-caches/action.yml b/.github/actions/nm-caches/action.yml new file mode 100644 index 0000000000000..db4db069a7d7a --- /dev/null +++ b/.github/actions/nm-caches/action.yml @@ -0,0 +1,11 @@ +name: set up caches +description: 'set up HF and Python caches' +runs: + using: composite + steps: + - run: | + sudo mkdir -m 777 -p ${HF_HOME} + sudo chown -R $(whoami):$(whoami) ${HF_HOME} + sudo mkdir -m 777 -p ${PIP_CACHE_DIR} + sudo chown -R $(whoami):$(whoami) ${PIP_CACHE_DIR} + shell: bash diff --git a/.github/actions/nm-cp-assets/action.yml b/.github/actions/nm-cp-assets/action.yml index f4ccf88d38ac0..b4957d30d3b25 100644 --- a/.github/actions/nm-cp-assets/action.yml +++ b/.github/actions/nm-cp-assets/action.yml @@ -1,20 +1,15 @@ name: cp assets description: "cp whl and tarfile to Google storage 'neuralmagic-public-pypi/dist'" -inputs: - python: - description: 'python version, e.g. 3.10.12' - required: true runs: using: composite steps: - - id: mv_assets + - id: cp_assets run: | - VERSION_BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) - WHL=$(find assets -type f -name "*nm_vllm*${VERSION_BASE}*.whl") + WHL=$(find assets -type f -name "*nm_vllm*.whl") WHL_FILE=$(basename ${WHL}) echo "whl: ${WHL}" echo "whl_file: ${WHL_FILE}" - TAR=$(find assets -path "*${{ inputs.python }}-nm-vllm*.tar.gz" -type f -name "nm-vllm*.tar.gz") + TAR=$(find assets -path "*nm-vllm*.tar.gz" -type f -name "nm-vllm*.tar.gz") TAR_FILE=$(basename ${TAR}) echo "tar: ${TAR}" echo "tar_file: ${TAR_FILE}" diff --git a/.github/actions/nm-get-docker-tag/action.yml b/.github/actions/nm-get-docker-tag/action.yml deleted file mode 100644 index 93abe52a5d97f..0000000000000 --- a/.github/actions/nm-get-docker-tag/action.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Get additinal docker image tag based on build type -description: 'docker image tag for nm-vllm' -inputs: - build_type: - description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE" - type: string - default: 'NIGHTLY' -outputs: - tag: - description: "extra tag for the docker image based on build type" - value: ${{ steps.extratag.outputs.tag }} -runs: - using: composite - steps: - - id: extratag - run: | - tag=nightly - if [[ "${{ inputs.build_type }}" = "RELEASE" ]]; then - tag=latest - fi - echo "tag=${tag}" >> $GITHUB_OUTPUT - shell: bash diff --git a/.github/actions/nm-get-docker-tags/action.yml b/.github/actions/nm-get-docker-tags/action.yml new file mode 100644 index 0000000000000..c7b9cf85e69e7 --- /dev/null +++ b/.github/actions/nm-get-docker-tags/action.yml @@ -0,0 +1,46 @@ +name: Get docker image tags based on build type and the image version based on wheel name +description: 'docker image tags for nm-vllm' +inputs: + wf_category: + description: "type of nm-vllm to install for the docker image: NIGHTLY or RELEASE" + required: true + whl: + description: "name of nm-vllm wheel to install for the docker image" + required: true +outputs: + tag: + description: "tag for the docker image based on wheel version" + value: ${{ steps.tags.outputs.tag }} + extra_tag: + description: "extra tag for the docker image based on build type, either latest (for RELEASE) or nightly (for NIGHTLY)" + value: ${{ steps.tags.outputs.extra_tag }} + build_version: + description: "version of nm-vllm, e.g. 
0.4.0, 0.4.0.20240531" + value: ${{ steps.tags.outputs.build_version }} +runs: + using: composite + steps: + - id: tags + run: | + BUILD_VERSION=`echo "${{ inputs.whl }}" | cut -d'-' -f2` + if [[ "${{ inputs.wf_category }}" == "RELEASE" ]]; then + if [[ "${BUILD_VERSION}" =~ ^[0-9]+.[0-9]+.[0-9]+$ ]]; then + TAG="v${BUILD_VERSION}" + EXTRA_TAG=latest + else + echo "ERROR: wheel version ${BUILD_VERSION} doesn't match RELEASE format. Check input." + exit 1 + fi + else + if [[ "${BUILD_VERSION}" =~ ^[0-9]+.[0-9]+.[0-9]+.[0-9]{8}$ ]]; then + TAG=`echo "${BUILD_VERSION}" | cut -d'.' -f4` + EXTRA_TAG=nightly + else + echo "ERROR: wheel version ${BUILD_VERSION} doesn't match NIGHTLY format. Check input." + exit 1 + fi + fi + echo "tag=${TAG}" >> $GITHUB_OUTPUT + echo "extra_tag=${EXTRA_TAG}" >> $GITHUB_OUTPUT + echo "build_version=${BUILD_VERSION}" >> $GITHUB_OUTPUT + shell: bash diff --git a/.github/actions/nm-hf-cache/action.yml b/.github/actions/nm-hf-cache/action.yml deleted file mode 100644 index 62f54703c9e65..0000000000000 --- a/.github/actions/nm-hf-cache/action.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: HF cache -description: 'mount HF cache' -inputs: - fs_cache: - description: '(deprecated) filesystem to use for HF cache' - required: true -runs: - using: composite - steps: - - run: | - sudo mkdir -m 777 -p ${HF_HOME} - sudo chown -R $(whoami):$(whoami) ${HF_HOME} - shell: bash diff --git a/.github/actions/nm-install-testmo/action.yml b/.github/actions/nm-install-testmo/action.yml deleted file mode 100644 index 97e836f7e2cd9..0000000000000 --- a/.github/actions/nm-install-testmo/action.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: install testmo cli -description: 'install testmo cli' -runs: - using: composite - steps: - - id: testmo_install - run: | - sudo mkdir -p /usr/local/apps - sudo chown -R $(whoami):$(whoami) /usr/local/apps - export XDG_CONFIG_HOME=/usr/local/apps - cd /usr/local/apps - curl -o- --retry 8 --retry-delay 16 --retry-max-time 600 https://raw.githubusercontent.com/nvm-sh/nvm/v0.39.5/install.sh | bash - export NVM_DIR='/usr/local/apps/nvm' - [ -s "/usr/local/apps/nvm/nvm.sh" ] && \. "/usr/local/apps/nvm/nvm.sh"; \ - nvm install 19; \ - npm install --global --save-prod @testmo/testmo-cli - shell: bash diff --git a/.github/actions/nm-install-whl/action.yml b/.github/actions/nm-install-whl/action.yml index e229d0bc95d2b..95d6722bb658f 100644 --- a/.github/actions/nm-install-whl/action.yml +++ b/.github/actions/nm-install-whl/action.yml @@ -22,8 +22,7 @@ runs: source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate fi pip3 install -r requirements-dev.txt - BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) - WHL=$(find . -type f -iname "*${BASE}*.whl") + WHL=$(find . -type f -iname "nm_vllm*.whl") WHL_BASENAME=$(basename ${WHL}) echo "whl=${WHL_BASENAME}" >> "$GITHUB_OUTPUT" pip3 install ${WHL}[sparse] --extra-index-url https://pypi.neuralmagic.com/simple diff --git a/.github/actions/nm-lm-eval-smoke/action.yml b/.github/actions/nm-lm-eval-smoke/action.yml deleted file mode 100644 index 527909bf68786..0000000000000 --- a/.github/actions/nm-lm-eval-smoke/action.yml +++ /dev/null @@ -1,32 +0,0 @@ -name: run lm-eval accuracy smoke test -description: 'run lm-eval accuracy smoke test' -inputs: - python: - description: 'python version, e.g. 
3.10.12' - required: true - venv: - description: 'name for python virtual environment' - required: true -runs: - using: composite - steps: - - id: lm-eval - run: | - # move source directories - mv vllm vllm-ignore || echo "no 'vllm' folder to move" - mv csrc csrc-ignore || echo "no 'csrc' folder to move" - - if [ -n "${{ inputs.venv }}" ]; then - COMMIT=${{ github.sha }} - VENV="${{ inputs.venv }}-${COMMIT:0:7}" - source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate - fi - - pip3 install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 - pip3 install optimum auto-gptq - - SUCCESS=0 - python .github/scripts/lm_eval_compare_hf_vs_vllm.py --hf_pretrained nm-testing/zephyr-beta-7b-gptq-g128 --vllm_pretrained nm-testing/zephyr-beta-7b-marlin-g128 || SUCCESS=$? - echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" - exit ${SUCCESS} - shell: bash diff --git a/.github/actions/nm-lm-eval-accuracy/action.yml b/.github/actions/nm-lm-eval/action.yml similarity index 64% rename from .github/actions/nm-lm-eval-accuracy/action.yml rename to .github/actions/nm-lm-eval/action.yml index ee8c78c8855a3..f7f0b07fcf080 100644 --- a/.github/actions/nm-lm-eval-accuracy/action.yml +++ b/.github/actions/nm-lm-eval/action.yml @@ -1,5 +1,5 @@ -name: run lm-eval full accuracy test -description: 'run lm-eval full accuracy test' +name: run lm-eval accuracy test +description: 'run lm-eval accuracy test' inputs: python: description: 'python version, e.g. 3.10.12' @@ -7,15 +7,14 @@ inputs: venv: description: 'name for python virtual environment' required: true + lm_eval_configuration: + description: 'file containing test configuration' + required: true runs: using: composite steps: - id: lm-eval run: | - # move source directories - mv vllm vllm-ignore || echo "no 'vllm' folder to move" - mv csrc csrc-ignore || echo "no 'csrc' folder to move" - if [ -n "${{ inputs.venv }}" ]; then COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" @@ -26,7 +25,7 @@ runs: pip3 install pytest openai==1.3.9 SUCCESS=0 - pytest -v tests/accuracy/test_lm_eval_correctness.py || SUCCESS=$? - echo "test=${SUCCESS}" >> "$GITHUB_OUTPUT" + ./.github/scripts/nm-run-lm-eval-vllm.sh -c ${{ inputs.lm_eval_configuration }} || SUCCESS=$? 
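# Illustrative usage, not taken from this patch: lm_eval_configuration is one of the plain-text
# lists under .github/lm-eval-configs (added later in this diff); each line names a model YAML
# under .github/lm-eval-configs/models. Run from the nm-vllm repo root, a smoke run of the same
# entry point would be roughly:
./.github/scripts/nm-run-lm-eval-vllm.sh -c .github/lm-eval-configs/smoke-small-models.txt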
+ echo "lm_eval=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} shell: bash diff --git a/.github/actions/nm-set-env/action.yml b/.github/actions/nm-set-env/action.yml index fbd80377e687f..f9f987d7afda4 100644 --- a/.github/actions/nm-set-env/action.yml +++ b/.github/actions/nm-set-env/action.yml @@ -28,6 +28,8 @@ runs: # HF Cache echo "HF_TOKEN=${HF_TOKEN_SECRET}" >> $GITHUB_ENV echo "HF_HOME=/model-cache" >> $GITHUB_ENV + # Python cache + echo "PIP_CACHE_DIR=/model-cache/python-cache" >> $GITHUB_ENV # build NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }}) echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV @@ -39,6 +41,13 @@ runs: # testmo echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV echo "PROJECT_ID=12" >> $GITHUB_ENV + # disable usage stats (writes to protected /usr/local/apps) + echo "VLLM_NO_USAGE_STATS=1" >> $GITHUB_ENV + echo "DO_NOT_TRACK=1" >> $GITHUB_ENV + # build type based on wf_category: Release (RELEASE & NIGHTLY); RelWithDebInfo (REMOTE) + if [[ "${{inputs.wf_category}}" != "REMOTE" ]]; then + echo "CMAKE_BUILD_TYPE=Release" >> $GITHUB_ENV + fi env: HF_TOKEN_SECRET: ${{ inputs.hf_token }} shell: bash diff --git a/.github/actions/nm-set-python/action.yml b/.github/actions/nm-set-python/action.yml deleted file mode 100644 index 1a3092b735bd3..0000000000000 --- a/.github/actions/nm-set-python/action.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: set python -description: 'sets python version and creates venv for neuralmagic' -inputs: - python: - description: 'python version, e.g. 3.10.12' - required: true - venv: - description: 'name for python virtual environment' - required: true -outputs: - version: - description: "result from 'python --version'" - value: ${{ steps.set_python.outputs.version }} -runs: - using: composite - steps: - - id: set_python - run: | - command -v pyenv - pyenv local ${{ inputs.python }} - COMMIT=${{ github.sha }} - VENV="${{ inputs.venv }}-${COMMIT:0:7}" - pyenv virtualenv --force ${VENV} - source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate - VERSION=$(python --version) - echo "version=${VERSION}" >> "$GITHUB_OUTPUT" - shell: bash diff --git a/.github/actions/nm-summary-build/action.yml b/.github/actions/nm-summary-build/action.yml index a890f5ad016f2..b6aa0c5b96da4 100644 --- a/.github/actions/nm-summary-build/action.yml +++ b/.github/actions/nm-summary-build/action.yml @@ -13,9 +13,6 @@ inputs: python: description: 'python version info' required: true - build_status: - description: 'status from build step' - required: true whl_status: description: 'status from build step' required: true @@ -23,8 +20,6 @@ runs: using: composite steps: - run: | - BUILD_STATUS=${{ inputs.build_status }} - BUILD_EMOJI=$(./.github/scripts/step-status ${BUILD_STATUS}) WHL_STATUS=${{ inputs.whl_status }} WHL_EMOJI=$(./.github/scripts/step-status ${WHL_STATUS}) echo "testmo URL: ${{ inputs.testmo_run_url }}" >> $GITHUB_STEP_SUMMARY @@ -37,6 +32,5 @@ runs: echo "| gitref: | '${{ inputs.gitref }}' |" >> $GITHUB_STEP_SUMMARY echo "| branch name: | '${{ github.ref_name }}' |" >> $GITHUB_STEP_SUMMARY echo "| python: | ${{ inputs.python }} |" >> $GITHUB_STEP_SUMMARY - echo "| build: | ${BUILD_EMOJI} |" >> $GITHUB_STEP_SUMMARY echo "| whl: | ${WHL_EMOJI} |" >> $GITHUB_STEP_SUMMARY shell: bash diff --git a/.github/actions/nm-test-whl/action.yml b/.github/actions/nm-test-whl/action.yml index 53ca57598f8f8..557374fa11b08 100644 --- a/.github/actions/nm-test-whl/action.yml +++ b/.github/actions/nm-test-whl/action.yml @@ -19,13 
+19,15 @@ runs: steps: - id: test_whl run: | + sudo mkdir -m 777 -p /usr/local/apps + sudo chown -R $(whoami):$(whoami) /usr/local/apps pip install coverage pip install pytest-cov pip install pytest-xdist pip install -r requirements-dev.txt SUCCESS=0 VLLM_SRC=$(python3 -c "import vllm; print(vllm.__path__[0])") - ./.github/scripts/run-tests -s ${VLLM_SRC} -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} -f ${{ inputs.test_skip_list }}|| SUCCESS=$? + ./.github/scripts/run-tests -s ${VLLM_SRC} -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} || SUCCESS=$? pytest ./neuralmagic/tests/test_nm-vllm_licenses.py --junitxml=${{ inputs.test_results }}/test_nm-vllm_licenses.xml echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT" exit ${SUCCESS} diff --git a/.github/data/nm_benchmark_base_config_list.txt b/.github/data/nm_benchmark_base_config_list.txt new file mode 100644 index 0000000000000..8945192390c2a --- /dev/null +++ b/.github/data/nm_benchmark_base_config_list.txt @@ -0,0 +1 @@ +neuralmagic/benchmarks/configs/benchmark_serving.json diff --git a/.github/lm-eval-configs/full-large-models.txt b/.github/lm-eval-configs/full-large-models.txt new file mode 100644 index 0000000000000..55c913a282bc8 --- /dev/null +++ b/.github/lm-eval-configs/full-large-models.txt @@ -0,0 +1,9 @@ +Meta-Llama-3-70B-Instruct-FP8.yaml +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x22B-Instruct-v0.1-FP8.yaml +Mixtral-8x22B-Instruct-v0.1.yaml +Mixtral-8x7B-Instruct-v0.1-FP8.yaml +Mixtral-8x7B-Instruct-v0.1.yaml +Qwen2-57B-A14B-Instruct.yaml +Qwen2-72B-Instruct.yaml +Phi-3-medium-4k-instruct.yaml diff --git a/.github/lm-eval-configs/full-small-models.txt b/.github/lm-eval-configs/full-small-models.txt new file mode 100644 index 0000000000000..caca502f76d04 --- /dev/null +++ b/.github/lm-eval-configs/full-small-models.txt @@ -0,0 +1,7 @@ +gemma-7b-it.yaml +Meta-Llama-3-8B-Instruct-FP8-KV.yaml +Meta-Llama-3-8B-Instruct-FP8.yaml +Meta-Llama-3-8B-Instruct-W4A16.yaml +Meta-Llama-3-8B-Instruct.yaml +Mistral-7B-Instruct-v0.3.yaml +Qwen2-7B-Instruct.yaml diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml new file mode 100644 index 0000000000000..2ef7b975b8bc9 --- /dev/null +++ b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.900 + - name: "exact_match,flexible-extract" + value: 0.900 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml new file mode 100644 index 0000000000000..70f1030fa0007 --- /dev/null +++ b/.github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-70B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.888 + - name: "exact_match,flexible-extract" + value: 0.888 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml new file mode 100644 index 0000000000000..1c46cda9da11a --- /dev/null +++ 
b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.764 + - name: "exact_match,flexible-extract" + value: 0.764 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml new file mode 100644 index 0000000000000..0d077dc19d95a --- /dev/null +++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1 +model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.744 + - name: "exact_match,flexible-extract" + value: 0.740 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml new file mode 100644 index 0000000000000..92d07ad0c734a --- /dev/null +++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -l 250 -f 5 +model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.684 + - name: "exact_match,flexible-extract" + value: 0.688 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml new file mode 100644 index 0000000000000..d7abd6b36bfc6 --- /dev/null +++ b/.github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 +model_name: "meta-llama/Meta-Llama-3-8B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.74 + - name: "exact_match,flexible-extract" + value: 0.74 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml b/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml new file mode 100644 index 0000000000000..592652eed999e --- /dev/null +++ b/.github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mistral-7B-Instruct-v0.3 -b 32 -l 250 -f 5 +model_name: "mistralai/Mistral-7B-Instruct-v0.3" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.524 + - name: "exact_match,flexible-extract" + value: 0.524 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml new file mode 100644 index 0000000000000..8d1eaecf5bec6 --- /dev/null +++ b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5 +model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.840 + - name: "exact_match,flexible-extract" + value: 0.844 +limit: 250 +num_fewshot: 
5 diff --git a/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml new file mode 100644 index 0000000000000..73f00b16c51aa --- /dev/null +++ b/.github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5 +model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.876 + - name: "exact_match,flexible-extract" + value: 0.880 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml new file mode 100644 index 0000000000000..e3f30baf316be --- /dev/null +++ b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.620 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml new file mode 100644 index 0000000000000..629e3721fdf44 --- /dev/null +++ b/.github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml @@ -0,0 +1,11 @@ +# bash ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b 32 -l 250 -f 5 -t 4 +model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.616 + - name: "exact_match,flexible-extract" + value: 0.628 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml b/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml new file mode 100644 index 0000000000000..443db66c9adc6 --- /dev/null +++ b/.github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m microsoft/Phi-3-medium-4k-instruct -b 16 -l 250 -f 5 +model_name: "microsoft/Phi-3-medium-4k-instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.840 + - name: "exact_match,flexible-extract" + value: 0.852 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml new file mode 100644 index 0000000000000..a46aa16f0bcd4 --- /dev/null +++ b/.github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b 32 -l 250 -f 5 +model_name: "Qwen/Qwen2-57B-A14B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.736 + - name: "exact_match,flexible-extract" + value: 0.800 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml new file mode 100644 index 0000000000000..fe5a2c0af1e4a --- /dev/null +++ b/.github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-72B-Instruct -b 16 -l 250 -f 5 +model_name: "Qwen/Qwen2-72B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: 
"exact_match,strict-match" + value: 0.828 + - name: "exact_match,flexible-extract" + value: 0.856 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml b/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml new file mode 100644 index 0000000000000..5bf60816dac8f --- /dev/null +++ b/.github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-7B-Instruct -b 32 -l 250 -f 5 +model_name: "Qwen/Qwen2-7B-Instruct" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.680 + - name: "exact_match,flexible-extract" + value: 0.756 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/models/gemma-7b-it.yaml b/.github/lm-eval-configs/models/gemma-7b-it.yaml new file mode 100644 index 0000000000000..0b3813d240add --- /dev/null +++ b/.github/lm-eval-configs/models/gemma-7b-it.yaml @@ -0,0 +1,11 @@ +# ./nm-run-lm-eval-gsm-hf-baseline.sh -m google/gemma-7b-it -b 16 -l 250 -f 5 +model_name: "google/gemma-7b-it" +tasks: +- name: "gsm8k" + metrics: + - name: "exact_match,strict-match" + value: 0.284 + - name: "exact_match,flexible-extract" + value: 0.324 +limit: 250 +num_fewshot: 5 diff --git a/.github/lm-eval-configs/smoke-large-models.txt b/.github/lm-eval-configs/smoke-large-models.txt new file mode 100644 index 0000000000000..127ec5d97bcff --- /dev/null +++ b/.github/lm-eval-configs/smoke-large-models.txt @@ -0,0 +1,2 @@ +Meta-Llama-3-70B-Instruct.yaml +Mixtral-8x7B-Instruct-v0.1.yaml diff --git a/.github/lm-eval-configs/smoke-small-models.txt b/.github/lm-eval-configs/smoke-small-models.txt new file mode 100644 index 0000000000000..d884f36672a74 --- /dev/null +++ b/.github/lm-eval-configs/smoke-small-models.txt @@ -0,0 +1 @@ +Meta-Llama-3-8B-Instruct.yaml diff --git a/.github/scripts/lm_eval_compare_hf_vs_vllm.py b/.github/scripts/lm_eval_compare_hf_vs_vllm.py deleted file mode 100644 index d8e256631e9a7..0000000000000 --- a/.github/scripts/lm_eval_compare_hf_vs_vllm.py +++ /dev/null @@ -1,125 +0,0 @@ -import argparse -import os -from typing import Dict, List, Tuple - -import lm_eval -import lm_eval.models.utils -import numpy as np -import scipy.stats - -os.environ["TOKENIZERS_PARALLELISM"] = "false" - - -def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]: - acc1, acc2 = res1["acc,none"], res2["acc,none"] - st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"] - Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2)) - # Determining the p-value - p_value = 2 * scipy.stats.norm.sf(abs(Z)) # two-tailed test - return Z, p_value - - -def print_results(data_to_print: List = None, - results_dict: Dict = None, - alpha: float = None): - model1_data, model2_data = data_to_print - for task in model1_data: - print(f"Task: {task}") - print(f"HF Accuracy: {model1_data[task]['acc,none']}") - print(f"vLLM Accuracy: {model2_data[task]['acc,none']}") - print(f"HF StdErr: {model1_data[task]['acc_stderr,none']}") - print(f"vLLM StdErr: {model2_data[task]['acc_stderr,none']}") - z = results_dict[task]["z"] - p_value = results_dict[task]["p_value"] - result = "PASS" if p_value > alpha else "FAIL" - print(f"Z-Score: {z}, P-Value: {p_value}, p > {alpha}: {result}\n") - - -def check_passing_score(results_dict: Dict = None, - alpha: float = None) -> bool: - for task in results_dict: - p_value = results_dict[task]["p_value"] - if p_value <= alpha: - return False - return True - - -def parse_args(): - parser = argparse.ArgumentParser() - 
parser.add_argument("--hf_pretrained", - default="EleutherAI/pythia-70m", - help="name of model to compare as baseline") - parser.add_argument("--vllm_pretrained", - default="EleutherAI/pythia-70m", - help="name of model to compare as difference") - parser.add_argument("--hf_args", - help="huggingface model args =", - default="") - parser.add_argument("--vllm_args", - help="vllm model args =", - default="") - parser.add_argument("--tasks", type=str, default="arc_easy,hellaswag") - parser.add_argument( - "--limit", - type=float, - default=100, - ) - parser.add_argument( - "--alpha", - type=float, - default=0.05, - help="Significance level for two-tailed z-test", - ) - parser.add_argument( - "--device", - type=str, - default="cuda", - ) - parser.add_argument( - "--batch", - type=str, - default=4, - ) - parser.add_argument( - "--verbosity", - type=str, - default="INFO", - help="Logging verbosity", - ) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - tasks = args.tasks.split(",") - print("Tasks:", tasks) - hf_args, vllm_args = "," + args.hf_args, "," + args.vllm_args - results_hf = lm_eval.simple_evaluate( - model="hf", - model_args=f"pretrained={args.hf_pretrained}" + hf_args, - tasks=tasks, - limit=args.limit, - device=args.device, - batch_size=args.batch, - ) - lm_eval.models.utils.clear_torch_cache() - print("Memory stats cleared") - results_vllm = lm_eval.simple_evaluate( - model="vllm", - model_args=f"pretrained={args.vllm_pretrained}" + vllm_args, - tasks=tasks, - limit=args.limit, - device=args.device, - batch_size=args.batch, - ) - all_res = {} - for task1, task2 in zip(results_hf["results"].items(), - results_vllm["results"].items()): - assert task1[0] == task2[0] - z, p_value = calculate_z_value(task1[1], task2[1]) - all_res[task1[0]] = {"z": z, "p_value": p_value} - print_results([results_hf["results"], results_vllm["results"]], all_res, - args.alpha) - if not check_passing_score(all_res, args.alpha): - print("Accuracy test failed!") - exit(1) diff --git a/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh new file mode 100755 index 0000000000000..fdb8ec5393b36 --- /dev/null +++ b/.github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo +} + +while getopts "m:b:l:f:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +lm_eval --model hf \ + --model_args pretrained=$MODEL,parallelize=True \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh b/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh new file mode 100644 index 0000000000000..d6b38752945ce --- /dev/null +++ b/.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for vllm. +# We use this for fp8, which HF does not support. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using huggingface transformers." + echo "This pathway is intended to be used to create baselines for " + echo "our automated nm-test-accuracy workflow" + echo + echo "usage: ${0} " + echo + echo " -m - huggingface stub or local directory of the model" + echo " -b - batch size to run the evaluation at" + echo " -l - limit number of samples to run" + echo " -f - number of fewshot samples to use" + echo " -t - tensor parallel size to run at" + echo +} + +while getopts "m:b:l:f:t:" OPT; do + case ${OPT} in + m ) + MODEL="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + l ) + LIMIT="$OPTARG" + ;; + f ) + FEWSHOT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +lm_eval --model vllm \ + --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \ + --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ + --batch_size $BATCH_SIZE diff --git a/.github/scripts/nm-run-lm-eval-vllm.sh b/.github/scripts/nm-run-lm-eval-vllm.sh new file mode 100755 index 0000000000000..d0702a086d911 --- /dev/null +++ b/.github/scripts/nm-run-lm-eval-vllm.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# We can use this script to compute baseline accuracy on GSM for transformers. +# +# Make sure you have lm-eval-harness installed: +# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10 + +usage() { + echo`` + echo "Runs lm eval harness on GSM8k using vllm server and compares to " + echo "precomputed baseline (measured by HF transformers.)" + echo + echo "This script should be run from the /nm-vllm directory" + echo + echo "usage: ${0} " + echo + echo " -c - path to the test data config (e.g. .github/lm-eval-configs/small-models-smoke.txt)" + echo +} + +SUCCESS=0 + +while getopts "c:t:" OPT; do + case ${OPT} in + c ) + CONFIG="$OPTARG" + ;; + \? ) + usage + exit 1 + ;; + esac +done + +# Parse list of configs. +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG + +for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" +do + LOCAL_SUCCESS=0 + + echo "=== RUNNING MODEL: $MODEL_CONFIG ===" + + MODEL_CONFIG_PATH=$PWD/.github/lm-eval-configs/models/${MODEL_CONFIG} + LM_EVAL_TEST_DATA_FILE=$MODEL_CONFIG_PATH pytest -s tests/accuracy/test_lm_eval_correctness.py || LOCAL_SUCCESS=$? 
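# Illustrative aside, not taken from this patch: each model YAML under
# .github/lm-eval-configs/models records in its header comment the command for (re)producing
# its baseline numbers. For example, per the comment in Meta-Llama-3-8B-Instruct-FP8.yaml
# added earlier in this diff (FP8 uses the vllm baseline script, since HF does not support fp8):
bash ./.github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh \
    -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1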
+ + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED MODEL: ${MODEL_CONFIG} ===" + else + echo "=== FAILED MODEL: ${MODEL_CONFIG} ===" + fi + + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + +done + +if [ "${SUCCESS}" -eq "0" ]; then + exit 0 +else + exit 1 +fi diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests index e64ea401b16ce..bed217a40fe25 100755 --- a/.github/scripts/run-tests +++ b/.github/scripts/run-tests @@ -8,7 +8,6 @@ usage() { echo " -s - src directory, i.e. location of package *.py files." echo " -t - test directory, i.e. location of *.py test files. (default 'tests/')" echo " -r - desired results base directory. xml results will mirror provided tests directory structure. (default 'test-results/')" - echo " -f - file with test skip list, e.g. ' neuralmagic/tests/skip-for-remote-push.txt'. (default is to run all found tests)" echo " -h - this list of options" echo echo "note: all paths are relative to 'nm-vllm' root" @@ -35,9 +34,6 @@ while getopts "hs:t:r:f:" OPT; do r) RESULTS_DIR="${OPTARG}" ;; - f) - SKIP_LIST="${OPTARG}" - ;; esac done @@ -71,36 +67,6 @@ for FOUND in "${TESTS_FOUND[@]}"; do echo "${FOUND}" done -# build the skip list from provided file -declare -a TESTS_TO_EXCLUDE -if [ -f "${SKIP_LIST}" ]; then - while IFS= read -r line - do - TESTS_TO_EXCLUDE+=("${line}") - done < "${SKIP_LIST}" -fi - -echo "..." -for EXCLUDE in "${TESTS_TO_EXCLUDE[@]}"; do - for JJ in "${!TESTS_FOUND[@]}"; do - if [[ ${TESTS_FOUND[$JJ]} = ${EXCLUDE} ]]; then - echo "excluding: ${EXCLUDE}" - unset 'TESTS_FOUND[$JJ]' - fi - done -done - -echo "..." -echo "planning to run:" -for TEST in "${TESTS_FOUND[@]}" -do - echo "${TEST}" -done -echo "..." - -# download required artifacts for testing -# (cd ${TEST_DIR} && sudo bash ../.buildkite/download-images.sh) - # run selected tests SUCCESS=0 CC_PYTEST_FLAGS="--cov=${SRC_DIR} --cov=${TEST_DIR} --cov-report=html:cc-vllm-html --cov-append" @@ -109,12 +75,19 @@ do LOCAL_SUCCESS=0 RESULT_XML=$(echo ${TEST} | sed -e "s/${TEST_DIR}/${RESULTS_DIR}/" | sed -e "s/.py/.xml/") + # report which test is being run + # (in CI, if a test hangs, this logs *which* test is running *before* it hangs) + echo "=== RUNNING TEST: ${TEST} ===" + # this is a bit messy and brittle, but certain tests # need to be run with specific options if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? elif [[ "${TEST}" == *"distributed/test_same_node"* ]]; then VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 ${TEST} || LOCAL_SUCCESS=$? + elif [[ "${TEST}" == *"distributed/test_multimodal_broadcast.py"* ]]; then + TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? + TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? elif [[ "${TEST}" == *"distributed"* ]]; then CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then @@ -125,7 +98,18 @@ do pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$? 
fi - SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + # if a file gets exit code 0, we are good + if [[ $LOCAL_SUCCESS == 0 ]]; then + echo "=== PASSED TEST: ${TEST} ===" + # if a file does not run any tests, pytest reports exit code of 5 + # since we skip full modules in our skipping strategy, this is common + elif [[ $LOCAL_SUCCESS == 5 ]]; then + echo "=== SKIPPED TEST: ${TEST} ===" + # otherwise, report failure + else + echo "=== FAILED TEST: ${TEST} ===" + SUCCESS=$((SUCCESS + LOCAL_SUCCESS)) + fi done diff --git a/.github/scripts/step-status b/.github/scripts/step-status index b07f17517be2b..4739dcaa2223a 100755 --- a/.github/scripts/step-status +++ b/.github/scripts/step-status @@ -5,7 +5,7 @@ STEP_STATUS=${1} -if [ $STEP_STATUS -eq 0 ]; then +if [ "$STEP_STATUS" -eq 0 ]; then # green check echo -e "\xE2\x9C\x85" else diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 22e6c2ef0101e..62f0dbcd93eff 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -47,5 +47,5 @@ jobs: mypy vllm/model_executor --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml mypy vllm/logging --config-file pyproject.toml - mypy vllm/model_executor --config-file pyproject.toml + mypy tests --config-file pyproject.toml diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml index bac8133fd5c97..9ab70c84a357c 100644 --- a/.github/workflows/nm-benchmark.yml +++ b/.github/workflows/nm-benchmark.yml @@ -29,7 +29,7 @@ on: required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: string + type: boolean required: true # makes workflow manually callable @@ -61,11 +61,8 @@ on: required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' + type: boolean + default: false env: BENCHMARK_RESULTS: /model-cache/benchmark_results @@ -109,11 +106,9 @@ jobs: Gi_per_thread: 1 nvcc_threads: 0 - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml index 7847043da5ed1..0516679b5cf7e 100644 --- a/.github/workflows/nm-build-test.yml +++ b/.github/workflows/nm-build-test.yml @@ -1,23 +1,29 @@ name: nm build-test on: + # makes workflow reusable workflow_call: inputs: wf_category: - description: "categories: REMOTE, NIGHTLY, WEEKLY, RELEASE" + description: "workflow category: REMOTE, NIGHTLY, RELEASE" type: string default: "REMOTE" + push_to_pypi: + description: "When set to true, built wheels and tar.gz will be pushed to neuralmagic pypi if all tests pass" + type: boolean + default: false python: description: "python version, e.g. 
3.10.12" type: string required: true + # build related parameters build_label: description: "requested runner label (specifies instance)" type: string default: gcp-k8s-build build_timeout: - description: "time limit for build in minutes " + description: "time limit for build in minutes" type: string default: "120" Gi_per_thread: @@ -28,27 +34,29 @@ on: description: "number of threads nvcc build threads" type: string default: "8" + # test related parameters - test_label_solo: - description: "requested runner label (specifies instance)" - type: string - required: true - test_label_multi: - description: "requested runner label (specifies instance)" + + # stringified Json array of maps + # each map has a "python", "gha label", "test skip env vars" e.g. + # [ + # {'python':'3.8.17','label':'gcp-k8s-l4-solo','test':'neuralmagic/tests/test_skip_env_vars/smoke.txt'}, + # ... + # ] + test_configs: + description: "python, label, skip envs" type: string required: true + test_timeout: - description: "time limit for test run in minutes " + description: "time limit for test run in minutes" type: string required: true gitref: description: "git commit hash or branch name" type: string required: true - test_skip_env_vars: - description: 'file with list of env vars controlling which tests to run' - type: string - required: true + # benchmark related parameters benchmark_label: description: "requested benchmark label (specifies instance)" @@ -63,13 +71,40 @@ on: type: string default: "720" push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" + description: "when set to true, the workflow pushes all benchmarking results to gh-pages UI" + type: boolean + default: false + + # lm-eval related parameters + lm_eval_label: + description: "requested runner label (specifies instance)" + type: string + default: "" + lm_eval_timeout: + description: "time limit for lm_eval in minutes" type: string - default: "false" + default: "60" + lm_eval_configuration: + description: "configuration for lm-eval test (see .github/lm-eval-configs)" + type: string + default: "" jobs: + JSON-VALIDATE: + runs-on: gcp-k8s-util + strategy: + matrix: + test_config: ${{ fromJson(inputs.test_configs) }} + steps: + - name: validate test config + run: | + echo "python: ${{ matrix.test_config.python }}" + echo "label: ${{ matrix.test_config.label }}" + echo "tests: ${{ matrix.test_config.test }}" + BUILD: + needs: [JSON-VALIDATE] uses: ./.github/workflows/nm-build.yml with: wf_category: ${{ inputs.wf_category }} @@ -81,42 +116,21 @@ jobs: python: ${{ inputs.python }} secrets: inherit - TEST-SOLO: + TEST: needs: [BUILD] if: success() + strategy: + fail-fast: false + matrix: + test_config: ${{ fromJson(inputs.test_configs) }} uses: ./.github/workflows/nm-test.yml with: - test_label: ${{ inputs.test_label_solo }} + test_label: ${{ matrix.test_config.label }} timeout: ${{ inputs.test_timeout }} gitref: ${{ github.ref }} - python: ${{ inputs.python }} + python: ${{ matrix.test_config.python }} whl: ${{ needs.BUILD.outputs.whl }} - test_skip_env_vars: ${{ inputs.test_skip_env_vars }} - secrets: inherit - - # TODO: re-enable - # TEST-MULTI: - # needs: [BUILD] - # if: success() && contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) - # uses: ./.github/workflows/nm-test.yml - # with: - # test_label: ${{ inputs.test_label_multi }} - # timeout: ${{ inputs.test_timeout }} - # gitref: ${{ github.ref }} - # python: ${{ inputs.python }} - # whl: ${{ 
needs.BUILD.outputs.whl }} - # test_skip_env_vars: ${{ inputs.test_skip_env_vars }} - # secrets: inherit - - UPLOAD: - needs: [TEST-SOLO] - if: contains(fromJSON('["NIGHTLY", "WEEKLY", "RELEASE"]'), inputs.wf_category) - uses: ./.github/workflows/nm-upload-assets-to-gcp.yml - with: - label: ${{ inputs.build_label }} - timeout: ${{ inputs.build_timeout }} - gitref: ${{ github.ref }} - python: ${{ inputs.python }} + test_skip_env_vars: ${{ matrix.test_config.test }} secrets: inherit BENCHMARK: @@ -134,16 +148,37 @@ jobs: push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" secrets: inherit - TEST-ACCURACY-FULL: + LM-EVAL: needs: [BUILD] - if: ${{ inputs.wf_category == 'WEEKLY' || inputs.wf_category == 'RELEASE' }} - uses: ./.github/workflows/nm-test-accuracy-full.yml + uses: ./.github/workflows/nm-lm-eval.yml with: - label: ${{ inputs.test_label_multi }} - timeout: ${{ inputs.benchmark_timeout }} + label: ${{ inputs.lm_eval_label }} + timeout: ${{ inputs.lm_eval_timeout }} gitref: ${{ inputs.gitref }} - Gi_per_thread: ${{ inputs.Gi_per_thread }} - nvcc_threads: ${{ inputs.nvcc_threads }} python: ${{ inputs.python }} whl: ${{ needs.BUILD.outputs.whl }} + lm_eval_configuration: ${{ inputs.lm_eval_configuration }} secrets: inherit + + # uploading is only available when using GCP autoscaling group + UPLOAD: + needs: [TEST, BENCHMARK, LM-EVAL] + if: ${{ inputs.push_to_pypi }} + uses: ./.github/workflows/nm-upload-assets-to-gcp.yml + with: + label: gcp-k8s-util + timeout: ${{ inputs.build_timeout }} + gitref: ${{ github.ref }} + secrets: inherit + + # update docker + DOCKER: + needs: [BUILD] + if: ${{ inputs.wf_category != 'REMOTE' }} + uses: ./.github/workflows/publish-docker.yml + with: + push_to_repository: ${{ inputs.push_to_pypi }} + gitref: ${{ inputs.gitref }} + wf_category: ${{ inputs.wf_category }} + whl: ${{ needs.BUILD.outputs.whl }} + secrets: inherit diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml index 077d0a147f24c..10173813ac4b4 100644 --- a/.github/workflows/nm-build.yml +++ b/.github/workflows/nm-build.yml @@ -31,6 +31,10 @@ on: description: "python version, e.g. 
3.10.12" type: string required: true + outputs: + whl: + description: 'basename for generated whl' + value: ${{ jobs.BUILD.outputs.whl }} # makes workflow manually callable workflow_dispatch: @@ -73,7 +77,13 @@ jobs: runs-on: ${{ inputs.build_label }} timeout-minutes: ${{ fromJson(inputs.timeout) }} + + permissions: + contents: 'read' + id-token: 'write' + outputs: + run_id: ${{ github.run_id }} whl: ${{ steps.build.outputs.whl }} tarfile: ${{ steps.build.outputs.tarfile }} @@ -98,7 +108,7 @@ jobs: - name: set python id: set_python - uses: ./.github/actions/nm-set-python/ + uses: neuralmagic/nm-actions/actions/set-python@main with: python: ${{ inputs.python }} venv: ${{ env.VENV_BASE }} @@ -118,7 +128,27 @@ jobs: with: python: ${{ inputs.python }} venv: ${{ env.VENV_BASE }} - pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }} + + # GCP + - name: 'Authenticate to Google Cloud' + id: auth + uses: google-github-actions/auth@v2.1.3 + with: + project_id: ${{ secrets.GCP_PROJECT }} + workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }} + service_account: ${{ secrets.NM_PYPI_SA }} + + - name: 'Set up Cloud SDK' + uses: 'google-github-actions/setup-gcloud@v2' + with: + version: '>= 473.0.0' + + - name: copy whl and source distribution + run: | + # echo "whl: ${{ steps.build.outputs.whl }}" + # echo "tarfile: ${{ steps.build.outputs.tarfile }}" + gcloud storage cp dist/${{ steps.build.outputs.whl }} gs://neuralmagic-public-pypi/assets/${{ github.run_id }}/${{ steps.build.outputs.whl }} + gcloud storage cp dist/${{ steps.build.outputs.tarfile }} gs://neuralmagic-public-pypi/assets/${{ github.run_id }}/${{ steps.build.outputs.tarfile }} - name: upload whl uses: actions/upload-artifact@v4 @@ -126,7 +156,7 @@ jobs: with: name: ${{ steps.build.outputs.whl }} path: dist/${{ steps.build.outputs.whl }} - retention-days: 15 + retention-days: 5 - name: upload tar.gz uses: actions/upload-artifact@v4 @@ -144,7 +174,6 @@ jobs: gitref: ${{ inputs.gitref }} testmo_run_url: https://neuralmagic.testmo.net/automation/runs/view/${{ steps.create_testmo_run.outputs.id }} python: ${{ steps.set_python.outputs.version }} - build_status: ${{ steps.build.outputs.build_status }} whl_status: ${{ steps.build.outputs.whl_status }} - name: run status @@ -154,9 +183,7 @@ jobs: BUILD_STATUS: ${{ steps.build.outputs.build_status }} WHL_STATUS: ${{ steps.build.outputs.whl_status }} run: | - echo "build status: ${BUILD_STATUS}" echo "build status: ${WHL_STATUS}" - if [ -z "${BUILD_STATUS}" ] || [ "${BUILD_STATUS}" -ne "0" ]; then exit 1; fi if [ -z "${WHL_STATUS}" ] || [ "${WHL_STATUS}" -ne "0" ]; then exit 1; fi - name: complete testmo run diff --git a/.github/workflows/nm-test-accuracy-full.yml b/.github/workflows/nm-lm-eval.yml similarity index 80% rename from .github/workflows/nm-test-accuracy-full.yml rename to .github/workflows/nm-lm-eval.yml index ae3ebee62203e..4ffeb76ef9f15 100644 --- a/.github/workflows/nm-test-accuracy-full.yml +++ b/.github/workflows/nm-lm-eval.yml @@ -15,14 +15,6 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 
3.10.12" type: string @@ -31,6 +23,10 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true + lm_eval_configuration: + description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)' + type: string + required: true # makes workflow manually callable workflow_dispatch: @@ -47,14 +43,6 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 3.10.12" type: string @@ -63,9 +51,13 @@ on: description: "whl to test (variable appears late binding so unusable outside 'download artifact')" type: string required: true + lm_eval_configuration: + description: 'file containing tests configuration (see: nm-vllm/neuralmagic/lm-eval)' + type: string + required: true jobs: - TEST-ACCURACY-FULL: + LM-EVAL: runs-on: ${{ inputs.label }} timeout-minutes: ${{ fromJSON(inputs.timeout) }} @@ -77,6 +69,12 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ inputs.python }} + + - name: install automation components + run: | + sudo apt-get update --fix-missing + sudo apt-get install -y git-all + sudo apt-get install -y curl - name: checkout repository code uses: actions/checkout@v4 @@ -93,11 +91,9 @@ jobs: Gi_per_thread: ${{ inputs.Gi_per_thread }} nvcc_threads: ${{ inputs.nvcc_threads }} - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download @@ -114,7 +110,8 @@ jobs: venv: - name: run lm-eval-accuracy - uses: ./.github/actions/nm-lm-eval-accuracy/ + uses: ./.github/actions/nm-lm-eval/ with: python: ${{ inputs.python }} venv: + lm_eval_configuration: ${{ inputs.lm_eval_configuration }} diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml index fecd49c190c46..434f2b9032b1b 100644 --- a/.github/workflows/nm-nightly.yml +++ b/.github/workflows/nm-nightly.yml @@ -3,89 +3,48 @@ run-name: ${{ github.actor }} triggered nightly on ${{ github.ref }} on: schedule: # * is a special character in YAML so you have to quote this string - - cron: '0 1 * * 1-6' # nightly run (Mon-Sat) + - cron: '0 1 * * *' # nightly run workflow_dispatch: inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI " + wf_category: + description: "workflow category, default is NIGHTLY" type: choice options: - - 'true' - - 'false' - default: 'false' + - NIGHTLY + - RELEASE + default: NIGHTLY + push_to_pypi: + description: "when set and tests pass, then '.whl' and '.tar.gz' will be pushed to neuralmagic pypi" + type: boolean + default: false + push_benchmark_results_to_gh_pages: + description: "when set, then all benchmarking results are published to gh-pages UI " + type: boolean + default: false jobs: - PYTHON-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.8.17 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_list: neuralmagic/tests/skip-for-nightly.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - 
benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - secrets: inherit - - PYTHON-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.9.17 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - secrets: inherit - - PYTHON-3-10: + NIGHTLY: uses: ./.github/workflows/nm-build-test.yml with: - wf_category: NIGHTLY + wf_category: ${{ inputs.wf_category || 'NIGHTLY' }} python: 3.10.12 gitref: ${{ github.ref }} + push_to_pypi: ${{ github.event_name == 'schedule' || inputs.push_to_pypi }} - test_label_solo: aws-avx2-32G-a10g-24G - test_label_multi: ignore + test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.10.12","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}, + {"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}]' test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 + benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt + benchmark_timeout: 480 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - secrets: inherit - - PYTHON-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: NIGHTLY - python: 3.11.4 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt + lm_eval_timeout: 60 secrets: inherit diff --git a/.github/workflows/nm-release.yml b/.github/workflows/nm-release.yml deleted file mode 100644 index f5c9056cbc5d7..0000000000000 --- a/.github/workflows/nm-release.yml +++ /dev/null @@ -1,86 +0,0 @@ -name: nm release -run-name: ${{ github.actor }} verifying branch '${{ github.ref }}' -on: - workflow_dispatch: - inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' - -jobs: - - PYTHON-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.8.17 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - 
benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - secrets: inherit - - PYTHON-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.9.17 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - secrets: inherit - - PYTHON-3-10: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.10.12 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - secrets: inherit - - PYTHON-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: 'RELEASE' - python: 3.11.4 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 720 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: ${{ inputs.push_benchmark_results_to_gh_pages }} - secrets: inherit diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml index 3c1fe246756a4..a44274d9e8a11 100644 --- a/.github/workflows/nm-remote-push.yml +++ b/.github/workflows/nm-remote-push.yml @@ -12,66 +12,24 @@ concurrency: jobs: - BUILD-TEST-3-8: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.8.17 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - secrets: inherit - - BUILD-TEST-3-9: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.9.17 - gitref: ${{ github.ref }} - - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 - secrets: inherit - - BUILD-TEST-3-10: + REMOTE: uses: ./.github/workflows/nm-build-test.yml with: python: 3.10.12 gitref: ${{ github.ref }} + push_to_pypi: false - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore + test_configs: '[{"python":"3.8.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.9.17","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + {"python":"3.10.12","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}, + 
{"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}]' test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt benchmark_timeout: 480 - secrets: inherit - - BUILD-TEST-3-11: - uses: ./.github/workflows/nm-build-test.yml - with: - python: 3.11.4 - gitref: ${{ github.ref }} - test_label_solo: gcp-k8s-l4-solo - test_label_multi: ignore - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/smoke.txt - - benchmark_label: gcp-k8s-l4-solo - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - benchmark_timeout: 480 + lm_eval_label: gcp-k8s-l4-solo + lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt + lm_eval_timeout: 60 secrets: inherit diff --git a/.github/workflows/nm-test.yml b/.github/workflows/nm-test.yml index 87860bcc356bb..01d6fa96730d3 100644 --- a/.github/workflows/nm-test.yml +++ b/.github/workflows/nm-test.yml @@ -4,7 +4,7 @@ on: workflow_call: inputs: test_label: - description: "requested runner label (specifies instance)" + description: "requested runner label" type: string required: true timeout: @@ -32,7 +32,7 @@ on: workflow_dispatch: inputs: test_label: - description: "requested runner label (specifies instance)" + description: "requested runner label" type: string required: true timeout: @@ -94,7 +94,7 @@ jobs: nvcc_threads: 0 - name: install testmo - uses: ./.github/actions/nm-install-testmo/ + uses: neuralmagic/nm-actions/actions/install-testmo@main - name: create testmo run id: create_testmo_run @@ -109,11 +109,9 @@ jobs: id: verify_python uses: ./.github/actions/nm-verify-python/ - - name: hf cache - id: hf_cache - uses: ./.github/actions/nm-hf-cache/ - with: - fs_cache: ${{ secrets.HF_FS_CACHE }} + - name: caches + id: caches + uses: ./.github/actions/nm-caches/ - name: download whl id: download @@ -131,7 +129,7 @@ jobs: - name: run buildkite script run: | cd tests && sudo bash ../.buildkite/download-images.sh - + - name: setenv test skip id: setenv_test_skip uses: ./.github/actions/nm-set-env-test-skip diff --git a/.github/workflows/nm-upload-assets-to-gcp.yml b/.github/workflows/nm-upload-assets-to-gcp.yml index bfade2a90d2f9..8d8835271de0a 100644 --- a/.github/workflows/nm-upload-assets-to-gcp.yml +++ b/.github/workflows/nm-upload-assets-to-gcp.yml @@ -16,10 +16,6 @@ on: description: 'git commit hash or branch name' type: string required: true - python: - description: "python version, e.g. 
3.10.12" - type: string - required: true jobs: @@ -33,6 +29,12 @@ jobs: steps: + - name: install automation components + run: | + sudo apt-get update --fix-missing + sudo apt-get install -y git-all + sudo apt-get install -y curl + - name: checkout id: checkout uses: actions/checkout@v4 @@ -68,5 +70,3 @@ jobs: - name: cp assets id: cp-assets uses: ./.github/actions/nm-cp-assets/ - with: - python: ${{ inputs.python }} diff --git a/.github/workflows/nm-weekly.yml b/.github/workflows/nm-weekly.yml deleted file mode 100644 index d92a2619ef359..0000000000000 --- a/.github/workflows/nm-weekly.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: nm Weekly -run-name: ${{ github.actor }} triggered weekly on ${{ github.ref }} -on: - schedule: - # * is a special character in YAML so you have to quote this string - - cron: '0 1 * * 0' # weekly run (Sun) - - workflow_dispatch: - inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" - type: choice - options: - - 'true' - - 'false' - default: 'false' - -jobs: - - BUILD-TEST: - uses: ./.github/workflows/nm-build-test.yml - with: - wf_category: WEEKLY - python: 3.10.12 - gitref: ${{ github.ref }} - - test_label_solo: aws-avx2-32G-a10g-24G - test_label_multi: aws-avx2-192G-4-a10g-96G - test_timeout: 480 - test_skip_env_vars: neuralmagic/tests/test_skip_env_vars/full.txt - - benchmark_label: aws-avx2-32G-a10g-24G - benchmark_config_list_file: ./.github/data/nm_benchmark_weekly_configs_list.txt - benchmark_timeout: 720 - push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" - secrets: inherit diff --git a/.github/workflows/publish-docker.yml b/.github/workflows/publish-docker.yml index c6a54bb6d3c21..4492994d79197 100644 --- a/.github/workflows/publish-docker.yml +++ b/.github/workflows/publish-docker.yml @@ -1,28 +1,43 @@ name: Docker Build + Publish on: - workflow_dispatch: + workflow_call: inputs: - docker_tag: - description: "tag to be used for the docker image" + push_to_repository: + description: "whether to push out the docker image: false (default) or true" + type: boolean + default: false + gitref: + description: "git commit hash or branch name" + type: string + default: 'main' + wf_category: + description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE" + type: string + default: 'NIGHTLY' + whl: + description: "nm-vllm wheel to install for the docker image" type: string required: true + + workflow_dispatch: + inputs: push_to_repository: - description: "whether to push out the docker image: no (default) or yes" - type: string - default: 'no' + description: "whether to push out the docker image: false (default) or true" + type: boolean + default: false gitref: description: "git commit hash or branch name" type: string default: 'main' - build_type: + wf_category: description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE" type: string default: 'NIGHTLY' - build_version: - description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 
0.4.0, 0.4.0.20240531" + whl: + description: "nm-vllm wheel to install for the docker image" type: string - default: 'latest' + required: true jobs: build-docker-image: @@ -50,35 +65,36 @@ jobs: id: setup uses: ./.github/actions/nm-setup-nvidia-container-toolkit/ - - name: Get docker image extra tag - id: tag - uses: ./.github/actions/nm-get-docker-tag/ + - name: Get docker image tags + id: tags + uses: ./.github/actions/nm-get-docker-tags/ with: - build_type: ${{ inputs.build_type }} + wf_category: ${{ inputs.wf_category }} + whl: ${{ inputs.whl }} - name: Build image id: build uses: ./.github/actions/nm-build-docker/ with: - docker_tag: ${{ inputs.docker_tag }} - extra_tag: ${{ steps.tag.outputs.tag }} - build_type: ${{ inputs.build_type }} - build_version: ${{ inputs.build_version }} + docker_tag: ${{ steps.tags.outputs.tag }} + extra_tag: ${{ steps.tags.outputs.extra_tag }} + wf_category: ${{ inputs.wf_category }} + build_version: ${{ steps.tags.outputs.build_version }} - name: Push image uses: docker/build-push-action@v5 - if: ${{ inputs.push_to_repository == 'yes' && steps.build.outputs.status == 0 }} + if: ${{ inputs.push_to_repository && steps.build.outputs.status == 0 }} with: context: . target: vllm-openai push: true - tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} + tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ steps.tags.outputs.tag }} - name: Push image uses: docker/build-push-action@v5 - if: ${{ inputs.push_to_repository == 'yes' && steps.build.outputs.status == 0 }} + if: ${{ inputs.push_to_repository && steps.build.outputs.status == 0 }} with: context: . target: vllm-openai push: true - tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ steps.tag.outputs.tag }} + tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ steps.tags.outputs.extra_tag }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index e71033f828006..773def58fd966 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -25,7 +25,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2 + pip install ruff==0.1.5 codespell==2.3.0 tomli==2.0.1 isort==5.13.2 - name: Analysing the code with ruff run: | ruff . diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh index f8181c7758dbe..60a3978f9abd7 100644 --- a/.github/workflows/scripts/build.sh +++ b/.github/workflows/scripts/build.sh @@ -9,7 +9,7 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH # Install requirements $python_executable -m pip install wheel packaging -$python_executable -m pip install -r requirements-cuda.txt -r requirements-build.txt +$python_executable -m pip install -r requirements-cuda.txt # Limit the number of parallel jobs to avoid OOM export MAX_JOBS=1 diff --git a/.gitignore b/.gitignore index 06e8923ced311..ef8b4845a9176 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,8 @@ # This file has been modified by Neural Magic +# nm-vllm commit id, generated by setup.py +venv/commit_id.py + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/CMakeLists.txt b/CMakeLists.txt index 3812e96257f41..e04111efe110c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,8 @@ cmake_minimum_required(VERSION 3.21) project(vllm_extensions LANGUAGES CXX) -option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda") +# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... 
(used by setup.py) +set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") message(STATUS "Target device: ${VLLM_TARGET_DEVICE}") @@ -32,8 +33,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11 # versions are derived from Dockerfile.rocm # set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0") -set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1") -set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1") +set(TORCH_SUPPORTED_VERSION_ROCM "2.4.0") # # Try to find python package with an executable that exactly matches @@ -98,18 +98,11 @@ elseif(HIP_FOUND) # .hip extension automatically, HIP must be enabled explicitly. enable_language(HIP) - # ROCm 5.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} " - "expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.") - endif() - - # ROCm 6.x - if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND - NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X}) - message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} " - "expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.") + # ROCm 5.X and 6.X + if (ROCM_VERSION_DEV_MAJOR GREATER_EQUAL 5 AND + NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM}) + message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM} " + "expected for ROCm build, saw ${Torch_VERSION} instead.") endif() else() message(FATAL_ERROR "Can't find CUDA or HIP installation.") @@ -179,9 +172,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") "csrc/quantization/gptq_marlin/gptq_marlin.cu" "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu" "csrc/custom_all_reduce.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu" - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu") + "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu") # # The CUTLASS kernels for Hopper require sm90a to be enabled. @@ -189,7 +182,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # That adds an extra 17MB to compiled binary, so instead we selectively enable it. 
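Stepping back to the start of this CMakeLists.txt hunk: `VLLM_TARGET_DEVICE` is now a cached string rather than an `option()`, so, per its comment, it can be overridden on the command line or driven through `setup.py`. A minimal sketch of both paths, using device values that the Dockerfiles added elsewhere in this diff actually pass in; the exact invocations are illustrative, not part of the change:

```bash
# Illustrative ways to override the new VLLM_TARGET_DEVICE cache variable
cmake -DVLLM_TARGET_DEVICE=cpu ..                      # direct CMake override
VLLM_TARGET_DEVICE=cpu python3 setup.py install        # via setup.py, as Dockerfile.cpu does
VLLM_TARGET_DEVICE=openvino python3 -m pip install .   # as Dockerfile.openvino does
```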
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0) set_source_files_properties( - "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu" + "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu" PROPERTIES COMPILE_FLAGS "-gencode arch=compute_90a,code=sm_90a") diff --git a/Dockerfile b/Dockerfile index 9f741b0ac7e53..cc4976db8fc13 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,18 +5,35 @@ # docs/source/dev/dockerfile/dockerfile.rst and # docs/source/assets/dev/dockerfile-stages-dependency.png +ARG CUDA_VERSION=12.4.1 #################### BASE BUILD IMAGE #################### # prepare basic build environment -FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS base -RUN apt-get update -y && \ - apt-get install -y python3-pip git +ARG CUDA_VERSION=12.4.1 +ARG PYTHON_VERSION=3 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ + && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ + && apt-get update -y \ + && apt-get install -y ccache software-properties-common \ + && add-apt-repository ppa:deadsnakes/ppa \ + && apt-get update -y \ + && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv python3-pip \ + && if [ "${PYTHON_VERSION}" != "3" ]; then update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1; fi \ + && python3 --version \ + && python3 -m pip --version + +RUN apt-get update -y \ + && apt-get install -y python3-pip git curl sudo # Workaround for https://github.com/openai/triton/issues/2507 and # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ WORKDIR /workspace @@ -27,6 +44,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-cuda.txt # install development dependencies +COPY requirements-lint.txt requirements-lint.txt +COPY requirements-test.txt requirements-test.txt COPY requirements-dev.txt requirements-dev.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install -r requirements-dev.txt @@ -34,30 +53,19 @@ RUN --mount=type=cache,target=/root/.cache/pip \ #################### BASE BUILD IMAGE #################### #################### WHEEL BUILD IMAGE #################### -FROM dev AS build +FROM base AS build + +ARG PYTHON_VERSION=3 # install compiler cache to speed up compilation leveraging local or remote caching RUN apt-get update -y && apt-get install -y ccache #################### EXTENSION Build IMAGE #################### -#################### FLASH_ATTENTION Build IMAGE #################### -FROM dev as flash-attn-builder -# flash attention version -ARG flash_attn_version=v2.5.8 -ENV FLASH_ATTN_VERSION=${flash_attn_version} - -WORKDIR /usr/src/flash-attention-v2 - -# Download the wheel or build it if a pre-compiled release doesn't exist -RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \ - --no-build-isolation --no-deps --no-cache-dir - -#################### FLASH_ATTENTION Build IMAGE #################### - #################### vLLM installation IMAGE #################### # image with vLLM installed -FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base +FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base +ARG CUDA_VERSION=12.4.1 WORKDIR /vllm-workspace RUN apt-get update -y && \ @@ -67,7 +75,7 @@ RUN apt-get update -y && \ # https://github.com/pytorch/pytorch/issues/107960 -- hopefully # this won't be needed for future versions of this docker image # or future versions of triton. -RUN ldconfig /usr/local/cuda-12.4/compat/ +RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. 
-f1,2)/compat/ # install nm-vllm wheel first, so that torch etc will be installed ARG build_type="NIGHTLY" @@ -92,9 +100,6 @@ RUN --mount=type=bind,from=build \ fi; \ fi -RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \ - --mount=type=cache,target=/root/.cache/pip \ - pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir #################### vLLM installation IMAGE #################### #################### TEST IMAGE #################### @@ -122,7 +127,7 @@ FROM vllm-base AS vllm-openai # install additional dependencies for openai api server RUN --mount=type=cache,target=/root/.cache/pip \ - pip install accelerate hf_transfer modelscope + pip install accelerate hf_transfer 'modelscope!=1.15.0' ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/Dockerfile.cpu b/Dockerfile.cpu index 403a1cd0391b0..6e55203decc56 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -3,9 +3,13 @@ FROM ubuntu:22.04 AS cpu-test-1 RUN apt-get update -y \ - && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \ + && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 +RUN echo 'export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:$LD_PRELOAD' >> ~/.bashrc + +RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl + RUN pip install --upgrade pip \ && pip install wheel packaging ninja "setuptools>=49.4.0" numpy @@ -17,10 +21,14 @@ WORKDIR /workspace/vllm RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... +ARG VLLM_CPU_DISABLE_AVX512 +ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512} + RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install WORKDIR /workspace/ -RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks CMD ["/bin/bash"] diff --git a/Dockerfile.openvino b/Dockerfile.openvino new file mode 100644 index 0000000000000..9861997b451a9 --- /dev/null +++ b/Dockerfile.openvino @@ -0,0 +1,26 @@ +# The vLLM Dockerfile is used to construct vLLM image that can be directly used +# to run the OpenAI compatible server. 
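The header comment above says this new Dockerfile.openvino builds an image for the OpenAI-compatible server; since the image's `CMD` (at the end of the file below) is just `/bin/bash`, the server would presumably be started explicitly at run time. A hypothetical build-and-run sketch follows; the image tag, port, and model are placeholders, not taken from this diff:

```bash
# Assumed usage of the OpenVINO image; tag and model are illustrative
docker build -f Dockerfile.openvino -t vllm-openvino .
docker run -it --rm -p 8000:8000 vllm-openvino \
    python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m
```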
+ +FROM ubuntu:22.04 AS dev + +RUN apt-get update -y && \ + apt-get install -y python3-pip git +WORKDIR /workspace + +# copy requirements +COPY requirements-build.txt /workspace/vllm/ +COPY requirements-common.txt /workspace/vllm/ +COPY requirements-openvino.txt /workspace/vllm/ + +COPY vllm/ /workspace/vllm/vllm +COPY setup.py /workspace/vllm/ + +# install build requirements +RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt +# build vLLM with OpenVINO backend +RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/nightly/" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/ + +COPY examples/ /workspace/vllm/examples +COPY benchmarks/ /workspace/vllm/benchmarks + +CMD ["/bin/bash"] diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le new file mode 100644 index 0000000000000..d4e4c483cada8 --- /dev/null +++ b/Dockerfile.ppc64le @@ -0,0 +1,22 @@ +FROM mambaorg/micromamba +ARG MAMBA_DOCKERFILE_ACTIVATE=1 +USER root + +RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 + +# Some packages in requirements-cpu are installed here +# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba +# Currently these may not be available for venv or pip directly +RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +# These packages will be in rocketce eventually +RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing + +RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install + +WORKDIR /vllm-workspace +ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"] diff --git a/Dockerfile.rocm b/Dockerfile.rocm index 954958df88fc0..1b89b892bbf1c 100644 --- a/Dockerfile.rocm +++ b/Dockerfile.rocm @@ -1,35 +1,35 @@ -# default base image -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -FROM $BASE_IMAGE - -ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - -RUN echo "Base image is $BASE_IMAGE" - -# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" -# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" - - +# Default ROCm 6.1 base image +ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Tested and supported base rocm/pytorch images +ARG ROCm_5_7_BASE="rocm/pytorch:rocm5.7_ubuntu20.04_py3.9_pytorch_2.0.1" \ + ROCm_6_0_BASE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \ + ROCM_6_1_BASE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging" + +# Default ROCm ARCHes to build vLLM for. +ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100" + +# Whether to build CK-based flash-attention +# If 0, will not build flash attention +# This is useful for gfx target where flash-attention is not supported +# (i.e. those that do not appear in `FA_GFX_ARCHS`) +# Triton FA is used by default on ROCm now so this is unnecessary. 
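The comment block above notes that `BUILD_FA` gates the CK-based flash-attention build and can be turned off for gfx targets outside `FA_GFX_ARCHS`. A hedged sketch of such a build; the image tag is a placeholder, and the build args are the `ARG`s this Dockerfile declares:

```bash
# Hypothetical ROCm build for a gfx target without CK flash-attention support
docker build -f Dockerfile.rocm \
    --build-arg BUILD_FA=0 \
    --build-arg PYTORCH_ROCM_ARCH="gfx1100" \
    -t nm-vllm-rocm .
```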
+ARG BUILD_FA="1" ARG FA_GFX_ARCHS="gfx90a;gfx942" -RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS" - ARG FA_BRANCH="ae7928c" -RUN echo "FA_BRANCH is $FA_BRANCH" -# whether to build flash-attention -# if 0, will not build flash attention -# this is useful for gfx target where flash-attention is not supported -# In that case, we need to use the python reference attention implementation in vllm -ARG BUILD_FA="1" - -# whether to build triton on rocm +# Whether to build triton on rocm ARG BUILD_TRITON="1" +ARG TRITON_BRANCH="0ef1848" -# Install some basic utilities -RUN apt-get update && apt-get install python3 python3-pip -y +### Base image build stage +FROM $BASE_IMAGE AS base + +# Import arg(s) defined before this build stage +ARG PYTORCH_ROCM_ARCH # Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y RUN apt-get update && apt-get install -y \ curl \ ca-certificates \ @@ -40,76 +40,165 @@ RUN apt-get update && apt-get install -y \ build-essential \ wget \ unzip \ - nvidia-cuda-toolkit \ tmux \ + ccache \ && rm -rf /var/lib/apt/lists/* -### Mount Point ### -# When launching the container, mount the code directory to /app +# When launching the container, mount the code directory to /vllm-workspace ARG APP_MOUNT=/vllm-workspace -VOLUME [ ${APP_MOUNT} ] WORKDIR ${APP_MOUNT} -RUN python3 -m pip install --upgrade pip -RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas +RUN pip install --upgrade pip +# Remove sccache so it doesn't interfere with ccache +# TODO: implement sccache support across components +RUN apt-get purge -y sccache; pip uninstall -y sccache; rm -f "$(which sccache)" +# Install torch == 2.4.0 on ROCm +RUN case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm5.7;; \ + *"rocm-6.0"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.0;; \ + *"rocm-6.1"*) \ + pip uninstall -y torch torchaudio torchvision \ + && pip install --no-cache-dir --pre \ + torch==2.4.0.dev20240612 torchaudio==2.4.0.dev20240612 \ + torchvision==0.19.0.dev20240612 \ + --index-url https://download.pytorch.org/whl/nightly/rocm6.1;; \ + *) ;; esac ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: -# Install ROCm flash-attention -RUN if [ "$BUILD_FA" = "1" ]; then \ - mkdir libs \ +ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} +ENV CCACHE_DIR=/root/.cache/ccache + + +### AMD-SMI build stage +FROM base AS build_amdsmi +# Build amdsmi wheel always +RUN cd /opt/rocm/share/amd_smi \ + && pip wheel . 
--wheel-dir=/install + + +### Flash-Attention wheel build stage +FROM base AS build_fa +ARG BUILD_FA +ARG FA_GFX_ARCHS +ARG FA_BRANCH +# Build ROCm flash-attention wheel if `BUILD_FA = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_FA" = "1" ]; then \ + mkdir -p libs \ && cd libs \ && git clone https://github.com/ROCm/flash-attention.git \ && cd flash-attention \ - && git checkout ${FA_BRANCH} \ + && git checkout "${FA_BRANCH}" \ && git submodule update --init \ - && export GPU_ARCHS=${FA_GFX_ARCHS} \ - && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \ - patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \ - && python3 setup.py install \ - && cd ..; \ + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-5.7"*) \ + export VLLM_TORCH_PATH="$(python3 -c 'import torch; print(torch.__path__[0])')" \ + && patch "${VLLM_TORCH_PATH}"/utils/hipify/hipify_python.py hipify_patch.patch;; \ + *) ;; esac \ + && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. -# Manually removed it so that later steps of numpy upgrade can continue -RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \ - rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi -# build triton -RUN if [ "$BUILD_TRITON" = "1" ]; then \ +### Triton wheel build stage +FROM base AS build_triton +ARG BUILD_TRITON +ARG TRITON_BRANCH +# Build triton wheel if `BUILD_TRITON = 1` +RUN --mount=type=cache,target=${CCACHE_DIR} \ + if [ "$BUILD_TRITON" = "1" ]; then \ mkdir -p libs \ && cd libs \ - && pip uninstall -y triton \ - && git clone https://github.com/ROCm/triton.git \ - && cd triton/python \ - && pip3 install . \ - && cd ../..; \ + && git clone https://github.com/OpenAI/triton.git \ + && cd triton \ + && git checkout "${TRITON_BRANCH}" \ + && cd python \ + && python3 setup.py bdist_wheel --dist-dir=/install; \ + # Create an empty directory otherwise as later build stages expect one + else mkdir -p /install; \ fi -WORKDIR /vllm-workspace + +### Final vLLM build stage +FROM base AS final +# Import the vLLM development directory from the build context COPY . . -#RUN python3 -m pip install pynvml # to be removed eventually -RUN python3 -m pip install --upgrade pip numba +# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt. 
+# Manually remove it so that later steps of numpy upgrade can continue +RUN case "$(which python3)" in \ + *"/opt/conda/envs/py_3.9"*) \ + rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ + *) ;; esac + +# Package upgrades for useful functionality or to avoid dependency issues +RUN --mount=type=cache,target=/root/.cache/pip \ + pip install --upgrade numba scipy huggingface-hub[cli] -# make sure punica kernels are built (for LoRA) +# Make sure punica kernels are built (for LoRA) ENV VLLM_INSTALL_PUNICA_KERNELS=1 # Workaround for ray >= 2.10.0 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 +# Silences the HF Tokenizers warning +ENV TOKENIZERS_PARALLELISM=false -ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so - -RUN --mount=type=cache,target=/root/.cache/pip \ +RUN --mount=type=cache,target=${CCACHE_DIR} \ + --mount=type=cache,target=/root/.cache/pip \ pip install -U -r requirements-rocm.txt \ - && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \ - && python3 setup.py install \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \ - && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \ - && cd .. + && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \ + *"rocm-6.0"*) \ + patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch;; \ + *"rocm-6.1"*) \ + # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM + wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P rocm_patch \ + && cp rocm_patch/libamdhip64.so.6 /opt/rocm/lib/libamdhip64.so.6 \ + # Prevent interference if torch bundles its own HIP runtime + && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \ + *) ;; esac \ + && python3 setup.py clean --all \ + && python3 setup.py develop + +# Copy amdsmi wheel into final image +RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \ + mkdir -p libs \ + && cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y amdsmi; + +# Copy triton wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_triton,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y triton; fi + +# Copy flash-attn wheel(s) into final image if they were built +RUN --mount=type=bind,from=build_fa,src=/install,target=/install \ + mkdir -p libs \ + && if ls /install/*.whl; then \ + cp /install/*.whl libs \ + # Preemptively uninstall to avoid same-version no-installs + && pip uninstall -y flash-attn; fi +# Install wheels that were built to the final image +RUN --mount=type=cache,target=/root/.cache/pip \ + if ls libs/*.whl; then \ + pip install libs/*.whl; fi CMD ["/bin/bash"] diff --git a/Dockerfile.tpu b/Dockerfile.tpu new file mode 100644 index 0000000000000..931c844c08dce --- /dev/null +++ b/Dockerfile.tpu @@ -0,0 +1,19 @@ +ARG NIGHTLY_DATE="20240601" +ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:nightly_3.10_tpuvm_$NIGHTLY_DATE" + +FROM $BASE_IMAGE + +WORKDIR /workspace +COPY . /workspace/vllm + +ENV VLLM_TARGET_DEVICE="tpu" +# Install aiohttp separately to avoid build errors. +RUN pip install aiohttp +# Install the TPU and Pallas dependencies. 
+RUN pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html +RUN pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html + +# Build vLLM. +RUN cd /workspace/vllm && python setup.py develop + +CMD ["/bin/bash"] diff --git a/Dockerfile.xpu b/Dockerfile.xpu new file mode 100644 index 0000000000000..c39e551672d20 --- /dev/null +++ b/Dockerfile.xpu @@ -0,0 +1,22 @@ +FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04 + +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \ + echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \ + chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \ + rm /etc/apt/sources.list.d/intel-graphics.list && \ + wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \ + echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \ + chmod 644 /usr/share/keyrings/intel-graphics.gpg + +RUN apt-get update -y \ +&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip + +COPY ./ /workspace/vllm + +WORKDIR /workspace/vllm + +RUN pip install -v -r requirements-xpu.txt + +RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install + +CMD ["/bin/bash"] diff --git a/README.md b/README.md index 80567d5bdbfc2..5bdff29dfd159 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,20 @@ -

- [header image: tool icon / nm-vllm]

+# nm-vllm ## Overview -[vLLM](https://github.com/vllm-project/vllm) is a fast and easy-to-use library for LLM inference that Neural Magic regularly contributes upstream improvements to. This fork, `nm-vllm` is our opinionated focus on incorporating the latest LLM optimizations like quantization and sparsity for enhanced performance. +`nm-vllm` is our supported enterprise distribution of [vLLM](https://github.com/vllm-project/vllm). ## Installation -The [nm-vllm PyPi package](https://pypi.neuralmagic.com/simple/nm-vllm/index.html) includes pre-compiled binaries for CUDA (version 12.1) kernels, streamlining the setup process. For other PyTorch or CUDA versions, please compile the package from source. + +### PyPI +The [nm-vllm PyPi package](https://pypi.neuralmagic.com/simple/nm-vllm/index.html) includes pre-compiled binaries for CUDA (version 12.1) kernels. For other PyTorch or CUDA versions, please compile the package from source. Install it using pip: ```bash pip install nm-vllm --extra-index-url https://pypi.neuralmagic.com/simple ``` -For utilizing weight-sparsity kernels, such as through `sparsity="sparse_w16a16"`, you can extend the installation with the `sparsity` extras: +To utilize the weight sparsity features, include the optional `sparse` dependencies. ```bash pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple ``` @@ -24,111 +23,22 @@ You can also build and install `nm-vllm` from source (this will take ~10 minutes ```bash git clone https://github.com/neuralmagic/nm-vllm.git cd nm-vllm -pip install -e . +pip install -e .[sparse] --extra-index-url https://pypi.neuralmagic.com/simple ``` -## Quickstart - -Neural Magic maintains a variety of sparse models on our Hugging Face organization profiles, [neuralmagic](https://huggingface.co/neuralmagic) and [nm-testing](https://huggingface.co/nm-testing). - -A collection of ready-to-use SparseGPT and GPTQ models in inference optimized marlin format are [available on Hugging Face](https://huggingface.co/collections/neuralmagic/compressed-llms-for-nm-vllm-65e73e3d51d3200e34b77431) - -#### Model Inference with Marlin (4-bit Quantization) - -Marlin is an extremely optimized FP16xINT4 matmul kernel aimed at LLM inference that can deliver close to ideal (4x) speedups up to batchsizes of 16-32 tokens. -To use Marlin within nm-vllm, simply pass the Marlin quantized directly to the engine. It will detect the quantization from the model's config. 
- -Here is a demonstraiton with a [4-bit quantized OpenHermes Mistral](https://huggingface.co/neuralmagic/OpenHermes-2.5-Mistral-7B-marlin) model: - -```python -from vllm import LLM, SamplingParams -from transformers import AutoTokenizer - -model_id = "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin" -model = LLM(model_id, max_model_len=4096) -tokenizer = AutoTokenizer.from_pretrained(model_id) -sampling_params = SamplingParams(max_tokens=100, temperature=0.8, top_p=0.95) +### Docker -messages = [ - {"role": "user", "content": "What is synthetic data in machine learning?"}, -] -formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) -outputs = model.generate(formatted_prompt, sampling_params=sampling_params) -print(outputs[0].outputs[0].text) -``` - -#### Model Inference with Weight Sparsity - -For a quick demonstration, here's how to run a small [50% sparse llama2-110M](https://huggingface.co/nm-testing/llama2.c-stories110M-pruned50) model trained on storytelling: - -```python -from vllm import LLM, SamplingParams - -model = LLM( - "neuralmagic/llama2.c-stories110M-pruned50", - sparsity="sparse_w16a16", # If left off, model will be loaded as dense -) - -sampling_params = SamplingParams(max_tokens=100, temperature=0) -outputs = model.generate("Hello my name is", sampling_params=sampling_params) -print(outputs[0].outputs[0].text) -``` +The [`nm-vllm` container registry](https://github.com/neuralmagic/nm-vllm/pkgs/container/nm-vllm-openai) includes premade docker images. -Here is a more realistic example of running a 50% sparse OpenHermes 2.5 Mistral 7B model finetuned for instruction-following: +Launch the OpenAI-compatible server with: -```python -from vllm import LLM, SamplingParams -from transformers import AutoTokenizer - -model_id = "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50" -model = LLM(model_id, sparsity="sparse_w16a16", max_model_len=4096) -tokenizer = AutoTokenizer.from_pretrained(model_id) -sampling_params = SamplingParams(max_tokens=100, temperature=0.8, top_p=0.95) - -messages = [ - {"role": "user", "content": "What is sparsity in deep learning?"}, -] -formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) -outputs = model.generate(formatted_prompt, sampling_params=sampling_params) -print(outputs[0].outputs[0].text) -``` - -There is also support for semi-structured 2:4 sparsity using the `sparsity="semi_structured_sparse_w16a16"` argument: -```python -from vllm import LLM, SamplingParams - -model = LLM("neuralmagic/llama2.c-stories110M-pruned2.4", sparsity="semi_structured_sparse_w16a16") -sampling_params = SamplingParams(max_tokens=100, temperature=0) -outputs = model.generate("Once upon a time, ", sampling_params=sampling_params) -print(outputs[0].outputs[0].text) -``` - -#### Integration with OpenAI-Compatible Server - -You can also quickly use the same flow with an OpenAI-compatible model server: ```bash -python -m vllm.entrypoints.openai.api_server \ - --model neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50 \ - --sparsity sparse_w16a16 \ - --max-model-len 4096 +MODEL_ID=Qwen/Qwen2-0.5B-Instruct +docker run --gpus all --shm-size 2g ghcr.io/neuralmagic/nm-vllm-openai:latest --model $MODEL_ID ``` -## Quantized Inference Performance - -Developed in collaboration with IST-Austria, [GPTQ](https://arxiv.org/abs/2210.17323) is the leading quantization algorithm for LLMs, which enables compressing the model weights from 16 bits to 4 bits with limited impact on accuracy. 
nm-vllm includes support for the recently-developed Marlin kernels for accelerating GPTQ models. Prior to Marlin, the existing kernels for INT4 inference failed to scale in scenarios with multiple concurrent users. - -

- [figure: Marlin Performance]

- -## Sparse Inference Performance - -Developed in collaboration with IST-Austria, [SparseGPT](https://arxiv.org/abs/2301.00774) and [Sparse Fine-tuning](https://arxiv.org/abs/2310.06927) are the leading algorithms for pruning LLMs, which enables removing at least half of model weights with limited impact on accuracy. - -nm-vllm includes support for newly-developed sparse inference kernels, which provides both memory reduction and acceleration of sparse models leveraging sparsity. - -

- [figures: Sparse Memory Compression, Sparse Inference Performance]

+## Models +Neural Magic maintains a variety of optimized models on our Hugging Face organization profiles: +- [neuralmagic](https://huggingface.co/neuralmagic) +- [nm-testing](https://huggingface.co/nm-testing) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 58dcc6167efa6..fd2461ce58b93 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -4,10 +4,13 @@ import time import traceback from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Optional, Union import aiohttp +import huggingface_hub.constants from tqdm.asyncio import tqdm +from transformers import (AutoTokenizer, PreTrainedTokenizer, + PreTrainedTokenizerFast) AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @@ -68,9 +71,13 @@ async def async_request_tgi( chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue + chunk_bytes = chunk_bytes.decode("utf-8") - chunk = remove_prefix(chunk_bytes.decode("utf-8"), - "data:") + #NOTE: Sometimes TGI returns a ping response without + # any data, we should skip it. + if chunk_bytes.startswith(":"): + continue + chunk = remove_prefix(chunk_bytes, "data:") data = json.loads(chunk) timestamp = time.perf_counter() @@ -258,6 +265,9 @@ async def async_request_openai_completions( else: data = json.loads(chunk) + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated if data["choices"][0]["text"]: timestamp = time.perf_counter() # First token @@ -266,12 +276,8 @@ async def async_request_openai_completions( output.ttft = ttft # Decoding phase - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # do not want to include as inter-token-latency - elif data.get("usage", None) is None: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - + most_recent_timestamp) most_recent_timestamp = timestamp generated_text += data["choices"][0]["text"] @@ -384,6 +390,30 @@ def remove_prefix(text: str, prefix: str) -> str: return text +def get_model(pretrained_model_name_or_path: str): + if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true': + from modelscope import snapshot_download + else: + from huggingface_hub import snapshot_download + + model_path = snapshot_download( + model_id=pretrained_model_name_or_path, + local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE, + ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"]) + return model_path + + +def get_tokenizer( + pretrained_model_name_or_path: str, trust_remote_code: bool +) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: + if pretrained_model_name_or_path is not None and not os.path.exists( + pretrained_model_name_or_path): + pretrained_model_name_or_path = get_model( + pretrained_model_name_or_path) + return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, + trust_remote_code=trust_remote_code) + + ASYNC_REQUEST_FUNCS = { "tgi": async_request_tgi, "vllm": async_request_openai_completions, @@ -392,4 +422,5 @@ def remove_prefix(text: str, prefix: str) -> str: "openai": async_request_openai_completions, "openai-chat": async_request_openai_chat_completions, "tensorrt-llm": async_request_trt_llm, + "scalellm": async_request_openai_completions, } diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py index 1a41b66b38824..a46ee15817f4c 100644 --- a/benchmarks/benchmark_latency.py +++ 
b/benchmarks/benchmark_latency.py @@ -10,8 +10,10 @@ from tqdm import tqdm from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptStrictInputs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def main(args: argparse.Namespace): @@ -19,25 +21,32 @@ def main(args: argparse.Namespace): # NOTE(woosuk): If the request cannot be processed in a single batch, # the engine will automatically process the request in multiple batches. - llm = LLM(model=args.model, - speculative_model=args.speculative_model, - num_speculative_tokens=args.num_speculative_tokens, - tokenizer=args.tokenizer, - quantization=args.quantization, - tensor_parallel_size=args.tensor_parallel_size, - trust_remote_code=args.trust_remote_code, - dtype=args.dtype, - enforce_eager=args.enforce_eager, - kv_cache_dtype=args.kv_cache_dtype, - quantization_param_path=args.quantization_param_path, - device=args.device, - ray_workers_use_nsight=args.ray_workers_use_nsight, - use_v2_block_manager=args.use_v2_block_manager, - enable_chunked_prefill=args.enable_chunked_prefill, - download_dir=args.download_dir, - block_size=args.block_size, - gpu_memory_utilization=args.gpu_memory_utilization, - distributed_executor_backend=args.distributed_executor_backend) + llm = LLM( + model=args.model, + speculative_model=args.speculative_model, + num_speculative_tokens=args.num_speculative_tokens, + speculative_draft_tensor_parallel_size=\ + args.speculative_draft_tensor_parallel_size, + tokenizer=args.tokenizer, + quantization=args.quantization, + tensor_parallel_size=args.tensor_parallel_size, + trust_remote_code=args.trust_remote_code, + dtype=args.dtype, + max_model_len=args.max_model_len, + enforce_eager=args.enforce_eager, + kv_cache_dtype=args.kv_cache_dtype, + quantization_param_path=args.quantization_param_path, + device=args.device, + ray_workers_use_nsight=args.ray_workers_use_nsight, + use_v2_block_manager=args.use_v2_block_manager, + enable_chunked_prefill=args.enable_chunked_prefill, + download_dir=args.download_dir, + block_size=args.block_size, + gpu_memory_utilization=args.gpu_memory_utilization, + load_format=args.load_format, + distributed_executor_backend=args.distributed_executor_backend, + otlp_traces_endpoint=args.otlp_traces_endpoint, + ) sampling_params = SamplingParams( n=args.n, @@ -96,7 +105,7 @@ def run_to_completion(profile_dir: Optional[str] = None): for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): latencies.append(run_to_completion(profile_dir=None)) latencies = np.array(latencies) - percentages = [10, 25, 50, 75, 90] + percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) print(f'Avg latency: {np.mean(latencies)} seconds') for percentage, percentile in zip(percentages, percentiles): @@ -114,12 +123,16 @@ def run_to_completion(profile_dir: Optional[str] = None): if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the latency of processing a single batch of ' 'requests till completion.') parser.add_argument('--model', type=str, default='facebook/opt-125m') parser.add_argument('--speculative-model', type=str, default=None) parser.add_argument('--num-speculative-tokens', type=int, default=None) + parser.add_argument('--speculative-draft-tensor-parallel-size', + '-spec-draft-tp', + type=int, + default=None) parser.add_argument('--tokenizer', type=str, default=None) 
parser.add_argument('--quantization', '-q', @@ -145,6 +158,12 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface') + parser.add_argument( + '--max-model-len', + type=int, + default=None, + help='Maximum length of a sequence (including prompt and output). ' + 'If None, will be derived from the model.') parser.add_argument( '--dtype', type=str, @@ -188,9 +207,10 @@ def run_to_completion(profile_dir: Optional[str] = None): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument('--block-size', type=int, default=16, @@ -222,6 +242,29 @@ def run_to_completion(profile_dir: Optional[str] = None): help='the fraction of GPU memory to be used for ' 'the model executor, which can range from 0 to 1.' 'If unspecified, will use the default value of 0.9.') + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=[ + 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', + 'bitsandbytes' + ], + help='The format of the model weights to load.\n\n' + '* "auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available.\n' + '* "pt" will load the weights in the pytorch bin format.\n' + '* "safetensors" will load the weights in the safetensors format.\n' + '* "npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading.\n' + '* "dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.\n' + '* "tensorizer" will load the weights using tensorizer from ' + 'CoreWeave. See the Tensorize vLLM Model script in the Examples' + 'section for more information.\n' + '* "bitsandbytes" will load the weights using bitsandbytes ' + 'quantization.\n') parser.add_argument( '--distributed-executor-backend', choices=['ray', 'mp'], @@ -229,5 +272,10 @@ def run_to_completion(profile_dir: Optional[str] = None): help='Backend to use for distributed serving. When more than 1 GPU ' 'is used, will be automatically set to "ray" if installed ' 'or "mp" (multiprocessing) otherwise.') + parser.add_argument( + '--otlp-traces-endpoint', + type=str, + default=None, + help='Target URL to which OpenTelemetry traces will be sent.') args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 089966986984f..395107a5ec747 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -1,7 +1,7 @@ -import argparse import time from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. 
Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501 @@ -44,7 +44,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance with or without automatic ' 'prefix caching.') parser.add_argument('--model', diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 4112a3272518e..42867fc40edd2 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -31,7 +31,7 @@ import warnings from dataclasses import dataclass from datetime import datetime -from typing import AsyncGenerator, List, Optional, Tuple +from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple import numpy as np from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput, @@ -39,7 +39,15 @@ from tqdm.asyncio import tqdm from transformers import PreTrainedTokenizerBase -from vllm.transformers_utils.tokenizer import get_tokenizer +try: + from vllm.transformers_utils.tokenizer import get_tokenizer +except ImportError: + from backend_request_func import get_tokenizer + +try: + from vllm.utils import FlexibleArgumentParser +except ImportError: + from argparse import ArgumentParser as FlexibleArgumentParser @dataclass @@ -200,12 +208,12 @@ def calculate_metrics( dur_s: float, tokenizer: PreTrainedTokenizerBase, ) -> Tuple[BenchmarkMetrics, List[int]]: - actual_output_lens = [] + actual_output_lens: List[int] = [] total_input = 0 completed = 0 - itls = [] - tpots = [] - ttfts = [] + itls: List[float] = [] + tpots: List[float] = [] + ttfts: List[float] = [] for i in range(len(outputs)): if outputs[i].success: # We use the tokenizer to count the number of output tokens for all @@ -265,7 +273,7 @@ async def benchmark( disable_tqdm: bool, ): if backend in ASYNC_REQUEST_FUNCS: - request_func = ASYNC_REQUEST_FUNCS.get(backend) + request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") @@ -292,7 +300,7 @@ async def benchmark( pbar = None if disable_tqdm else tqdm(total=len(input_requests)) benchmark_start_time = time.perf_counter() - tasks = [] + tasks: List[asyncio.Task] = [] async for request in get_request(input_requests, request_rate): prompt, prompt_len, output_len = request request_func_input = RequestFuncInput( @@ -310,7 +318,7 @@ async def benchmark( pbar=pbar))) outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks) - if not disable_tqdm: + if pbar is not None: pbar.close() benchmark_duration = time.perf_counter() - benchmark_start_time @@ 
-466,7 +474,7 @@ def main(args: argparse.Namespace): # Save config and results to json if args.save_result: - result_json = {} + result_json: Dict[str, Any] = {} # Setup current_dt = datetime.now().strftime("%Y%m%d-%H%M%S") @@ -499,6 +507,8 @@ def main(args: argparse.Namespace): # Save to file base_model_id = model_id.split("/")[-1] file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa + if args.result_filename: + file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) with open(file_name, "w") as outfile: @@ -506,7 +516,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the online serving throughput.") parser.add_argument( "--backend", @@ -639,6 +649,15 @@ def main(args: argparse.Namespace): help="Specify directory to save benchmark json results." "If not specified, results are saved in the current directory.", ) + parser.add_argument( + "--result-filename", + type=str, + default=None, + help="Specify the filename to save benchmark json results." + "If not specified, results will be saved in " + "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" + " format.", + ) args = parser.parse_args() main(args) diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index 90f7433e0ae28..a52e67bbbe7e3 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -10,7 +10,9 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) +from vllm.engine.arg_utils import EngineArgs from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS +from vllm.utils import FlexibleArgumentParser def sample_requests( @@ -81,6 +83,7 @@ def run_vllm( distributed_executor_backend: Optional[str], gpu_memory_utilization: float = 0.9, download_dir: Optional[str] = None, + load_format: str = EngineArgs.load_format, ) -> float: from vllm import LLM, SamplingParams llm = LLM( @@ -102,11 +105,12 @@ def run_vllm( enable_chunked_prefill=enable_chunked_prefill, max_num_batched_tokens=max_num_batched_tokens, distributed_executor_backend=distributed_executor_backend, + load_format=load_format, ) # Add the requests to the engine. 
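# A hedged sketch of how the new load_format plumbing above is meant to be
# exercised (the model name below is a placeholder, not part of this change):
# the benchmark takes its CLI default from EngineArgs.load_format, so the
# script stays in sync with whatever default the engine itself would pick.
#
from vllm import LLM
from vllm.engine.arg_utils import EngineArgs

example_llm = LLM(model="facebook/opt-125m",          # placeholder model for the sketch
                  load_format=EngineArgs.load_format)  # typically "auto"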
- prompts = [] - sampling_params = [] + prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] for prompt, _, output_len in requests: prompts.append(prompt) sampling_params.append( @@ -228,7 +232,7 @@ def main(args: argparse.Namespace): args.quantization_param_path, args.device, args.enable_prefix_caching, args.enable_chunked_prefill, args.max_num_batched_tokens, args.distributed_executor_backend, - args.gpu_memory_utilization, args.download_dir) + args.gpu_memory_utilization, args.download_dir, args.load_format) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -258,7 +262,7 @@ def main(args: argparse.Namespace): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Benchmark the throughput.") + parser = FlexibleArgumentParser(description="Benchmark the throughput.") parser.add_argument("--backend", type=str, choices=["vllm", "hf", "mii"], @@ -345,9 +349,10 @@ def main(args: argparse.Namespace): parser.add_argument( "--device", type=str, - default="cuda", - choices=["cuda", "cpu"], - help='device type for vLLM execution, supporting CUDA and CPU.') + default="auto", + choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], + help='device type for vLLM execution, supporting CUDA, OpenVINO and ' + 'CPU.') parser.add_argument( "--enable-prefix-caching", action='store_true', @@ -377,6 +382,29 @@ def main(args: argparse.Namespace): help='Backend to use for distributed serving. When more than 1 GPU ' 'is used, will be automatically set to "ray" if installed ' 'or "mp" (multiprocessing) otherwise.') + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=[ + 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer', + 'bitsandbytes' + ], + help='The format of the model weights to load.\n\n' + '* "auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available.\n' + '* "pt" will load the weights in the pytorch bin format.\n' + '* "safetensors" will load the weights in the safetensors format.\n' + '* "npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading.\n' + '* "dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.\n' + '* "tensorizer" will load the weights using tensorizer from ' + 'CoreWeave. 
See the Tensorize vLLM Model script in the Examples' + 'section for more information.\n' + '* "bitsandbytes" will load the weights using bitsandbytes ' + 'quantization.\n') args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py index 6de56f618700d..377f8683c021f 100644 --- a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py +++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py @@ -11,6 +11,7 @@ from weight_shapes import WEIGHT_SHAPES from vllm import _custom_ops as ops +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -46,7 +47,7 @@ def make_rand_tensors(dtype: torch.dtype, m: int, n: int, # impl -def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, +def pytorch_mm_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, scale_b: torch.tensor, out_dtype: torch.dtype) -> torch.tensor: return torch.mm(a, b) @@ -76,11 +77,7 @@ def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor, def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor, scale_b: torch.tensor, out_dtype: torch.dtype) -> torch.tensor: - return ops.cutlass_scaled_mm_dq(a, - b, - scale_a, - scale_b, - out_dtype=out_dtype) + return ops.cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=out_dtype) # bench @@ -119,14 +116,13 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers.append( bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, - torch.bfloat16, label, sub_label, pytorch_i8_impl, + torch.bfloat16, label, sub_label, pytorch_mm_impl, "pytorch_bf16_bf16_bf16_matmul-no-scales")) # cutlass impl timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_i8_i8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm")) return timers @@ -140,6 +136,13 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, timers = [] + # pytorch impl w. 
bf16 + timers.append( + bench_fn(a.to(dtype=torch.bfloat16, device="cuda"), + b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b, + torch.bfloat16, label, sub_label, pytorch_mm_impl, + "pytorch_bf16_bf16_bf16_matmul-no-scales")) + # pytorch impl: bf16 output, without fp8 fast accum timers.append( bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, @@ -164,14 +167,12 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str, # cutlass impl: bf16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.bfloat16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_bf16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm")) # cutlass impl: fp16 output timers.append( - bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"), - torch.float16, label, sub_label, cutlass_impl, - "cutlass_fp8_fp8_fp16_scaled_mm")) + bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label, + cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm")) return timers @@ -293,7 +294,7 @@ def to_torch_dtype(dt): return torch.float8_e4m3fn raise ValueError("unsupported dtype") - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description=""" Benchmark Cutlass GEMM. diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py index 7ad4a53d376b6..25ec9d6028627 100644 --- a/benchmarks/cutlass_benchmarks/weight_shapes.py +++ b/benchmarks/cutlass_benchmarks/weight_shapes.py @@ -22,6 +22,12 @@ ([4096, 22016], 1), ([11008, 4096], 0), ], + "meta-llama/Llama-3-8b": [ + ([4096, 6144], 1), + ([4096, 4096], 0), + ([4096, 28672], 1), + ([14336, 4096], 0), + ], "meta-llama/Llama-2-13b-hf": [ ([5120, 15360], 1), ([5120, 5120], 0), diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py index 59392947b15c8..601c4ea439aea 100644 --- a/benchmarks/kernels/benchmark_aqlm.py +++ b/benchmarks/kernels/benchmark_aqlm.py @@ -1,4 +1,3 @@ -import argparse import os import sys from typing import Optional @@ -10,6 +9,7 @@ from vllm.model_executor.layers.quantization.aqlm import ( dequantize_weight, generic_dequantize_gemm, get_int_dtype, optimized_dequantize_gemm) +from vllm.utils import FlexibleArgumentParser os.environ['CUDA_VISIBLE_DEVICES'] = '0' @@ -86,9 +86,9 @@ def dequant_no_scale( # Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against # the generic pytorch version. # Just visual comparison. 
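# A minimal sketch of the annotation fix applied below (the sizes are made up):
# torch.Tensor is the class and therefore the correct type annotation, while
# torch.tensor is the factory function; wrapping .item() in int() hands static
# checkers a plain Python integer rather than a generic number.
#
import torch

parts: torch.Tensor = torch.tensor([1024, 1024, 2048])
n = int(parts.sum().item())  # 4096 as a plain int, usable for shape arithmetic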
-def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: +def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') @@ -137,7 +137,7 @@ def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None: def main(): - parser = argparse.ArgumentParser(description="Benchmark aqlm performance.") + parser = FlexibleArgumentParser(description="Benchmark aqlm performance.") # Add arguments parser.add_argument("--nbooks", @@ -204,7 +204,7 @@ def main(): sys.stdout = sys.__stdout__ -def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, +def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods): # I didn't see visible improvements from increasing these, but feel free :) @@ -252,10 +252,10 @@ def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int, print('') -def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor, +def run_timing(num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method) -> float: - n = parts.sum().item() + n = int(parts.sum().item()) device = torch.device('cuda:0') diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py index b771911781574..261f5829631ee 100644 --- a/benchmarks/kernels/benchmark_marlin.py +++ b/benchmarks/kernels/benchmark_marlin.py @@ -1,4 +1,4 @@ -import argparse +from typing import List import torch import torch.utils.benchmark as benchmark @@ -15,6 +15,7 @@ MarlinWorkspace, marlin_24_quantize, marlin_quantize) from vllm.model_executor.layers.quantization.utils.quant_utils import ( gptq_pack, quantize_weights, sort_weights) +from vllm.utils import FlexibleArgumentParser DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"] DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512] @@ -23,8 +24,9 @@ K_FULL_OPTS = [False, True] -def bench_run(results, model, act_order, is_k_full, num_bits, group_size, - size_m, size_k, size_n): +def bench_run(results: List[benchmark.Measurement], model: str, + act_order: bool, is_k_full: bool, num_bits: int, group_size: int, + size_m: int, size_k: int, size_n: int): label = "Quant Matmul" sub_label = ("{}, act={} k_full={}, b={}, g={}, " @@ -156,7 +158,7 @@ def main(args): for i, model in enumerate(args.models): print(f"[{i}] {model}") - results = [] + results: List[benchmark.Measurement] = [] for model in args.models: for layer in WEIGHT_SHAPES[model]: @@ -209,7 +211,7 @@ def main(args): # python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501 # if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark Marlin across specified models/shapes/batches") parser.add_argument( "--models", diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index be5dd32bd6f91..e00696d6d43cb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -1,7 +1,7 @@ import argparse import time from datetime import datetime -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, TypedDict import ray import torch @@ -10,10 +10,20 @@ from transformers import AutoConfig from vllm.model_executor.layers.fused_moe.fused_moe import * +from vllm.utils import FlexibleArgumentParser + + +class BenchmarkConfig(TypedDict): + 
BLOCK_SIZE_M: int + BLOCK_SIZE_N: int + BLOCK_SIZE_K: int + GROUP_SIZE_M: int + num_warps: int + num_stages: int def benchmark_config( - config: Dict[str, int], + config: BenchmarkConfig, num_tokens: int, num_experts: int, shard_intermediate_size: int, @@ -92,7 +102,7 @@ def run(): start_event = torch.cuda.Event(enable_timing=True) end_event = torch.cuda.Event(enable_timing=True) - latencies = [] + latencies: List[float] = [] for i in range(num_iters): prepare(i) torch.cuda.synchronize() @@ -111,7 +121,7 @@ def get_configs_compute_bound() -> List[Dict[str, int]]: # Reduced search space for faster tuning. # TODO(woosuk): Increase the search space and use a performance model to # prune the search space. - configs = [] + configs: List[BenchmarkConfig] = [] for num_stages in [2, 3, 4, 5]: for block_m in [16, 32, 64, 128, 256]: for block_k in [64, 128, 256]: @@ -175,8 +185,8 @@ def tune( topk: int, dtype: torch.dtype, use_fp8: bool, - search_space: List[Dict[str, int]], - ) -> Dict[str, int]: + search_space: List[BenchmarkConfig], + ) -> BenchmarkConfig: best_config = None best_time = float("inf") for config in tqdm(search_space): @@ -199,10 +209,11 @@ def tune( best_config = config now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") + assert best_config is not None return best_config -def sort_config(config: Dict[str, int]) -> Dict[str, int]: +def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: return { "BLOCK_SIZE_M": config["BLOCK_SIZE_M"], "BLOCK_SIZE_N": config["BLOCK_SIZE_N"], @@ -214,7 +225,7 @@ def sort_config(config: Dict[str, int]) -> Dict[str, int]: def save_configs( - configs: Dict[int, Dict[str, int]], + configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, @@ -305,7 +316,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = FlexibleArgumentParser() parser.add_argument("--model", type=str, default="mistralai/Mixtral-8x7B-Instruct-v0.1") diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index e6f4e9e6b9716..16de60477c305 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -1,12 +1,12 @@ -import argparse import random import time -from typing import Optional +from typing import List, Optional import torch from vllm import _custom_ops as ops -from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random +from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, + create_kv_caches_with_random) NUM_BLOCKS = 1024 PARTITION_SIZE = 512 @@ -54,14 +54,17 @@ def main( # Create the block tables. max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size - block_tables = [] + block_tables_lst: List[List[int]] = [] for _ in range(num_seqs): block_table = [ random.randint(0, NUM_BLOCKS - 1) for _ in range(max_num_blocks_per_seq) ] - block_tables.append(block_table) - block_tables = torch.tensor(block_tables, dtype=torch.int, device=device) + block_tables_lst.append(block_table) + + block_tables = torch.tensor(block_tables_lst, + dtype=torch.int, + device=device) # Create the KV cache. 
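# A short sketch of the block-table construction just above (all sizes here are
# illustrative): plain Python lists are gathered first and the tensor is built
# once at the end, so block_tables_lst keeps a single static type instead of
# being rebound from List[List[int]] to torch.Tensor.
#
import random
from typing import List

import torch

num_seqs, max_num_blocks_per_seq, num_blocks = 4, 8, 1024
block_tables_lst: List[List[int]] = [
    [random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_seq)]
    for _ in range(num_seqs)
]
block_tables = torch.tensor(block_tables_lst, dtype=torch.int)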
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS, @@ -158,14 +161,14 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float: if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the paged attention kernel.") parser.add_argument("--version", type=str, choices=["v1", "v2"], default="v2") parser.add_argument("--batch-size", type=int, default=8) - parser.add_argument("--seq_len", type=int, default=4096) + parser.add_argument("--seq-len", type=int, default=4096) parser.add_argument("--num-query-heads", type=int, default=64) parser.add_argument("--num-kv-heads", type=int, default=8) parser.add_argument("--head-size", diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py index 00e55f6060b52..78736c7a7ba6f 100644 --- a/benchmarks/kernels/benchmark_rope.py +++ b/benchmarks/kernels/benchmark_rope.py @@ -1,11 +1,12 @@ -import argparse from itertools import accumulate -from typing import Optional +from typing import List, Optional import nvtx import torch -from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.rotary_embedding import (RotaryEmbedding, + get_rope) +from vllm.utils import FlexibleArgumentParser def benchmark_rope_kernels_multi_lora( @@ -37,7 +38,7 @@ def benchmark_rope_kernels_multi_lora( }) # non-batched RoPE takes only one scaling factor, we create multiple # instances to simulate the same behavior - non_batched_ropes = [] + non_batched_ropes: List[RotaryEmbedding] = [] for scaling_factor in scaling_factors: non_batched_ropes.append( get_rope(head_size, rotary_dim, max_position, base, is_neox_style, @@ -85,7 +86,7 @@ def benchmark_rope_kernels_multi_lora( if __name__ == '__main__': - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description="Benchmark the rotary embedding kernels.") parser.add_argument("--is-neox-style", type=bool, default=True) parser.add_argument("--batch-size", type=int, default=16) diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py index c846e47de1fcf..203699e9a8d06 100644 --- a/benchmarks/overheads/benchmark_hashing.py +++ b/benchmarks/overheads/benchmark_hashing.py @@ -1,8 +1,8 @@ -import argparse import cProfile import pstats from vllm import LLM, SamplingParams +from vllm.utils import FlexibleArgumentParser # A very long prompt, total number of tokens is about 15k. LONG_PROMPT = ["You are an expert in large language models, aren't you?" 
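# A hedged sketch of the parser swap that runs through all of these benchmark
# scripts: FlexibleArgumentParser from vllm.utils is dropped in where
# argparse.ArgumentParser was used before, and scripts that must also run
# without vLLM installed (as benchmark_serving.py does above) keep a fallback
# to the standard parser.
#
try:
    from vllm.utils import FlexibleArgumentParser
except ImportError:
    from argparse import ArgumentParser as FlexibleArgumentParser

parser = FlexibleArgumentParser(description="Example benchmark CLI.")
parser.add_argument("--seq-len", type=int, default=4096)
args = parser.parse_args()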
@@ -47,7 +47,7 @@ def main(args): if __name__ == "__main__": - parser = argparse.ArgumentParser( + parser = FlexibleArgumentParser( description='Benchmark the performance of hashing function in' 'automatic prefix caching.') parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k') diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 61d4843838ba0..690559ee265e9 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -33,9 +33,23 @@ function (find_isa CPUINFO TARGET OUT) endif() endfunction() +function (is_avx512_disabled OUT) + set(DISABLE_AVX512 $ENV{VLLM_CPU_DISABLE_AVX512}) + if(DISABLE_AVX512 AND DISABLE_AVX512 STREQUAL "true") + set(${OUT} ON PARENT_SCOPE) + else() + set(${OUT} OFF PARENT_SCOPE) + endif() +endfunction() + +is_avx512_disabled(AVX512_DISABLED) + +find_isa(${CPUINFO} "avx2" AVX2_FOUND) find_isa(${CPUINFO} "avx512f" AVX512_FOUND) +find_isa(${CPUINFO} "POWER10" POWER10_FOUND) +find_isa(${CPUINFO} "POWER9" POWER9_FOUND) -if (AVX512_FOUND) +if (AVX512_FOUND AND NOT AVX512_DISABLED) list(APPEND CXX_COMPILE_FLAGS "-mavx512f" "-mavx512vl" @@ -53,8 +67,18 @@ if (AVX512_FOUND) else() message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.") endif() +elseif (AVX2_FOUND) + list(APPEND CXX_COMPILE_FLAGS "-mavx2") + message(WARNING "vLLM CPU backend using AVX2 ISA") +elseif (POWER9_FOUND OR POWER10_FOUND) + message(STATUS "PowerPC detected") + # Check for PowerPC VSX support + list(APPEND CXX_COMPILE_FLAGS + "-mvsx" + "-mcpu=native" + "-mtune=native") else() - message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.") + message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") endif() message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f3c1286dd8498..4869cad541135 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -147,16 +147,23 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if (${GPU_LANG} STREQUAL "HIP") # # `GPU_ARCHES` controls the `--offload-arch` flags. - # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled - # via the `PYTORCH_ROCM_ARCH` env variable. # - + # If PYTORCH_ROCM_ARCH env variable exists, then we take it as a list, + # if not, then we use CMAKE_HIP_ARCHITECTURES which was generated by calling + # "rocm_agent_enumerator" in "enable_language(HIP)" + # (in file Modules/CMakeDetermineHIPCompiler.cmake) + # + if(DEFINED ENV{PYTORCH_ROCM_ARCH}) + set(HIP_ARCHITECTURES $ENV{PYTORCH_ROCM_ARCH}) + else() + set(HIP_ARCHITECTURES ${CMAKE_HIP_ARCHITECTURES}) + endif() # # Find the intersection of the supported + detected architectures to # set the module architecture flags. # set(${GPU_ARCHES}) - foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES}) + foreach (_ARCH ${HIP_ARCHITECTURES}) if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST) list(APPEND ${GPU_ARCHES} ${_ARCH}) endif() @@ -164,7 +171,7 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES) if(NOT ${GPU_ARCHES}) message(FATAL_ERROR - "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is" + "None of the detected ROCm architectures: ${HIP_ARCHITECTURES} is" " supported. 
Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.") endif() diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 86ac2e75e78ee..5ed1dc3b8f792 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -135,6 +135,12 @@ __device__ __forceinline__ T gelu_fast_kernel(const T& x) { return ((T)0.5) * x * (((T)1.0) + t); } +template +__device__ __forceinline__ T gelu_quick_kernel(const T& x) { + // x * sigmoid(1.702 * x) + return (T)(((float)x) / (1.0f + expf(-1.702f * (float)x))); +} + } // namespace vllm void gelu_new(torch::Tensor& out, // [..., d] @@ -148,3 +154,9 @@ void gelu_fast(torch::Tensor& out, // [..., d] { LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); } + +void gelu_quick(torch::Tensor& out, // [..., d] + torch::Tensor& input) // [..., d] +{ + LAUNCH_ACTIVATION_KERNEL(vllm::gelu_quick_kernel); +} diff --git a/csrc/cpu/activation.cpp b/csrc/cpu/activation.cpp index becd2ac42f17a..039b8d5c30d46 100644 --- a/csrc/cpu/activation.cpp +++ b/csrc/cpu/activation.cpp @@ -59,6 +59,13 @@ FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) { return w3 * x * (ones + t); } +FORCE_INLINE vec_op::FP32Vec8 gelu_quick_act(const vec_op::FP32Vec8& x) { + const vec_op::FP32Vec8 zeros(0.0); + const vec_op::FP32Vec8 ones(1.0); + const vec_op::FP32Vec8 w1(1.702f); + return x / (ones + (zeros - w1 * x).exp()); +} + FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) { const vec_op::FP32Vec8 ones(1.0); const vec_op::FP32Vec8 w1(M_SQRT1_2); @@ -142,3 +149,15 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input) { CPU_KERNEL_GUARD_OUT(gelu_fast_impl) }); } + +void gelu_quick(torch::Tensor& out, torch::Tensor& input) { + int num_tokens = input.numel() / input.size(-1); + int d = input.size(-1); + + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_quick_impl", [&] { + CPU_KERNEL_GUARD_IN(gelu_quick_impl) + activation_kernel( + num_tokens, d, input.data_ptr(), out.data_ptr()); + CPU_KERNEL_GUARD_OUT(gelu_quick_impl) + }); +} diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp index 034c406a532d5..0213be09105ed 100644 --- a/csrc/cpu/cpu_types.hpp +++ b/csrc/cpu/cpu_types.hpp @@ -2,351 +2,14 @@ #ifndef CPU_TYPES_HPP #define CPU_TYPES_HPP -#include -#include - -namespace vec_op { - -// FIXME: FP16 is not fully supported in Torch-CPU -#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ - AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) - -#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) - -#ifndef CPU_OP_GUARD -#define CPU_KERNEL_GUARD_IN(NAME) -#define CPU_KERNEL_GUARD_OUT(NAME) +#if defined(__x86_64__) + //x86 implementation + #include "cpu_types_x86.hpp" +#elif defined(__POWER9_VECTOR__) + //ppc implementation + #include "cpu_types_vsx.hpp" #else -#define CPU_KERNEL_GUARD_IN(NAME) \ - std::cout << #NAME << " invoked." << std::endl; -#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." 
<< std::endl; -#endif - -#define FORCE_INLINE __attribute__((always_inline)) inline - -namespace { -template -constexpr void unroll_loop_item(std::integer_sequence, F &&f) { - (f(std::integral_constant{}), ...); -} -}; // namespace - -template >> -constexpr void unroll_loop(F &&f) { - unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); -} - -template struct Vec { - constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } -}; - -struct FP32Vec8; -struct FP32Vec16; - -#ifdef __AVX512FP16__ -struct FP16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128h reg; - - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} - - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} - - explicit FP16Vec8(__m128h data) : reg(data) {} - - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } - - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } - - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } - - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } - - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } -}; + #warning "unsupported vLLM cpu implementation" #endif -struct BF16Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - - __m128i reg; - - explicit BF16Vec8(const void *ptr) - : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - - explicit BF16Vec8(const FP32Vec8 &); - - void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } -}; - -struct BF16Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - - __m256i reg; - - explicit BF16Vec16(const void *ptr) - : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - - explicit BF16Vec16(const FP32Vec16 &); - - void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } -}; - -struct BF16Vec32 : public Vec { - constexpr static int VEC_ELEM_NUM = 32; - - __m512i reg; - - explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} - - explicit BF16Vec32(__m512i data) : reg(data) {} - - explicit BF16Vec32(BF16Vec8 &vec8_data) - : reg((__m512i)_mm512_inserti32x4( - _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( - (__m128i)vec8_data.reg), - (__m128i)vec8_data.reg, 1), - (__m128i)vec8_data.reg, 2), - (__m128i)vec8_data.reg, 3)) {} - - void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } -}; - -struct FP32Vec4 : public Vec { - constexpr static int VEC_ELEM_NUM = 4; - union AliasReg { - __m128 reg; - float values[VEC_ELEM_NUM]; - }; - - __m128 reg; - - explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} - - explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} - - explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} - - explicit FP32Vec4(__m128 data) : reg(data) {} - - explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} -}; - -struct FP32Vec8 : public Vec { - constexpr static int VEC_ELEM_NUM = 8; - union AliasReg { - __m256 reg; - float values[VEC_ELEM_NUM]; - }; - - __m256 reg; - - explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} - - explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} - - explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} - - explicit FP32Vec8(__m256 data) : reg(data) {} - - explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} - -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif - - explicit FP32Vec8(const BF16Vec8 &v) - : 
reg(_mm256_castsi256_ps( - _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} - - float reduce_sum() const { - AliasReg ar; - ar.reg = reg; - float result = 0; - unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); - - return result; - } - - FP32Vec8 exp() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), - expf(ar.values[5]), expf(ar.values[4]), - expf(ar.values[3]), expf(ar.values[2]), - expf(ar.values[1]), expf(ar.values[0]))); - } - - FP32Vec8 tanh() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), - tanhf(ar.values[5]), tanhf(ar.values[4]), - tanhf(ar.values[3]), tanhf(ar.values[2]), - tanhf(ar.values[1]), tanhf(ar.values[0]))); - } - - FP32Vec8 er() const { - AliasReg ar; - ar.reg = reg; - return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), - erf(ar.values[5]), erf(ar.values[4]), - erf(ar.values[3]), erf(ar.values[2]), - erf(ar.values[1]), erf(ar.values[0]))); - } - - FP32Vec8 operator*(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_mul_ps(reg, b.reg)); - } - - FP32Vec8 operator+(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_add_ps(reg, b.reg)); - } - - FP32Vec8 operator-(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_sub_ps(reg, b.reg)); - } - - FP32Vec8 operator/(const FP32Vec8 &b) const { - return FP32Vec8(_mm256_div_ps(reg, b.reg)); - } - - void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } -}; - -struct FP32Vec16 : public Vec { - constexpr static int VEC_ELEM_NUM = 16; - union AliasReg { - __m512 reg; - float values[VEC_ELEM_NUM]; - }; - - __m512 reg; - - explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} - - explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} - - explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} - - explicit FP32Vec16(__m512 data) : reg(data) {} - - explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} - - explicit FP32Vec16(const FP32Vec4 &data) - : reg((__m512)_mm512_inserti32x4( - _mm512_inserti32x4( - _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), - (__m128i)data.reg, 1), - (__m128i)data.reg, 2), - (__m128i)data.reg, 3)) {} - - explicit FP32Vec16(const FP32Vec8 &data) - : reg((__m512)_mm512_inserti32x8( - _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} - - explicit FP32Vec16(const BF16Vec16 &v) - : reg(_mm512_castsi512_ps( - _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} - - explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} - - FP32Vec16 operator*(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_mul_ps(reg, b.reg)); - } - - FP32Vec16 operator+(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_add_ps(reg, b.reg)); - } - - FP32Vec16 operator-(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_sub_ps(reg, b.reg)); - } - - FP32Vec16 operator/(const FP32Vec16 &b) const { - return FP32Vec16(_mm512_div_ps(reg, b.reg)); - } - - float reduce_sum() const { return _mm512_reduce_add_ps(reg); } - - template float reduce_sub_sum(int idx) { - static_assert(VEC_ELEM_NUM % group_size == 0); - constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); - __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); - return _mm512_mask_reduce_add_ps(mask, reg); - } - - void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } -}; - -template struct VecType { using vec_type = void; }; - -template using vec_t = typename VecType::vec_type; - -template <> struct VecType { using vec_type = 
FP32Vec8; }; - -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; -#endif - -template <> struct VecType { using vec_type = BF16Vec8; }; - -template void storeFP32(float v, T *ptr) { *ptr = v; } - -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - -inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { - acc = acc + a * b; -} - -#ifdef __AVX512BF16__ -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} - -inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { - acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); -} -#else -template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { - c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = - reinterpret_cast(&v); - *ptr = *(v_ptr + 1); -} - -inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) - : reg(_mm256_cvtepi32_epi16( - _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} - -inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) - : reg(_mm512_cvtepi32_epi16( - _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} -#endif - -inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } - -}; // namespace vec_op - #endif diff --git a/csrc/cpu/cpu_types_vsx.hpp b/csrc/cpu/cpu_types_vsx.hpp new file mode 100644 index 0000000000000..b50bdadc5713d --- /dev/null +++ b/csrc/cpu/cpu_types_vsx.hpp @@ -0,0 +1,491 @@ + +#ifndef CPU_TYPES_VSX_HPP +#define CPU_TYPES_VSX_HPP + +#include +#include +#include + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." 
<< std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +typedef struct ss16x8x2_t { + __vector signed short val[2]; +} ss16x8x2_t; + +typedef struct ss16x8x4_t { + __vector signed short val[4]; +} ss16x8x4_t; + +typedef struct f32x4x2_t { + __vector float val[2]; +} f32x4x2_t; + +typedef struct f32x4x4_t { + __vector float val[4]; +} f32x4x4_t; + +struct FP32Vec8; +struct FP32Vec16; + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __vector signed short reg; + + explicit BF16Vec8(const void *ptr) + : reg((__vector signed short)vec_xl(0, (__vector signed short *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__vector signed short *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + ss16x8x2_t reg; + + explicit BF16Vec16(const void *ptr) { + // Load 256 bits in two parts + reg.val[0] = (__vector signed short)vec_xl(0, (signed short *)ptr); + reg.val[1] = (__vector signed short)vec_xl(16, (signed short *)ptr); + } + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { + // Save 256 bits in two parts + vec_xst(reg.val[0], 0, (signed short *)ptr); + vec_xst(reg.val[1], 16, (signed short *)ptr); + } +}; + +const static __vector signed short zero = vec_splats((signed short)0); + +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + ss16x8x4_t reg; + explicit BF16Vec32(const void *ptr) + : reg(*reinterpret_cast(ptr)) {} + + explicit BF16Vec32(ss16x8x4_t data) : reg(data) {} + + explicit BF16Vec32(const BF16Vec8 &vec8_data) : reg({ + vec8_data.reg, + vec8_data.reg, + vec8_data.reg, + vec8_data.reg + }) {} + + void save(void *ptr) const { *reinterpret_cast(ptr) = reg; } +}; + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __vector float reg; + float values[VEC_ELEM_NUM]; + }; + + __vector float reg; + + explicit FP32Vec4(float v) : reg(vec_splats(v)) {} + + explicit FP32Vec4() : reg(vec_splats(0.0f)) {} + + explicit FP32Vec4(const float *ptr) : reg(vec_xl(0, ptr)) {} + + explicit FP32Vec4(__vector float data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + f32x4x2_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x2_t reg; + + explicit FP32Vec8(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + } + + explicit FP32Vec8() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + } + + explicit FP32Vec8(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + } + + explicit FP32Vec8(f32x4x2_t data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + } + + explicit FP32Vec8(const BF16Vec8 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, v.reg); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { 
result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::exp(ar.values[0]); + ret.val[0][1] = std::exp(ar.values[1]); + ret.val[0][2] = std::exp(ar.values[2]); + ret.val[0][3] = std::exp(ar.values[3]); + ret.val[1][0] = std::exp(ar.values[4]); + ret.val[1][1] = std::exp(ar.values[5]); + ret.val[1][2] = std::exp(ar.values[6]); + ret.val[1][3] = std::exp(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 tanh() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::tanh(ar.values[0]); + ret.val[0][1] = std::tanh(ar.values[1]); + ret.val[0][2] = std::tanh(ar.values[2]); + ret.val[0][3] = std::tanh(ar.values[3]); + ret.val[1][0] = std::tanh(ar.values[4]); + ret.val[1][1] = std::tanh(ar.values[5]); + ret.val[1][2] = std::tanh(ar.values[6]); + ret.val[1][3] = std::tanh(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 er() const { + // TODO: Vectorize this + AliasReg ar; + ar.reg = reg; + f32x4x4_t ret; + ret.val[0][0] = std::erf(ar.values[0]); + ret.val[0][1] = std::erf(ar.values[1]); + ret.val[0][2] = std::erf(ar.values[2]); + ret.val[0][3] = std::erf(ar.values[3]); + ret.val[1][0] = std::erf(ar.values[4]); + ret.val[1][1] = std::erf(ar.values[5]); + ret.val[1][2] = std::erf(ar.values[6]); + ret.val[1][3] = std::erf(ar.values[7]); + return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]})); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8({vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8({vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8({vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])}); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8({vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])}); + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + } +}; + +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + f32x4x4_t reg; + float values[VEC_ELEM_NUM]; + }; + + f32x4x4_t reg; + + explicit FP32Vec16(float v) { + reg.val[0] = vec_splats(v); + reg.val[1] = vec_splats(v); + reg.val[2] = vec_splats(v); + reg.val[3] = vec_splats(v); + } + + explicit FP32Vec16() { + reg.val[0] = vec_splats(0.0f); + reg.val[1] = vec_splats(0.0f); + reg.val[2] = vec_splats(0.0f); + reg.val[3] = vec_splats(0.0f); + } + + explicit FP32Vec16(const float *ptr) { + reg.val[0] = vec_xl(0, ptr); + reg.val[1] = vec_xl(16, ptr); + reg.val[2] = vec_xl(32, ptr); + reg.val[3] = vec_xl(48, ptr); + } + + explicit FP32Vec16(f32x4x4_t data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[2]; + reg.val[3] = data.reg.val[3]; + } + + explicit FP32Vec16(const FP32Vec4 &data) { + reg.val[0] = data.reg; + reg.val[1] = data.reg; + reg.val[2] = data.reg; + reg.val[3] = data.reg; + } + + explicit FP32Vec16(const FP32Vec8 &data) { + reg.val[0] = data.reg.val[0]; + reg.val[1] = data.reg.val[1]; + reg.val[2] = data.reg.val[0]; + reg.val[3] = data.reg.val[1]; + } + + explicit FP32Vec16(const BF16Vec16 &v) { + reg.val[0] = (__vector float)vec_mergeh(zero, 
v.reg.val[0]); + reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]); + reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]); + reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_mul(reg.val[0], b.reg.val[0]), + vec_mul(reg.val[1], b.reg.val[1]), + vec_mul(reg.val[2], b.reg.val[2]), + vec_mul(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_add(reg.val[0], b.reg.val[0]), + vec_add(reg.val[1], b.reg.val[1]), + vec_add(reg.val[2], b.reg.val[2]), + vec_add(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_sub(reg.val[0], b.reg.val[0]), + vec_sub(reg.val[1], b.reg.val[1]), + vec_sub(reg.val[2], b.reg.val[2]), + vec_sub(reg.val[3], b.reg.val[3])})); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(f32x4x4_t({ + vec_div(reg.val[0], b.reg.val[0]), + vec_div(reg.val[1], b.reg.val[1]), + vec_div(reg.val[2], b.reg.val[2]), + vec_div(reg.val[3], b.reg.val[3])})); + } + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + + AliasReg ar; + ar.reg = reg; + float result = 0; + const int start = idx * group_size; + unroll_loop( + [&result, &start, ar](int i) { result += ar.values[start + i]; }); + + return result; + } + + void save(float *ptr) const { + vec_xst(reg.val[0], 0, ptr); + vec_xst(reg.val[1], 16, ptr); + vec_xst(reg.val[2], 32, ptr); + vec_xst(reg.val[3], 48, ptr); + } +}; + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifndef __VEC_CLASS_FP_NAN +#define __VEC_CLASS_FP_NAN (1 << 6) +#endif + +const static __vector unsigned char omask = { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; +#ifndef _ARCH_PWR10 +const static __vector unsigned int bias = { 0x00007fff, 0x00007fff, 0x00007fff, 0x00007fff }; +const static __vector unsigned int nan = { 0x7fc00000, 0x7fc00000, 0x7fc00000, 0x7fc00000 }; +const static __vector unsigned int sh16 = { 16, 16, 16, 16 }; +const static __vector unsigned int one = { 1, 1, 1, 1 }; +#endif + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[2]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + reg = vec_perm(ret[0], ret[1], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + lsb0 
= vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + reg = (__vector signed short)vec_perm(inp0, inp1, omask); +#endif +} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { +#ifdef _ARCH_PWR10 + __vector signed short ret[4]; + ret[0] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[0]); + ret[1] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[1]); + ret[2] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[2]); + ret[3] = (__vector signed short)__builtin_vsx_xvcvspbf16((__vector unsigned char)v.reg.val[3]); + reg.val[0] = vec_perm(ret[0], ret[1], omask); + reg.val[1] = vec_perm(ret[2], ret[3], omask); +#elif defined(_ARCH_PWR9) + __vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]); + __vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]); + __vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]); + __vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]); + __vector unsigned int lsb0 = vec_sr(inp0, sh16); + __vector unsigned int lsb1 = vec_sr(inp1, sh16); + __vector unsigned int lsb2 = vec_sr(inp2, sh16); + __vector unsigned int lsb3 = vec_sr(inp3, sh16); + lsb0 = vec_and(lsb0, one); + lsb1 = vec_and(lsb1, one); + lsb2 = vec_and(lsb2, one); + lsb3 = vec_and(lsb3, one); + __vector unsigned int rnd0 = vec_add(lsb0, bias); + __vector unsigned int rnd1 = vec_add(lsb1, bias); + __vector unsigned int rnd2 = vec_add(lsb2, bias); + __vector unsigned int rnd3 = vec_add(lsb3, bias); + inp0 = vec_add(inp0, rnd0); + inp1 = vec_add(inp1, rnd1); + inp2 = vec_add(inp2, rnd2); + inp3 = vec_add(inp3, rnd3); + __vector __bool int sel0 = vec_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN); + __vector __bool int sel1 = vec_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN); + __vector __bool int sel2 = vec_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN); + __vector __bool int sel3 = vec_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN); + inp0 = vec_sel(inp0, nan, sel0); + inp1 = vec_sel(inp1, nan, sel1); + inp2 = vec_sel(inp2, nan, sel2); + inp3 = vec_sel(inp3, nan, sel3); + inp0 = vec_sr(inp0, sh16); + inp1 = vec_sr(inp1, sh16); + inp2 = vec_sr(inp2, sh16); + inp3 = vec_sr(inp3, sh16); + reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask); + reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask); +#endif +} + +inline void prefetch(const void *addr) { + __asm__ __volatile__("dcbt 0, %0" : : "r"(addr) : "memory"); +} + +}; // namespace vec_op + +#endif diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp new file mode 100644 index 0000000000000..f50620a5287d4 --- /dev/null +++ b/csrc/cpu/cpu_types_x86.hpp @@ -0,0 +1,515 @@ + +#ifndef CPU_TYPES_X86_HPP +#define CPU_TYPES_X86_HPP + +#include +#include + +#ifndef __AVX2__ +static_assert(false, "AVX2 must be supported for the current implementation."); +#endif + +namespace vec_op { + +// FIXME: FP16 is not fully supported in Torch-CPU +#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) 
\ + AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + +#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) + +#ifndef CPU_OP_GUARD +#define CPU_KERNEL_GUARD_IN(NAME) +#define CPU_KERNEL_GUARD_OUT(NAME) +#else +#define CPU_KERNEL_GUARD_IN(NAME) \ + std::cout << #NAME << " invoked." << std::endl; +#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; +#endif + +#define FORCE_INLINE __attribute__((always_inline)) inline + +namespace { +template +constexpr void unroll_loop_item(std::integer_sequence, F &&f) { + (f(std::integral_constant{}), ...); +} +}; // namespace + +template >> +constexpr void unroll_loop(F &&f) { + unroll_loop_item(std::make_integer_sequence{}, std::forward(f)); +} + +template struct Vec { + constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; } +}; + +struct FP32Vec8; +struct FP32Vec16; + +#ifdef __AVX512FP16__ +struct FP16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128h reg; + + explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + + explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + + explicit FP16Vec8(__m128h data) : reg(data) {} + + FP16Vec8 operator*(const FP16Vec8 &b) const { + return FP16Vec8(_mm_mul_ph(reg, b.reg)); + } + + FP16Vec8 operator+(const FP16Vec8 &b) const { + return FP16Vec8(_mm_add_ph(reg, b.reg)); + } + + FP16Vec8 operator-(const FP16Vec8 &b) const { + return FP16Vec8(_mm_sub_ph(reg, b.reg)); + } + + FP16Vec8 operator/(const FP16Vec8 &b) const { + return FP16Vec8(_mm_div_ph(reg, b.reg)); + } + + void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } +}; +#endif + +struct BF16Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + + __m128i reg; + + explicit BF16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} + + explicit BF16Vec8(const FP32Vec8 &); + + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; + +struct BF16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + __m256i reg; + + explicit BF16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} + + explicit BF16Vec16(const FP32Vec16 &); + + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } +}; + +#ifdef __AVX512F__ +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m512i reg; + + explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {} + + explicit BF16Vec32(__m512i data) : reg(data) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg((__m512i)_mm512_inserti32x4( + _mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512( + (__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1), + (__m128i)vec8_data.reg, 2), + (__m128i)vec8_data.reg, 3)) {} + + void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; } +}; +#else +struct BF16Vec32 : public Vec { + constexpr static int VEC_ELEM_NUM = 32; + + __m256i reg_low; + __m256i reg_high; + + explicit BF16Vec32(const void *ptr) + : reg_low(_mm256_loadu_si256((__m256i const *)ptr)), + reg_high(_mm256_loadu_si256((__m256i const *)ptr + 1)) {} + + explicit BF16Vec32(__m256i low, __m256i high) : reg_low(low), + reg_high(high) {} + + explicit BF16Vec32(BF16Vec8 &vec8_data) + : reg_low((__m256i)_mm256_inserti32x4( + _mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)), + reg_high((__m256i)_mm256_inserti32x4( + 
_mm256_castsi128_si256((__m128i)vec8_data.reg), + (__m128i)vec8_data.reg, 1)) {} + + void save(void *ptr) const { + *reinterpret_cast<__m256i *>(ptr) = reg_low; + *reinterpret_cast<__m256i *>((__m256i *)ptr + 1) = reg_high; + } +}; +#endif + +struct FP32Vec4 : public Vec { + constexpr static int VEC_ELEM_NUM = 4; + union AliasReg { + __m128 reg; + float values[VEC_ELEM_NUM]; + }; + + __m128 reg; + + explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {} + + explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {} + + explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {} + + explicit FP32Vec4(__m128 data) : reg(data) {} + + explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {} +}; + +struct FP32Vec8 : public Vec { + constexpr static int VEC_ELEM_NUM = 8; + union AliasReg { + __m256 reg; + float values[VEC_ELEM_NUM]; + }; + + __m256 reg; + + explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {} + + explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {} + + explicit FP32Vec8(__m256 data) : reg(data) {} + + explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} + +#ifdef __AVX512FP16__ + explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} +#endif + + explicit FP32Vec8(const BF16Vec8 &v) + : reg(_mm256_castsi256_ps( + _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {} + + float reduce_sum() const { + AliasReg ar; + ar.reg = reg; + float result = 0; + unroll_loop([&result, &ar](int i) { result += ar.values[i]; }); + + return result; + } + + FP32Vec8 exp() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]), + expf(ar.values[5]), expf(ar.values[4]), + expf(ar.values[3]), expf(ar.values[2]), + expf(ar.values[1]), expf(ar.values[0]))); + } + + FP32Vec8 tanh() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]), + tanhf(ar.values[5]), tanhf(ar.values[4]), + tanhf(ar.values[3]), tanhf(ar.values[2]), + tanhf(ar.values[1]), tanhf(ar.values[0]))); + } + + FP32Vec8 er() const { + AliasReg ar; + ar.reg = reg; + return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]), + erf(ar.values[5]), erf(ar.values[4]), + erf(ar.values[3]), erf(ar.values[2]), + erf(ar.values[1]), erf(ar.values[0]))); + } + + FP32Vec8 operator*(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_mul_ps(reg, b.reg)); + } + + FP32Vec8 operator+(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_add_ps(reg, b.reg)); + } + + FP32Vec8 operator-(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_sub_ps(reg, b.reg)); + } + + FP32Vec8 operator/(const FP32Vec8 &b) const { + return FP32Vec8(_mm256_div_ps(reg, b.reg)); + } + + void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } +}; + +#ifdef __AVX512F__ +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + union AliasReg { + __m512 reg; + float values[VEC_ELEM_NUM]; + }; + + __m512 reg; + + explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {} + + explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {} + + explicit FP32Vec16(__m512 data) : reg(data) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg((__m512)_mm512_inserti32x4( + _mm512_inserti32x4( + _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg), + (__m128i)data.reg, 1), + (__m128i)data.reg, 2), + (__m128i)data.reg, 3)) {} + + explicit 
FP32Vec16(const FP32Vec8 &data) + : reg((__m512)_mm512_inserti32x8( + _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {} + + explicit FP32Vec16(const BF16Vec16 &v) + : reg(_mm512_castsi512_ps( + _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_mul_ps(reg, b.reg)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_add_ps(reg, b.reg)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_sub_ps(reg, b.reg)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm512_div_ps(reg, b.reg)); + } + + float reduce_sum() const { return _mm512_reduce_add_ps(reg); } + + template float reduce_sub_sum(int idx) { + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size)); + return _mm512_mask_reduce_add_ps(mask, reg); + } + + void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } +}; +#else +struct FP32Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; + + union AliasReg { + __m256 reg; + float values[8]; + }; + + __m256 reg_low; + __m256 reg_high; + + explicit FP32Vec16(float v) : reg_low(_mm256_set1_ps(v)), + reg_high(_mm256_set1_ps(v)) {} + + explicit FP32Vec16() : reg_low(_mm256_set1_ps(0.0)), + reg_high(_mm256_set1_ps(0.0)) {} + + explicit FP32Vec16(const float *ptr) : reg_low(_mm256_loadu_ps(ptr)), + reg_high(_mm256_loadu_ps(ptr + 8)) {} + + explicit FP32Vec16(__m256 low, __m256 high) : reg_low(low), reg_high(high) {} + + explicit FP32Vec16(const FP32Vec16 &data) : reg_low(data.reg_low), + reg_high(data.reg_high) {} + + explicit FP32Vec16(const FP32Vec4 &data) + : reg_low((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)), + reg_high((__m256)_mm256_inserti128_si256( + _mm256_castsi128_si256((__m128i)data.reg), + (__m128i)data.reg, 1)) {} + + explicit FP32Vec16(const FP32Vec8 &data) + : reg_low(data.reg), reg_high(data.reg) {} + + explicit FP32Vec16(const BF16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + __m256i v_low_epi32 = _mm256_cvtepu16_epi32(low); + __m256i v_high_epi32 = _mm256_cvtepu16_epi32(high); + + __m256i v_low_shifted = _mm256_bslli_epi128(v_low_epi32, 2); + __m256i v_high_shifted = _mm256_bslli_epi128(v_high_epi32, 2); + + reg_low = _mm256_castsi256_ps(v_low_shifted); + reg_high = _mm256_castsi256_ps(v_high_shifted); + } + + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + + FP32Vec16 operator*(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_mul_ps(reg_low, b.reg_low), + _mm256_mul_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator+(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_add_ps(reg_low, b.reg_low), + _mm256_add_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator-(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_sub_ps(reg_low, b.reg_low), + _mm256_sub_ps(reg_high, b.reg_high)); + } + + FP32Vec16 operator/(const FP32Vec16 &b) const { + return FP32Vec16(_mm256_div_ps(reg_low, b.reg_low), + _mm256_div_ps(reg_high, b.reg_high)); + } + + float reduce_sum() const { + FP32Vec8 low = FP32Vec8(reg_low); + FP32Vec8 high = FP32Vec8(reg_high); + return low.reduce_sum() + high.reduce_sum(); + } + + template float reduce_sub_sum(int 
idx) { + float sum = 0.0; + static_assert(VEC_ELEM_NUM % group_size == 0); + constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); + uint32_t mask = base_mask << (idx * group_size); + + AliasReg ar; + + auto func = [&sum, &mask, &ar](int i) { + int flag = mask & 0x1; + mask = mask >> 1; + if (flag != 0) sum += ar.values[i]; + }; + + ar.reg = reg_low; + unroll_loop(func); + + ar.reg = reg_high; + unroll_loop(func); + + return sum; + } + + void save(float *ptr) const { + _mm256_storeu_ps(ptr, reg_low); + _mm256_storeu_ps(ptr + 8, reg_high); + } +}; +#endif + +template struct VecType { using vec_type = void; }; + +template using vec_t = typename VecType::vec_type; + +template <> struct VecType { using vec_type = FP32Vec8; }; + +#ifdef __AVX512FP16__ +template <> struct VecType { using vec_type = FP16Vec16; }; +#endif + +template <> struct VecType { using vec_type = BF16Vec8; }; + +template void storeFP32(float v, T *ptr) { *ptr = v; } + +#ifdef __AVX512FP16__ +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast<_Float16 *>(ptr) = v; +} +#endif + +inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { + acc = acc + a * b; +} + +#ifdef __AVX512BF16__ +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {} + +inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) { + acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg); +} +#else +template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { + c10::BFloat16 __attribute__((__may_alias__)) *v_ptr = + reinterpret_cast(&v); + *ptr = *(v_ptr + 1); +} + +#ifdef __AVX512F__ +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtepi32_epi16( + _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtepi32_epi16( + _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {} +#else +namespace{ +__m128i FP32Vec8_to_BF16Vec8_avx2(__m256 a) { + __m256i ai = _mm256_castps_si256(a); + ai = _mm256_srli_epi32(ai, 16); + ai = _mm256_packus_epi32(ai, ai); + ai = _mm256_permute4x64_epi64(ai, 0b00111001); + return _mm256_extracti128_si256(ai, 0); +} +} + +inline BF16Vec8::BF16Vec8(const FP32Vec8 &v) + : reg(FP32Vec8_to_BF16Vec8_avx2(v.reg)) {} + +inline BF16Vec16::BF16Vec16(const FP32Vec16 &v) { + BF16Vec8 low = BF16Vec8(FP32Vec8(v.reg_low)); + BF16Vec8 high = BF16Vec8(FP32Vec8(v.reg_high)); + reg = _mm256_insertf128_si256(_mm256_castsi128_si256(low.reg), high.reg, 1); +} +#endif // __AVX512F__ +#endif // __AVX512BF16__ + +inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); } + +}; // namespace vec_op + +#endif diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index a2bf0d49adba5..39e8cf3ed3c10 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -58,6 +58,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("gelu_fast(Tensor! out, Tensor input) -> ()"); ops.impl("gelu_fast", torch::kCPU, &gelu_fast); + // Quick GELU implementation. + ops.def("gelu_quick(Tensor! out, Tensor input) -> ()"); + ops.impl("gelu_quick", torch::kCPU, &gelu_quick); + // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. 
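The vec_op wrappers above (FP32Vec4/FP32Vec8/FP32Vec16 and the BF16 conversions) are the building blocks the CPU kernels in this diff compose. As a rough illustration of how they fit together, here is a minimal sketch of a QuickGELU-style loop, out = x * sigmoid(1.702 * x), written only against the FP32Vec8 API shown above. The function name, the include path, and the scalar-tail handling are illustrative assumptions; this is not the gelu_quick kernel whose binding is added in torch_bindings.cpp.

#include <cmath>
#include "cpu_types.hpp"  // header that defines vec_op::FP32Vec8 (path assumed)

// Minimal sketch: vectorized QuickGELU, out[i] = x * sigmoid(1.702 * x),
// using only the FP32Vec8 operations shown above. quick_gelu_sketch is a
// hypothetical name, not the registered gelu_quick op.
void quick_gelu_sketch(const float *in, float *out, int n) {
  const vec_op::FP32Vec8 one(1.0f);
  const vec_op::FP32Vec8 zero(0.0f);
  const vec_op::FP32Vec8 alpha(1.702f);
  int i = 0;
  for (; i + 8 <= n; i += 8) {
    vec_op::FP32Vec8 x(in + i);
    // sigmoid(a*x) = 1 / (1 + exp(-a*x)), via the element-wise exp() above
    vec_op::FP32Vec8 s(one / (one + (zero - alpha * x).exp()));
    (x * s).save(out + i);
  }
  for (; i < n; ++i)  // scalar tail for the remaining elements
    out[i] = in[i] / (1.0f + std::exp(-1.702f * in[i]));
}

Since FP32Vec8::exp() above falls back to per-element expf, the vector path of such a loop would match the scalar tail bit-for-bit, which keeps accuracy checks simple.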
ops.def( diff --git a/csrc/ops.h b/csrc/ops.h index 0c270a78c331f..8a92afdc81a9b 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -1,5 +1,6 @@ #pragma once +#include #include void paged_attention_v1( @@ -49,6 +50,8 @@ void gelu_new(torch::Tensor& out, torch::Tensor& input); void gelu_fast(torch::Tensor& out, torch::Tensor& input); +void gelu_quick(torch::Tensor& out, torch::Tensor& input); + #ifndef USE_ROCM torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes, const torch::Tensor& codebooks, @@ -90,9 +93,12 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, int64_t size_k, int64_t size_n, int64_t num_bits); -void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a, - torch::Tensor const& b, torch::Tensor const& a_scales, - torch::Tensor const& b_scales); +bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); + +void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + c10::optional const& bias); #endif diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h index 4b376261d30d2..2c8d007d8719f 100644 --- a/csrc/punica/bgmv/bgmv_config.h +++ b/csrc/punica/bgmv/bgmv_config.h @@ -16,14 +16,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 512) \ f(in_T, out_T, W_T, narrow, 640) \ f(in_T, out_T, W_T, narrow, 768) \ + f(in_T, out_T, W_T, narrow, 896) \ f(in_T, out_T, W_T, narrow, 1024) \ f(in_T, out_T, W_T, narrow, 1152) \ + f(in_T, out_T, W_T, narrow, 1216) \ f(in_T, out_T, W_T, narrow, 1280) \ f(in_T, out_T, W_T, narrow, 1536) \ + f(in_T, out_T, W_T, narrow, 1664) \ f(in_T, out_T, W_T, narrow, 1728) \ f(in_T, out_T, W_T, narrow, 1792) \ f(in_T, out_T, W_T, narrow, 2048) \ + f(in_T, out_T, W_T, narrow, 2240) \ f(in_T, out_T, W_T, narrow, 2304) \ + f(in_T, out_T, W_T, narrow, 2368) \ + f(in_T, out_T, W_T, narrow, 2432) \ f(in_T, out_T, W_T, narrow, 2560) \ f(in_T, out_T, W_T, narrow, 2752) \ f(in_T, out_T, W_T, narrow, 2816) \ @@ -31,32 +37,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 3328) \ f(in_T, out_T, W_T, narrow, 3456) \ f(in_T, out_T, W_T, narrow, 3584) \ + f(in_T, out_T, W_T, narrow, 3712) \ f(in_T, out_T, W_T, narrow, 4096) \ + f(in_T, out_T, W_T, narrow, 4480) \ f(in_T, out_T, W_T, narrow, 4608) \ + f(in_T, out_T, W_T, narrow, 4736) \ + f(in_T, out_T, W_T, narrow, 4864) \ f(in_T, out_T, W_T, narrow, 5120) \ f(in_T, out_T, W_T, narrow, 5504) \ f(in_T, out_T, W_T, narrow, 5632) \ + f(in_T, out_T, W_T, narrow, 5888) \ f(in_T, out_T, W_T, narrow, 6144) \ f(in_T, out_T, W_T, narrow, 6400) \ f(in_T, out_T, W_T, narrow, 6848) \ f(in_T, out_T, W_T, narrow, 6912) \ f(in_T, out_T, W_T, narrow, 7168) \ + f(in_T, out_T, W_T, narrow, 7424) \ f(in_T, out_T, W_T, narrow, 8192) \ + f(in_T, out_T, W_T, narrow, 8960) \ f(in_T, out_T, W_T, narrow, 9216) \ + f(in_T, out_T, W_T, narrow, 9472) \ f(in_T, out_T, W_T, narrow, 10240) \ f(in_T, out_T, W_T, narrow, 11008) \ + f(in_T, out_T, W_T, narrow, 11264) \ f(in_T, out_T, W_T, narrow, 12288) \ f(in_T, out_T, W_T, narrow, 13696) \ f(in_T, out_T, W_T, narrow, 13824) \ f(in_T, out_T, W_T, narrow, 14336) \ + f(in_T, out_T, W_T, narrow, 14784) \ + f(in_T, out_T, W_T, narrow, 14848) \ f(in_T, out_T, W_T, narrow, 15360) \ f(in_T, out_T, W_T, narrow, 16384) \ + f(in_T, out_T, W_T, narrow, 18944) \ f(in_T, out_T, W_T, narrow, 20480) \ f(in_T, out_T, W_T, 
narrow, 22016) \ + f(in_T, out_T, W_T, narrow, 22528) \ f(in_T, out_T, W_T, narrow, 24576) \ f(in_T, out_T, W_T, narrow, 27392) \ f(in_T, out_T, W_T, narrow, 27648) \ f(in_T, out_T, W_T, narrow, 28672) \ + f(in_T, out_T, W_T, narrow, 29568) \ + f(in_T, out_T, W_T, narrow, 29696) \ f(in_T, out_T, W_T, narrow, 32000) \ f(in_T, out_T, W_T, narrow, 32256) \ f(in_T, out_T, W_T, narrow, 32512) \ @@ -65,6 +86,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 36864) \ f(in_T, out_T, W_T, narrow, 43264) \ f(in_T, out_T, W_T, narrow, 49152) \ + f(in_T, out_T, W_T, narrow, 49408) \ + f(in_T, out_T, W_T, narrow, 60544) \ + f(in_T, out_T, W_T, narrow, 60672) \ f(in_T, out_T, W_T, narrow, 64000) \ f(in_T, out_T, W_T, narrow, 64256) \ f(in_T, out_T, W_T, narrow, 64512) \ @@ -74,12 +98,14 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, narrow, 128000) \ f(in_T, out_T, W_T, narrow, 128256) \ f(in_T, out_T, W_T, narrow, 128512) \ + + // Keep above in sync with vllm/lora/layers::LogitsProcessorWithLoRA // and vllm/tests/lora/test_punica.py -// Used for defining kernels going from the variety of +// Used for defining kernels going from the variety of // dim in to the narrow dim out - // Using it for the fully sharded column + // Using it for the fully sharded column // parallel LoRA A which splits the rank dim #define FOR_INST_BGMV_NARROW(f, in_T, out_T, W_T, narrow) \ f(in_T, out_T, W_T, 128, narrow) \ @@ -87,14 +113,20 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 512, narrow) \ f(in_T, out_T, W_T, 640, narrow) \ f(in_T, out_T, W_T, 768, narrow) \ + f(in_T, out_T, W_T, 896, narrow) \ f(in_T, out_T, W_T, 1024, narrow) \ f(in_T, out_T, W_T, 1152, narrow) \ + f(in_T, out_T, W_T, 1216, narrow) \ f(in_T, out_T, W_T, 1280, narrow) \ f(in_T, out_T, W_T, 1536, narrow) \ + f(in_T, out_T, W_T, 1664, narrow) \ f(in_T, out_T, W_T, 1728, narrow) \ f(in_T, out_T, W_T, 1792, narrow) \ f(in_T, out_T, W_T, 2048, narrow) \ + f(in_T, out_T, W_T, 2240, narrow) \ f(in_T, out_T, W_T, 2304, narrow) \ + f(in_T, out_T, W_T, 2368, narrow) \ + f(in_T, out_T, W_T, 2432, narrow) \ f(in_T, out_T, W_T, 2560, narrow) \ f(in_T, out_T, W_T, 2752, narrow) \ f(in_T, out_T, W_T, 2816, narrow) \ @@ -102,32 +134,47 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 3328, narrow) \ f(in_T, out_T, W_T, 3456, narrow) \ f(in_T, out_T, W_T, 3584, narrow) \ + f(in_T, out_T, W_T, 3712, narrow) \ f(in_T, out_T, W_T, 4096, narrow) \ + f(in_T, out_T, W_T, 4480, narrow) \ f(in_T, out_T, W_T, 4608, narrow) \ + f(in_T, out_T, W_T, 4736, narrow) \ + f(in_T, out_T, W_T, 4864, narrow) \ f(in_T, out_T, W_T, 5120, narrow) \ f(in_T, out_T, W_T, 5504, narrow) \ f(in_T, out_T, W_T, 5632, narrow) \ + f(in_T, out_T, W_T, 5888, narrow) \ f(in_T, out_T, W_T, 6144, narrow) \ f(in_T, out_T, W_T, 6400, narrow) \ f(in_T, out_T, W_T, 6848, narrow) \ f(in_T, out_T, W_T, 6912, narrow) \ f(in_T, out_T, W_T, 7168, narrow) \ + f(in_T, out_T, W_T, 7424, narrow) \ f(in_T, out_T, W_T, 8192, narrow) \ + f(in_T, out_T, W_T, 8960, narrow) \ f(in_T, out_T, W_T, 9216, narrow) \ + f(in_T, out_T, W_T, 9472, narrow) \ f(in_T, out_T, W_T, 10240, narrow) \ f(in_T, out_T, W_T, 11008, narrow) \ + f(in_T, out_T, W_T, 11264, narrow) \ f(in_T, out_T, W_T, 12288, narrow) \ f(in_T, out_T, W_T, 13696, narrow) \ f(in_T, out_T, W_T, 13824, narrow) \ f(in_T, out_T, W_T, 14336, narrow) \ + f(in_T, out_T, W_T, 14784, narrow) \ + 
f(in_T, out_T, W_T, 14848, narrow) \ f(in_T, out_T, W_T, 15360, narrow) \ f(in_T, out_T, W_T, 16384, narrow) \ + f(in_T, out_T, W_T, 18944, narrow) \ f(in_T, out_T, W_T, 20480, narrow) \ f(in_T, out_T, W_T, 22016, narrow) \ + f(in_T, out_T, W_T, 22528, narrow) \ f(in_T, out_T, W_T, 24576, narrow) \ f(in_T, out_T, W_T, 27392, narrow) \ f(in_T, out_T, W_T, 27648, narrow) \ f(in_T, out_T, W_T, 28672, narrow) \ + f(in_T, out_T, W_T, 29568, narrow) \ + f(in_T, out_T, W_T, 29696, narrow) \ f(in_T, out_T, W_T, 32000, narrow) \ f(in_T, out_T, W_T, 32256, narrow) \ f(in_T, out_T, W_T, 32512, narrow) \ @@ -136,6 +183,9 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X, f(in_T, out_T, W_T, 36864, narrow) \ f(in_T, out_T, W_T, 43264, narrow) \ f(in_T, out_T, W_T, 49152, narrow) \ + f(in_T, out_T, W_T, 49408, narrow) \ + f(in_T, out_T, W_T, 60544, narrow) \ + f(in_T, out_T, W_T, 60672, narrow) \ f(in_T, out_T, W_T, 64000, narrow) \ f(in_T, out_T, W_T, 64256, narrow) \ f(in_T, out_T, W_T, 64512, narrow) \ diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp index 8f38bbf507901..877a9f5b9e5de 100644 --- a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp +++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp @@ -153,7 +153,7 @@ struct Sm90RowOrScalarBroadcast { CUTLASS_DEVICE void begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) { - if (params.ptr_row == nullptr) { + if (!params.row_broadcast) { return; } diff --git a/csrc/quantization/cutlass_w8a8/common.hpp b/csrc/quantization/cutlass_w8a8/common.hpp index 999b7b251ab33..bf04bb400790f 100644 --- a/csrc/quantization/cutlass_w8a8/common.hpp +++ b/csrc/quantization/cutlass_w8a8/common.hpp @@ -1,6 +1,7 @@ #pragma once #include "cutlass/cutlass.h" +#include /** * Helper function for checking CUTLASS errors @@ -10,3 +11,17 @@ TORCH_CHECK(status == cutlass::Status::kSuccess, \ cutlassGetStatusString(status)) \ } + +inline uint32_t next_pow_2(uint32_t const num) { + if (num <= 1) return num; + return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1)); +} + +inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { + int max_shared_mem_per_block_opt_in = 0; + cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, + cudaDevAttrMaxSharedMemoryPerBlockOptin, + device); + return max_shared_mem_per_block_opt_in; +} + diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu new file mode 100644 index 0000000000000..6ce25c5ac897b --- /dev/null +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -0,0 +1,609 @@ +#include +#include + +#include + +// clang-format will break include orders +// clang-format off +#include "cute/tensor.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/numeric_types.h" + +#include "cutlass/util/device_memory.h" + +#include "cutlass/cutlass.h" +#include "cutlass/gemm_coord.h" +#include "cutlass/arch/mma_sm75.h" +#include "cutlass/arch/arch.h" +#include "cutlass/arch/mma.h" +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "cutlass/epilogue/threadblock/fusion/visitors.hpp" +#include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h" + +#include "broadcast_load_epilogue_c2x.hpp" +#include "common.hpp" +// clang-format on + +using namespace cute; + +/* + This file defines quantized GEMM operations using the CUTLASS 2.x API, for + 
NVIDIA GPUs with SM versions prior to sm90 (Hopper). + + Epilogue functions can be defined to post-process the output before it is + written to GPU memory. + Epilogues must contain a public type named EVTCompute of type Sm80EVT, + as well as a static prepare_args function that constructs an + EVTCompute::Arguments struct. +*/ + +namespace { + +// Wrappers for the GEMM kernel that is used to guard against compilation on +// architectures that will never use the kernel. The purpose of this is to +// reduce the size of the compiled binary. +// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef +// into code that will be executed on the device where it is defined. +template +struct enable_sm75_to_sm80 : Kernel { + template + CUTLASS_DEVICE static void invoke(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800 + Kernel::invoke(std::forward(args)...); +#endif + } +}; + +template +struct enable_sm80_to_sm89 : Kernel { + template + CUTLASS_DEVICE static void invoke(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890 + Kernel::invoke(std::forward(args)...); +#endif + } +}; + +template +struct enable_sm89_to_sm90 : Kernel { + template + CUTLASS_DEVICE static void invoke(Args&&... args) { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900 + Kernel::invoke(std::forward(args)...); +#endif + } +}; + +/* + * This class provides the common ScaleA and ScaleB descriptors for the + * ScaledEpilogue and ScaledEpilogueBias classes. + */ +template +struct ScaledEpilogueBase { + protected: + using Accum = cutlass::epilogue::threadblock::VisitorAccFetch; + + using ScaleA = cutlass::epilogue::threadblock::VisitorColOrScalarBroadcast< + OutputTileThreadMap, float, Stride, Int<0>, Int<0>>>; + + using ScaleB = cutlass::epilogue::threadblock::VisitorRowOrScalarBroadcast< + OutputTileThreadMap, float, Stride, Int<1>, Int<0>>>; +}; + +/* + This epilogue function defines a quantized GEMM operation similar to + torch._scaled_mm. + + A and B may be both either int8 or fp8_e4m3. A can be quantized per-tensor or + per-row. B can be quantized per-tensor or per-column. + Any combination of per-tensor and per-row or column is supported. + A and B must have symmetric quantization (zero point == 0). + + So the GEMM operation is D = (a_scales * A) (b_scales * B), where the + scales are applied elementwise with numpy-style broadcasting. + + ScaleA and ScaleB define the epilogue functions that apply the scales for + the A and B operands respectively. These scales may be either per-tensor or + per row or column. 
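+
+ For example, with per-row a_scales of shape (m, 1) and per-column b_scales
+ of shape (1, n), the broadcasting above yields
+   D[i][j] = a_scales[i] * b_scales[j] * sum_k A[i][k] * B[k][j],
+ and a per-tensor scale simply reuses the same scalar for every row or column.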
+*/ +template +struct ScaledEpilogue + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + public: + using EVTCompute = + cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args}; + return evt_compute_args; + } +}; + +template +struct ScaledEpilogueBias + : private ScaledEpilogueBase { + private: + using SUPER = ScaledEpilogueBase; + using Accum = typename SUPER::Accum; + using ScaleA = typename SUPER::ScaleA; + using ScaleB = typename SUPER::ScaleB; + + using Compute0 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiplies, float, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using EVTCompute0 = + cutlass::epilogue::threadblock::Sm80EVT; + + using Compute1 = cutlass::epilogue::threadblock::VisitorCompute< + cutlass::multiply_add, ElementD, float, + cutlass::FloatRoundStyle::round_to_nearest>; + + using Bias = cutlass::epilogue::threadblock::VisitorRowBroadcast< + OutputTileThreadMap, ElementD, Stride, Int<1>, Int<0>>>; + + public: + using EVTCompute = cutlass::epilogue::threadblock::Sm80EVT; + using ArgumentType = typename EVTCompute::Arguments; + + static ArgumentType prepare_args(torch::Tensor const& a_scales, + torch::Tensor const& b_scales, + torch::Tensor const& bias) { + using ScaleAArgs = typename ScaleA::Arguments; + using ScaleBArgs = typename ScaleB::Arguments; + using BiasArgs = typename Bias::Arguments; + + ScaleBArgs b_args{b_scales.data_ptr(), b_scales.numel() != 1, {}}; + ScaleAArgs a_args{a_scales.data_ptr(), a_scales.numel() != 1, {}}; + BiasArgs bias_args{static_cast(bias.data_ptr()), {}}; + + typename EVTCompute0::Arguments evt0_compute_args{b_args}; + + typename EVTCompute::Arguments evt_compute_args{a_args, evt0_compute_args, + bias_args}; + return evt_compute_args; + } +}; + +template typename ArchGuard, + typename ElementAB_, typename ElementD_, + template typename Epilogue_, typename TileShape, + typename WarpShape, typename InstructionShape, int32_t MainLoopStages> +struct cutlass_2x_gemm { + using ElementAB = ElementAB_; + using ElementD = ElementD_; + + using ElementAcc = + typename std::conditional, int32_t, + float>::type; + + using Operator = + typename std::conditional, + cutlass::arch::OpMultiplyAddSaturate, + cutlass::arch::OpMultiplyAdd>::type; + + using OutputTileThreadMap = + cutlass::epilogue::threadblock::OutputTileThreadLayout< + TileShape, WarpShape, float, 4, 1 /* epilogue stages */ + >; + + using Epilogue = Epilogue_; + using EVTCompute = typename Epilogue::EVTCompute; + + using D = 
cutlass::epilogue::threadblock::VisitorAuxStore< + OutputTileThreadMap, ElementD, cutlass::FloatRoundStyle::round_to_nearest, + Stride, Int<0>>>; + + using EVTD = cutlass::epilogue::threadblock::Sm80EVT; + + // clang-format off + using RowMajor = typename cutlass::layout::RowMajor; + using ColumnMajor = typename cutlass::layout::ColumnMajor; + using KernelType = + ArchGuard::GemmKernel>; + // clang-format on + + using Op = cutlass::gemm::device::GemmUniversalAdapter; +}; + +template +void cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... epilogue_params) { + using ElementAB = typename Gemm::ElementAB; + using ElementD = typename Gemm::ElementD; + + int32_t m = a.size(0); + int32_t n = b.size(1); + int32_t k = a.size(1); + cutlass::gemm::GemmCoord problem_size{m, n, k}; + + int64_t lda = a.stride(0); + int64_t ldb = b.stride(1); + int64_t ldc = out.stride(0); + + using StrideC = Stride, Int<0>>; + StrideC c_stride{ldc, Int<1>{}, Int<0>{}}; + + auto a_ptr = static_cast(a.data_ptr()); + auto b_ptr = static_cast(b.data_ptr()); + auto c_ptr = static_cast(out.data_ptr()); + + typename Gemm::D::Arguments d_args{c_ptr, c_stride}; + + using Epilogue = typename Gemm::Epilogue; + auto evt_args = + Epilogue::prepare_args(std::forward(epilogue_params)...); + + typename Gemm::EVTD::Arguments epilogue_args{ + evt_args, + d_args, + }; + + typename Gemm::Op::Arguments args{ + cutlass::gemm::GemmUniversalMode::kGemmSplitKParallel, // universal mode + problem_size, // problem size + 1, // batch count + epilogue_args, + a_ptr, + b_ptr, + nullptr, + nullptr, + 0, + 0, + 0, + 0, + lda, + ldb, + ldc, + ldc}; + + // Launch the CUTLASS GEMM kernel. + typename Gemm::Op gemm_op; + size_t workspace_size = gemm_op.get_workspace_size(args); + cutlass::device_memory::allocation workspace(workspace_size); + + auto stream = at::cuda::getCurrentCUDAStream(a.get_device()); + + CUTLASS_CHECK(gemm_op.can_implement(args)); + cutlass::Status status = gemm_op(args, workspace.get(), stream); + CUTLASS_CHECK(status); +} + +template +void fallback_cutlass_gemm_caller(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + // In some cases, the GPU isn't able to accommodate the + // shared memory requirements of the Gemm. In such cases, use + // the FallbackGemm instead. 
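+ // The decision compares sizeof(SharedStorage) of each kernel against the
+ // device's cudaDevAttrMaxSharedMemoryPerBlockOptin limit (queried once for
+ // device 0 via get_cuda_max_shared_memory_per_block_opt_in). For example,
+ // if that limit is below the 122880 bytes required by sm80_config_M64, the
+ // 61440-byte sm80_config_M32 kernel (the FallbackGemm passed in by
+ // cutlass_gemm_sm80_dispatch below) is launched instead.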
+ static const int max_shared_mem_per_block_opt_in = + get_cuda_max_shared_memory_per_block_opt_in(0); + + size_t const gemm_shared_mem_size = + sizeof(typename Gemm::KernelType::SharedStorage); + size_t const fallback_gemm_shared_mem_size = + sizeof(typename FallbackGemm::KernelType::SharedStorage); + + if (gemm_shared_mem_size <= max_shared_mem_per_block_opt_in) { + return cutlass_gemm_caller(out, a, b, + std::forward(args)...); + } else { + TORCH_CHECK(fallback_gemm_shared_mem_size <= + max_shared_mem_per_block_opt_in); + return cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + +template typename Epilogue> +struct sm80_config_default { + // This config is used in 2 cases, + // - M in (128, inf) + // - M in (64, 128] and N >= 8192 + // Shared Memory required by this Gemm - 81920 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<128, 128, 64>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M64 { + // This config is used in 2 cases, + // - M in (32, 64] + // - M in (64, 128] and N < 8192 + // Shared Memory required by this Gemm - 122880 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<64, 128, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<64, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M32 { + // M in (16, 32] + // Shared Memory required by this Gemm - 61440 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<32, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<32, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +template typename Epilogue> +struct sm80_config_M16 { + // M in [1, 16] + // Shared Memory required by this Gemm - 51200 bytes + static_assert(std::is_same()); + using TileShape = typename cutlass::gemm::GemmShape<16, 64, 128>; + using WarpShape = typename cutlass::gemm::GemmShape<16, 64, 64>; + using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>; + using Cutlass2xGemm = + cutlass_2x_gemm; +}; + +} // namespace + +template typename Epilogue, + typename... EpilogueArgs> +void cutlass_gemm_sm80_dispatch(torch::Tensor& out, torch::Tensor const& a, + torch::Tensor const& b, + EpilogueArgs&&... args) { + static_assert(std::is_same()); + TORCH_CHECK(a.dtype() == torch::kInt8); + TORCH_CHECK(b.dtype() == torch::kInt8); + + using Cutlass2xGemmDefault = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128BigN = + typename sm80_config_default::Cutlass2xGemm; + using Cutlass2xGemmM128SmallN = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM64 = + typename sm80_config_M64::Cutlass2xGemm; + using Cutlass2xGemmM32 = + typename sm80_config_M32::Cutlass2xGemm; + using Cutlass2xGemmM16 = + typename sm80_config_M16::Cutlass2xGemm; + + // Due to shared memory requirements, some Gemms may fail to run on some + // GPUs. As the name indicates, the Fallback Gemm is used as an alternative + // in such cases. + // sm80_config_M16 has the least shared-memory requirement. 
However, + // based on some profiling, we select sm80_config_M32 as a better alternative + // performance wise. + using FallbackGemm = + typename sm80_config_M32::Cutlass2xGemm; + + uint32_t const m = a.size(0); + uint32_t const mp2 = + std::max(static_cast(16), next_pow_2(m)); // next power of 2 + if (mp2 <= 16) { + // M in [1, 16] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 32) { + // M in (16, 32] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 64) { + // M in (32, 64] + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else if (mp2 <= 128) { + // M in (64, 128] + uint32_t const n = out.size(1); + bool const small_n = n < 8192; + if (small_n) { + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } else { + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } + } else { + // M in (128, inf) + return fallback_cutlass_gemm_caller( + out, a, b, std::forward(args)...); + } +} + +template