
Commit

Merge branch 'main' into phi_4_bug_fix
adarshxs authored Mar 9, 2025
2 parents 3fb076b + df84ab2 commit 10cccb8
Showing 102 changed files with 3,307 additions and 1,489 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/pr-test-amd.yml
@@ -35,12 +35,12 @@ jobs:
else
DEVICE_FLAG="--device /dev/dri"
fi
docker pull lmsysorg/sglang:v0.4.3.post2-rocm630
docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
lmsysorg/sglang:v0.4.3.post2-rocm630
lmsysorg/sglang:v0.4.3.post4-rocm630
- name: Install dependencies
run: |
@@ -71,12 +71,12 @@ jobs:
else
DEVICE_FLAG="--device /dev/dri"
fi
docker pull lmsysorg/sglang:v0.4.3.post2-rocm630
docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
lmsysorg/sglang:v0.4.3.post2-rocm630
lmsysorg/sglang:v0.4.3.post4-rocm630
- name: Install dependencies
run: |
@@ -90,11 +90,11 @@ jobs:
- name: MLA TEST
timeout-minutes: 20
run: |
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py TestMLA
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py
finish:
needs: [
accuracy-test-1-gpu-amd
accuracy-test-1-gpu-amd, mla-test-1-gpu-amd
]
runs-on: ubuntu-latest
steps:
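
The updated MLA step now runs the whole test module rather than only the TestMLA class, against the post4 ROCm image. A rough local reproduction, assuming a ci_sglang container started with the same flags shown in this workflow:

    # Pull the bumped ROCm image used by both jobs in this workflow.
    docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
    # With a ci_sglang container running as above (checkout mounted at /sglang-checkout),
    # execute the full MLA suite exactly as the updated step does:
    docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py
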
35 changes: 33 additions & 2 deletions .github/workflows/pr-test-sgl-kernel.yml
@@ -27,7 +27,7 @@ jobs:
with:
source: sgl-kernel
extensions: h,c,cpp,hpp,cu,cuh,cc
clangFormatVersion: 16
clangFormatVersion: 18
style: file

build-wheels:
@@ -95,8 +95,39 @@ jobs:
run: |
pip3 uninstall sgl-kernel -y
mla-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
needs: build-wheels
runs-on: 1-gpu-runner
steps:
- uses: actions/checkout@v4

- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-*

- name: Install
run: |
bash scripts/ci_install_dependency.sh
pip3 uninstall sgl-kernel -y || true
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
pip3 list | grep sgl-kernel
- name: Run test
timeout-minutes: 30
run: |
cd test/srt
python3 test_mla_deepseek_v3.py
- name: Uninstall dependencies
run: |
pip3 uninstall sgl-kernel -y
finish:
needs: [unit-test, lint]
needs: [unit-test, mla-test, lint]
runs-on: ubuntu-latest
steps:
- name: Finish
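
The new mla-test job downloads the freshly built sgl-kernel wheel artifact and runs the DeepSeek-V3 MLA test. A rough local equivalent, assuming a wheel has already been built into sgl-kernel/dist/ and the CI dependency script has been run:

    # Install the locally built kernel wheel over any existing copy (sketch).
    pip3 uninstall sgl-kernel -y || true
    pip3 install sgl-kernel/dist/*.whl --force-reinstall --no-deps
    pip3 list | grep sgl-kernel
    # Run the test the job adds:
    cd test/srt
    python3 test_mla_deepseek_v3.py
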
2 changes: 2 additions & 0 deletions .github/workflows/pr-test.yml
@@ -269,6 +269,8 @@ jobs:
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 10
run: |
2 changes: 1 addition & 1 deletion .github/workflows/release-pypi-kernel.yml
@@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- sgl-kernel/src/sgl-kernel/version.py
- sgl-kernel/python/sgl_kernel/version.py
workflow_dispatch:

concurrency:
4 changes: 2 additions & 2 deletions .github/workflows/release-whl-kernel.yml
@@ -9,7 +9,7 @@ on:
branches:
- main
paths:
- sgl-kernel/src/sgl-kernel/version.py
- sgl-kernel/python/sgl_kernel/version.py

jobs:
build-wheels:
@@ -59,7 +59,7 @@ jobs:
id: set_tag_name
run: |
if [ -z "${{ inputs.tag_name }}" ]; then
TAG_NAME="v$(cat sgl-kernel/src/sgl-kernel/version.py | cut -d'"' -f2)"
TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
else
echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
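
The tag-name step above derives the release tag from the relocated version file. As a small illustration (the version string shown is hypothetical), the cut -d'"' -f2 pipeline simply extracts whatever sits between the first pair of double quotes:

    # Hypothetical contents of sgl-kernel/python/sgl_kernel/version.py:
    #   __version__ = "0.0.4"
    TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
    echo "$TAG_NAME"   # -> v0.0.4 under the assumed file contents
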
3 changes: 0 additions & 3 deletions .gitmodules
@@ -7,6 +7,3 @@
[submodule "sgl-kernel/3rdparty/flashinfer"]
path = sgl-kernel/3rdparty/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
[submodule "sgl-kernel/3rdparty/turbomind"]
path = sgl-kernel/3rdparty/turbomind
url = https://github.com/InternLM/turbomind
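
Because the turbomind submodule entry is dropped here, an already-initialized checkout may keep a stale working copy after pulling this change. A minimal cleanup sketch, assuming the submodule had previously been initialized locally:

    # Deregister the removed submodule and delete its leftover directories (sketch).
    git submodule deinit -f sgl-kernel/3rdparty/turbomind || true
    rm -rf sgl-kernel/3rdparty/turbomind
    rm -rf .git/modules/sgl-kernel/3rdparty/turbomind
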
39 changes: 7 additions & 32 deletions docker/Dockerfile
@@ -30,44 +30,19 @@ ARG CUDA_VERSION
RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
&& git clone --depth=1 https://github.com/sgl-project/sglang.git \
&& if [ "$CUDA_VERSION" = "12.1.1" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu121; \
export CUINDEX=121; \
elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
export CUINDEX=124; \
elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
export CUINDEX=124; \
elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118; \
python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
export CUINDEX=118; \
python3 -m pip install --no-cache-dir sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
else \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
fi \
&& python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu${CUINDEX} \
&& cd sglang \
&& if [ "$BUILD_TYPE" = "srt" ]; then \
if [ "$CUDA_VERSION" = "12.1.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu118/torch2.5/flashinfer-python; \
python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
else \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
fi; \
else \
if [ "$CUDA_VERSION" = "12.1.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu118/torch2.5/flashinfer-python; \
python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
else \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
fi; \
fi
&& python3 -m pip --no-cache-dir install -e "python[${BUILD_TYPE}]" --find-links https://flashinfer.ai/whl/cu${CUINDEX}/torch2.5/flashinfer-python

ENV DEBIAN_FRONTEND=interactive
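
The refactor collapses the per-CUDA-version branches into a single CUINDEX variable, so the torch index, the cu118 sgl-kernel wheel, and the flashinfer wheel index are each selected once. A hedged build sketch; BUILD_TYPE is assumed to be an existing ARG with values such as srt or all (matching the python[${BUILD_TYPE}] extra above), and the repository root is assumed as the build context:

    # Build the image for one of the CUDA versions handled by the if/elif chain above.
    docker build \
      --build-arg CUDA_VERSION=12.4.1 \
      --build-arg BUILD_TYPE=srt \
      -t sglang:dev -f docker/Dockerfile .
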
104 changes: 104 additions & 0 deletions docker/k8s-sglang-distributed-sts.yaml
@@ -0,0 +1,104 @@
# Two Nodes Sglang example

apiVersion: apps/v1
kind: StatefulSet
metadata:
name: distributed-sglang
spec:
replicas: 2 # number of nodes/pods to run distributed sglang
selector:
matchLabels:
app: distributed-sglang
serviceName: ""
template:
metadata:
labels:
app: distributed-sglang
spec:
containers:
- name: sglang-container
image: docker.io/lmsysorg/sglang:latest
imagePullPolicy: Always # image may be replaced by official CI versioned image
command:
- /bin/bash
- -c
# please modify the sglang serving arguments below, as necessary.
# NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1
args:
- |
python3 -m sglang.launch_server \
--model /llm-folder \
--dist-init-addr sglang-master-pod:5000 \
--tensor-parallel-size 16 \
--nnodes 2 \
--node-rank $POD_INDEX \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000 \
--enable-metrics \
--enable-ep-moe \
--expert-parallel-size 16
env:
- name: POD_INDEX # reflects the node-rank
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
- name: NCCL_DEBUG
value: INFO
resources:
limits:
nvidia.com/gpu: "8"
requests:
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /llm-folder
name: llm
securityContext:
privileged: true # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true
hostNetwork: true
volumes:
- emptyDir:
medium: Memory
sizeLimit: 10Gi
name: dshm
- hostPath:
path: /llm-folder # replace with PVC or hostPath with your model weights
type: DirectoryOrCreate
name: llm
#- persistentVolumeClaim:
# claimName: llm-pvc
# name: llm
---
apiVersion: v1
kind: Service
metadata:
name: sglang-master-pod
spec:
type: ClusterIP
selector:
app: distributed-sglang
apps.kubernetes.io/pod-index: "0"
ports:
- name: dist-port
port: 5000
targetPort: 5000
---
# the serving service
apiVersion: v1
kind: Service
metadata:
name: sglang-serving-on-master
spec:
type: NodePort
selector:
app: distributed-sglang
apps.kubernetes.io/pod-index: "0"
ports:
- name: serving
port: 8000
targetPort: 8000
- name: metrics
port: 8080
targetPort: 8080
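
A short usage sketch for the manifest above, assuming a cluster with two 8-GPU nodes and the model weights available at the hostPath (or PVC) it mounts:

    # Create the StatefulSet plus the rendezvous and serving Services.
    kubectl apply -f docker/k8s-sglang-distributed-sts.yaml
    # Each pod's rank comes from the pod-index label surfaced as POD_INDEX.
    kubectl get pods -l app=distributed-sglang -o wide
    # Once both pods are ready, the NodePort service exposes the server on port 8000.
    kubectl get svc sglang-serving-on-master
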
12 changes: 11 additions & 1 deletion docs/backend/separate_reasoning.ipynb
@@ -11,7 +11,8 @@
"## Supported Models\n",
"\n",
"Currently, SGLang supports the following reasoning models:\n",
"- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags."
"- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.\n",
"- [QwQ](https://huggingface.co/Qwen/QwQ-32B): The reasoning content is wrapped with `<think>` and `</think>` tags."
]
},
{
@@ -55,6 +56,15 @@
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `--reasoning-parser` defines the parser used to interpret responses. Currently supported parsers include:\n",
"\n",
"- deepseek-r1: DeepSeek R1 series and QwQ (e.g. deepseek-ai/DeepSeek-R1, Qwen/QwQ-32B)."
]
},
{
"cell_type": "markdown",
"metadata": {},
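
To ground the new note about --reasoning-parser, here is a minimal launch sketch; the model path is only an example, and the behavior (splitting <think>-wrapped content into a separate reasoning field) follows the surrounding docs rather than anything verified here:

    # Launch with the DeepSeek-R1-style parser (per the note above, it also covers QwQ).
    python3 -m sglang.launch_server \
      --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
      --reasoning-parser deepseek-r1 \
      --port 30000
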
2 changes: 1 addition & 1 deletion docs/references/deepseek.md
@@ -84,7 +84,7 @@ Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/be

- **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase.

- **Flashinfer MLA Wrapper**: By providing `--enable-flashinfer-mla` argument, the server will use MLA kernels customized by Flashinfer. More details can be referred to [this document](https://docs.flashinfer.ai/api/mla.html). Under long input scenarios, flashinfer mla can improve performance significantly. Optimized triton kernels will be used when flashinfer mla is turned off.
- **Flashinfer MLA Wrapper**: By providing `--enable-flashinfer-mla` argument, the server will use MLA kernels customized by Flashinfer. More details can be referred to [this document](https://docs.flashinfer.ai/api/mla.html). Under long input scenarios, flashinfer mla can improve performance significantly. Optimized triton kernels will be used when flashinfer mla is turned off. Currently when using flashinfer mla wrapper and speculative decoding together, the `speculative_eagle_topk` parameter should be set to 1.

- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption.

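
A hedged launch sketch for the Flashinfer MLA note above; the speculative-decoding flag spellings are assumptions inferred from the speculative_eagle_topk parameter mentioned in the text, not verified CLI names, and sizing flags are illustrative only:

    # Use Flashinfer MLA kernels; per the updated note, when combined with EAGLE
    # speculative decoding the eagle top-k should be set to 1.
    python3 -m sglang.launch_server \
      --model-path deepseek-ai/DeepSeek-V3 \
      --enable-flashinfer-mla \
      --speculative-algorithm EAGLE \
      --speculative-eagle-topk 1 \
      --trust-remote-code
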
1 change: 1 addition & 0 deletions docs/references/general.rst
@@ -11,3 +11,4 @@ General Guidance
faq.md
learn_more.md
modelscope.md
production_metrics.md
16 changes: 15 additions & 1 deletion docs/start/install.md
@@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in
2. Execute the command `docker compose up -d` in your terminal.
</details>

## Method 5: Run on Kubernetes or Clouds with SkyPilot
## Method 5: Using Kubernetes

<details>
<summary>More</summary>

1. Option 1: For single node serving (typically when the model size fits into GPUs on one node)
Execute command `kubectl apply -f docker/k8s-sglang-service.yaml`, to create k8s deployment and service, with llama-31-8b as example.

2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
Modify the LLM model path and arguments as necessary, then execute command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml`, to create two nodes k8s statefulset and serving service.
</details>



## Method 6: Run on Kubernetes or Clouds with SkyPilot

<details>
<summary>More</summary>
10 changes: 5 additions & 5 deletions python/pyproject.toml
@@ -18,12 +18,15 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
[project.optional-dependencies]
runtime_common = [
"aiohttp",
"datasets",
"decord",
"fastapi",
"hf_transfer",
"huggingface_hub",
"interegular",
"llguidance>=0.6.15",
"modelscope",
"ninja",
"orjson",
"packaging",
"pillow",
@@ -33,18 +36,15 @@ runtime_common = [
"python-multipart",
"pyzmq>=25.1.2",
"torchao>=0.7.0",
"transformers @ git+https://github.com/huggingface/[email protected]",
"uvicorn",
"uvloop",
"xgrammar==0.1.14",
"ninja",
"transformers @ git+https://github.com/huggingface/transformers.git@84f0186",
"llguidance>=0.6.15",
"datasets"
]

srt = [
"sglang[runtime_common]",
"sgl-kernel==0.0.3.post6",
"sgl-kernel==0.0.4",
"flashinfer_python==0.2.2.post1",
"torch==2.5.1",
"vllm>=0.6.4.post1,<=0.7.2",
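
Since the srt extra now pins sgl-kernel==0.0.4 and a newer transformers commit, an existing development environment can be brought in line with something like the following (a sketch; the editable path assumes the repository root as the working directory):

    # Reinstall the runtime extras so the new pins (sgl-kernel 0.0.4, transformers@84f0186) take effect.
    pip3 install --upgrade -e "python[srt]"
    pip3 list | grep -E "sgl-kernel|transformers"
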
