
Commit

Merge branch 'main' into phi_4_bug_fix
adarshxs authored Mar 9, 2025
2 parents 3fb076b + df84ab2 commit 10cccb8
Showing 102 changed files with 3,307 additions and 1,489 deletions.
12 changes: 6 additions & 6 deletions .github/workflows/pr-test-amd.yml
@@ -35,12 +35,12 @@ jobs:
else
DEVICE_FLAG="--device /dev/dri"
fi
docker pull lmsysorg/sglang:v0.4.3.post2-rocm630
docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
lmsysorg/sglang:v0.4.3.post2-rocm630
lmsysorg/sglang:v0.4.3.post4-rocm630
- name: Install dependencies
run: |
@@ -71,12 +71,12 @@ jobs:
else
DEVICE_FLAG="--device /dev/dri"
fi
docker pull lmsysorg/sglang:v0.4.3.post2-rocm630
docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
lmsysorg/sglang:v0.4.3.post2-rocm630
lmsysorg/sglang:v0.4.3.post4-rocm630
- name: Install dependencies
run: |
@@ -90,11 +90,11 @@ jobs:
- name: MLA TEST
timeout-minutes: 20
run: |
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py TestMLA
docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py
finish:
needs: [
accuracy-test-1-gpu-amd
accuracy-test-1-gpu-amd, mla-test-1-gpu-amd
]
runs-on: ubuntu-latest
steps:
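
The updated MLA step now runs the whole test module rather than only the TestMLA class, against the post4 ROCm image. A rough local reproduction, assuming a ci_sglang container started with the same flags shown in this workflow:

    # Pull the bumped ROCm image used by both jobs in this workflow.
    docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
    # With a ci_sglang container running as above (checkout mounted at /sglang-checkout),
    # execute the full MLA suite exactly as the updated step does:
    docker exec -w /sglang-checkout/test/srt ci_sglang python3 test_mla.py
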
35 changes: 33 additions & 2 deletions .github/workflows/pr-test-sgl-kernel.yml
@@ -27,7 +27,7 @@ jobs:
with:
source: sgl-kernel
extensions: h,c,cpp,hpp,cu,cuh,cc
clangFormatVersion: 16
clangFormatVersion: 18
style: file

build-wheels:
@@ -95,8 +95,39 @@ jobs:
run: |
pip3 uninstall sgl-kernel -y
mla-test:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
needs: build-wheels
runs-on: 1-gpu-runner
steps:
- uses: actions/checkout@v4

- name: Download artifacts
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-*

- name: Install
run: |
bash scripts/ci_install_dependency.sh
pip3 uninstall sgl-kernel -y || true
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
pip3 list | grep sgl-kernel
- name: Run test
timeout-minutes: 30
run: |
cd test/srt
python3 test_mla_deepseek_v3.py
- name: Uninstall dependencies
run: |
pip3 uninstall sgl-kernel -y
finish:
needs: [unit-test, lint]
needs: [unit-test, mla-test, lint]
runs-on: ubuntu-latest
steps:
- name: Finish
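
The new mla-test job downloads the freshly built sgl-kernel wheel artifact and runs the DeepSeek-V3 MLA test. A rough local equivalent, assuming a wheel has already been built into sgl-kernel/dist/ and the CI dependency script has been run:

    # Install the locally built kernel wheel over any existing copy (sketch).
    pip3 uninstall sgl-kernel -y || true
    pip3 install sgl-kernel/dist/*.whl --force-reinstall --no-deps
    pip3 list | grep sgl-kernel
    # Run the test the job adds:
    cd test/srt
    python3 test_mla_deepseek_v3.py
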
2 changes: 2 additions & 0 deletions .github/workflows/pr-test.yml
@@ -269,6 +269,8 @@ jobs:
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 10
run: |
2 changes: 1 addition & 1 deletion .github/workflows/release-pypi-kernel.yml
@@ -5,7 +5,7 @@ on:
branches:
- main
paths:
- sgl-kernel/src/sgl-kernel/version.py
- sgl-kernel/python/sgl_kernel/version.py
workflow_dispatch:

concurrency:
4 changes: 2 additions & 2 deletions .github/workflows/release-whl-kernel.yml
@@ -9,7 +9,7 @@ on:
branches:
- main
paths:
- sgl-kernel/src/sgl-kernel/version.py
- sgl-kernel/python/sgl_kernel/version.py

jobs:
build-wheels:
@@ -59,7 +59,7 @@ jobs:
id: set_tag_name
run: |
if [ -z "${{ inputs.tag_name }}" ]; then
TAG_NAME="v$(cat sgl-kernel/src/sgl-kernel/version.py | cut -d'"' -f2)"
TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
echo "tag_name=$TAG_NAME" >> $GITHUB_OUTPUT
else
echo "tag_name=${{ inputs.tag_name }}" >> $GITHUB_OUTPUT
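
The tag-name step above derives the release tag from the relocated version file. As a small illustration (the version string shown is hypothetical), the cut -d'"' -f2 pipeline simply extracts whatever sits between the first pair of double quotes:

    # Hypothetical contents of sgl-kernel/python/sgl_kernel/version.py:
    #   __version__ = "0.0.4"
    TAG_NAME="v$(cat sgl-kernel/python/sgl_kernel/version.py | cut -d'"' -f2)"
    echo "$TAG_NAME"   # -> v0.0.4 under the assumed file contents
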
3 changes: 0 additions & 3 deletions .gitmodules
@@ -7,6 +7,3 @@
[submodule "sgl-kernel/3rdparty/flashinfer"]
path = sgl-kernel/3rdparty/flashinfer
url = https://github.com/flashinfer-ai/flashinfer.git
[submodule "sgl-kernel/3rdparty/turbomind"]
path = sgl-kernel/3rdparty/turbomind
url = https://github.com/InternLM/turbomind
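
Because the turbomind submodule entry is dropped here, an already-initialized checkout may keep a stale working copy after pulling this change. A minimal cleanup sketch, assuming the submodule had previously been initialized locally:

    # Deregister the removed submodule and delete its leftover directories (sketch).
    git submodule deinit -f sgl-kernel/3rdparty/turbomind || true
    rm -rf sgl-kernel/3rdparty/turbomind
    rm -rf .git/modules/sgl-kernel/3rdparty/turbomind
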
39 changes: 7 additions & 32 deletions docker/Dockerfile
@@ -30,44 +30,19 @@ ARG CUDA_VERSION
RUN python3 -m pip install --upgrade pip setuptools wheel html5lib six \
&& git clone --depth=1 https://github.com/sgl-project/sglang.git \
&& if [ "$CUDA_VERSION" = "12.1.1" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu121; \
export CUINDEX=121; \
elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
export CUINDEX=124; \
elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu124; \
export CUINDEX=124; \
elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
python3 -m pip install torch --index-url https://download.pytorch.org/whl/cu118; \
python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
export CUINDEX=118; \
python3 -m pip install --no-cache-dir sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
else \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
fi \
&& python3 -m pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu${CUINDEX} \
&& cd sglang \
&& if [ "$BUILD_TYPE" = "srt" ]; then \
if [ "$CUDA_VERSION" = "12.1.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
python3 -m pip --no-cache-dir install -e "python[srt]" --find-links https://flashinfer.ai/whl/cu118/torch2.5/flashinfer-python; \
python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
else \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
fi; \
else \
if [ "$CUDA_VERSION" = "12.1.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu121/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.4.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "12.5.1" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu124/torch2.5/flashinfer-python; \
elif [ "$CUDA_VERSION" = "11.8.0" ]; then \
python3 -m pip --no-cache-dir install -e "python[all]" --find-links https://flashinfer.ai/whl/cu118/torch2.5/flashinfer-python; \
python3 -m pip install sgl-kernel -i https://docs.sglang.ai/whl/cu118; \
else \
echo "Unsupported CUDA version: $CUDA_VERSION" && exit 1; \
fi; \
fi
&& python3 -m pip --no-cache-dir install -e "python[${BUILD_TYPE}]" --find-links https://flashinfer.ai/whl/cu${CUINDEX}/torch2.5/flashinfer-python

ENV DEBIAN_FRONTEND=interactive
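
The refactor collapses the per-CUDA-version branches into a single CUINDEX variable, so the torch index, the cu118 sgl-kernel wheel, and the flashinfer wheel index are each selected once. A hedged build sketch; BUILD_TYPE is assumed to be an existing ARG with values such as srt or all (matching the python[${BUILD_TYPE}] extra above), and the repository root is assumed as the build context:

    # Build the image for one of the CUDA versions handled by the if/elif chain above.
    docker build \
      --build-arg CUDA_VERSION=12.4.1 \
      --build-arg BUILD_TYPE=srt \
      -t sglang:dev -f docker/Dockerfile .
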
104 changes: 104 additions & 0 deletions docker/k8s-sglang-distributed-sts.yaml
@@ -0,0 +1,104 @@
# Two Nodes Sglang example

apiVersion: apps/v1
kind: StatefulSet
metadata:
name: distributed-sglang
spec:
replicas: 2 # number of nodes/pods to run distributed sglang
selector:
matchLabels:
app: distributed-sglang
serviceName: ""
template:
metadata:
labels:
app: distributed-sglang
spec:
containers:
- name: sglang-container
image: docker.io/lmsysorg/sglang:latest
imagePullPolicy: Always # image may be replaced by official CI versioned image
command:
- /bin/bash
- -c
# please modify the sglang serving arguments below, as necessary.
# NOTE: the --expert-parallel-size and --enable-ep-moe are for MoE model like DeepSeek-R1
args:
- |
python3 -m sglang.launch_server \
--model /llm-folder \
--dist-init-addr sglang-master-pod:5000 \
--tensor-parallel-size 16 \
--nnodes 2 \
--node-rank $POD_INDEX \
--trust-remote-code \
--host 0.0.0.0 \
--port 8000 \
--enable-metrics \
--enable-ep-moe \
--expert-parallel-size 16
env:
- name: POD_INDEX # reflects the node-rank
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.labels['apps.kubernetes.io/pod-index']
- name: NCCL_DEBUG
value: INFO
resources:
limits:
nvidia.com/gpu: "8"
requests:
volumeMounts:
- mountPath: /dev/shm
name: dshm
- mountPath: /llm-folder
name: llm
securityContext:
privileged: true # to leverage RDMA/InfiniBand device, co-work with HostNetwork=true
hostNetwork: true
volumes:
- emptyDir:
medium: Memory
sizeLimit: 10Gi
name: dshm
- hostPath:
path: /llm-folder # replace with PVC or hostPath with your model weights
type: DirectoryOrCreate
name: llm
#- persistentVolumeClaim:
# claimName: llm-pvc
# name: llm
---
apiVersion: v1
kind: Service
metadata:
name: sglang-master-pod
spec:
type: ClusterIP
selector:
app: distributed-sglang
apps.kubernetes.io/pod-index: "0"
ports:
- name: dist-port
port: 5000
targetPort: 5000
---
# the serving service
apiVersion: v1
kind: Service
metadata:
name: sglang-serving-on-master
spec:
type: NodePort
selector:
app: distributed-sglang
apps.kubernetes.io/pod-index: "0"
ports:
- name: serving
port: 8000
targetPort: 8000
- name: metrics
port: 8080
targetPort: 8080
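
A short usage sketch for the manifest above, assuming a cluster with two 8-GPU nodes and the model weights available at the hostPath (or PVC) it mounts:

    # Create the StatefulSet plus the rendezvous and serving Services.
    kubectl apply -f docker/k8s-sglang-distributed-sts.yaml
    # Each pod's rank comes from the pod-index label surfaced as POD_INDEX.
    kubectl get pods -l app=distributed-sglang -o wide
    # Once both pods are ready, the NodePort service exposes the server on port 8000.
    kubectl get svc sglang-serving-on-master
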
12 changes: 11 additions & 1 deletion docs/backend/separate_reasoning.ipynb
@@ -11,7 +11,8 @@
"## Supported Models\n",
"\n",
"Currently, SGLang supports the following reasoning models:\n",
"- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags."
"- [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d): The reasoning content is wrapped with `<think>` and `</think>` tags.\n",
"- [QwQ](https://huggingface.co/Qwen/QwQ-32B): The reasoning content is wrapped with `<think>` and `</think>` tags."
]
},
{
@@ -55,6 +56,15 @@
"wait_for_server(f\"http://localhost:{port}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `--reasoning-parser` defines the parser used to interpret responses. Currently supported parsers include:\n",
"\n",
"- deepseek-r1: DeepSeek R1 series and QwQ (e.g. deepseek-ai/DeepSeek-R1, Qwen/QwQ-32B)."
]
},
{
"cell_type": "markdown",
"metadata": {},
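
To ground the new note about --reasoning-parser, here is a minimal launch sketch; the model path is only an example, and the behavior (splitting <think>-wrapped content into a separate reasoning field) follows the surrounding docs rather than anything verified here:

    # Launch with the DeepSeek-R1-style parser (per the note above, it also covers QwQ).
    python3 -m sglang.launch_server \
      --model-path deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \
      --reasoning-parser deepseek-r1 \
      --port 30000
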
2 changes: 1 addition & 1 deletion docs/references/deepseek.md
@@ -84,7 +84,7 @@ Please refer to [the example](https://github.com/sgl-project/sglang/tree/main/be

- **Weight Absorption**: By applying the associative law of matrix multiplication to reorder computation steps, this method balances computation and memory access and improves efficiency in the decoding phase.

- **Flashinfer MLA Wrapper**: By providing `--enable-flashinfer-mla` argument, the server will use MLA kernels customized by Flashinfer. More details can be referred to [this document](https://docs.flashinfer.ai/api/mla.html). Under long input scenarios, flashinfer mla can improve performance significantly. Optimized triton kernels will be used when flashinfer mla is turned off.
- **Flashinfer MLA Wrapper**: By providing `--enable-flashinfer-mla` argument, the server will use MLA kernels customized by Flashinfer. More details can be referred to [this document](https://docs.flashinfer.ai/api/mla.html). Under long input scenarios, flashinfer mla can improve performance significantly. Optimized triton kernels will be used when flashinfer mla is turned off. Currently when using flashinfer mla wrapper and speculative decoding together, the `speculative_eagle_topk` parameter should be set to 1.

- **FP8 Quantization**: W8A8 FP8 and KV Cache FP8 quantization enables efficient FP8 inference. Additionally, we have implemented Batched Matrix Multiplication (BMM) operator to facilitate FP8 inference in MLA with weight absorption.

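
A hedged launch sketch for the Flashinfer MLA note above; the speculative-decoding flag spellings are assumptions inferred from the speculative_eagle_topk parameter mentioned in the text, not verified CLI names, and sizing flags are illustrative only:

    # Use Flashinfer MLA kernels; per the updated note, when combined with EAGLE
    # speculative decoding the eagle top-k should be set to 1.
    python3 -m sglang.launch_server \
      --model-path deepseek-ai/DeepSeek-V3 \
      --enable-flashinfer-mla \
      --speculative-algorithm EAGLE \
      --speculative-eagle-topk 1 \
      --trust-remote-code
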
1 change: 1 addition & 0 deletions docs/references/general.rst
@@ -11,3 +11,4 @@ General Guidance
faq.md
learn_more.md
modelscope.md
production_metrics.md
16 changes: 15 additions & 1 deletion docs/start/install.md
@@ -98,7 +98,21 @@ drun v0.4.3.post4-rocm630 python3 -m sglang.bench_one_batch --batch-size 32 --in
2. Execute the command `docker compose up -d` in your terminal.
</details>

## Method 5: Run on Kubernetes or Clouds with SkyPilot
## Method 5: Using Kubernetes

<details>
<summary>More</summary>

1. Option 1: For single node serving (typically when the model size fits into GPUs on one node)
Execute command `kubectl apply -f docker/k8s-sglang-service.yaml`, to create k8s deployment and service, with llama-31-8b as example.

2. Option 2: For multi-node serving (usually when a large model requires more than one GPU node, such as `DeepSeek-R1`)
Modify the LLM model path and arguments as necessary, then execute command `kubectl apply -f docker/k8s-sglang-distributed-sts.yaml`, to create two nodes k8s statefulset and serving service.
</details>



## Method 6: Run on Kubernetes or Clouds with SkyPilot

<details>
<summary>More</summary>
10 changes: 5 additions & 5 deletions python/pyproject.toml
@@ -18,12 +18,15 @@ dependencies = ["requests", "tqdm", "numpy", "IPython", "setproctitle"]
[project.optional-dependencies]
runtime_common = [
"aiohttp",
"datasets",
"decord",
"fastapi",
"hf_transfer",
"huggingface_hub",
"interegular",
"llguidance>=0.6.15",
"modelscope",
"ninja",
"orjson",
"packaging",
"pillow",
@@ -33,18 +36,15 @@ runtime_common = [
"python-multipart",
"pyzmq>=25.1.2",
"torchao>=0.7.0",
"transformers @ git+https://github.com/huggingface/[email protected]",
"uvicorn",
"uvloop",
"xgrammar==0.1.14",
"ninja",
"transformers @ git+https://github.com/huggingface/transformers.git@84f0186",
"llguidance>=0.6.15",
"datasets"
]

srt = [
"sglang[runtime_common]",
"sgl-kernel==0.0.3.post6",
"sgl-kernel==0.0.4",
"flashinfer_python==0.2.2.post1",
"torch==2.5.1",
"vllm>=0.6.4.post1,<=0.7.2",
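
Since the srt extra now pins sgl-kernel==0.0.4 and a newer transformers commit, an existing development environment can be brought in line with something like the following (a sketch; the editable path assumes the repository root as the working directory):

    # Reinstall the runtime extras so the new pins (sgl-kernel 0.0.4, transformers@84f0186) take effect.
    pip3 install --upgrade -e "python[srt]"
    pip3 list | grep -E "sgl-kernel|transformers"
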
