expand benchmarking to H100s (#371)

SUMMARY: * updated "build test" to accept an array of benchmarking labels * updated the "remote push" and "nightly" workflows to include benchmarking on H100s * adjusted the docker job to have the same criteria as the upload job. Did this because the upload could fail for auth reasons, and that shouldn't stop us from pushing docker. TEST PLAN: runs on remote push --------- Co-authored-by: andy-neuma <[email protected]>
neuralmagic · Jul 9, 2024 · f43762f · f43762f · github-actions · Jul 10, 2024
1 parent 8772fb3
commit f43762f
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 6 deletions.
diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml
@@ -58,10 +58,10 @@ on:
         required: true
 
       # benchmark related parameters
-      benchmark_label:
-        description: "requested benchmark label (specifies instance)"
+      benchmark_labels:
+        description: "stringified Json array of benchmark labels"
         type: string
-        default: ""
+        required: true
       benchmark_config_list_file:
         description: "benchmark configs file, e.g. 'nm_benchmark_nightly_configs_list.txt'"
         type: string
@@ -136,9 +136,12 @@ jobs:
     BENCHMARK:
         needs: [BUILD]
         if: success()
+        strategy:
+            matrix:
+                benchmark_label: ${{ fromJson(inputs.benchmark_labels) }}
         uses: ./.github/workflows/nm-benchmark.yml
         with:
-            label: ${{ inputs.benchmark_label }}
+            label: ${{ matrix.benchmark_label }}
             benchmark_config_list_file: ${{ inputs.benchmark_config_list_file }}
             timeout: ${{ inputs.benchmark_timeout }}
             gitref: ${{ github.ref }}

diff --git a/.github/workflows/nm-nightly.yml b/.github/workflows/nm-nightly.yml
@@ -39,7 +39,7 @@ jobs:
                             {"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/full.txt"}]'
             test_timeout: 480
 
-            benchmark_label: gcp-k8s-l4-solo
+            benchmark_labels: '["gcp-k8s-l4-solo", "k8s-h100-solo"]'
             benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
             benchmark_timeout: 480
             push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"

diff --git a/.github/workflows/nm-remote-push.yml b/.github/workflows/nm-remote-push.yml
@@ -25,7 +25,7 @@ jobs:
                             {"python":"3.11.4","label":"gcp-k8s-l4-solo","test":"neuralmagic/tests/test_skip_env_vars/smoke.txt"}]'
             test_timeout: 480
 
-            benchmark_label: gcp-k8s-l4-solo
+            benchmark_labels: '["gcp-k8s-l4-solo", "k8s-h100-solo"]'
             benchmark_config_list_file: ./.github/data/nm_benchmark_base_config_list.txt
             benchmark_timeout: 480
Benchmark suite	Current: `f43762f`	Previous: `537957c`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`43.03678457159549` ms
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`7.407163346345588` ms
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`33.11668229910235` ms
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA H100 80GB HBM3 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`11.089211572268924` ms
Benchmark suite	Current: `f43762f`	Previous: `537957c`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`24.985181336936268` ms	`23.563603496677388` ms	`1.06`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`6.125900855552969` ms	`5.977048247888172` ms	`1.02`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`183.55203526016945` ms	`186.92139306662284` ms	`0.98`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`84.68003539366877` ms	`83.59149550139291` ms	`1.01`