vllm-project · hmellor · Feb 27, 2025 · Feb 27, 2025 · Feb 27, 2025 · Feb 27, 2025
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
@@ -6,7 +6,7 @@
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import List, Optional, Union
+from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants
@@ -39,8 +39,8 @@ class RequestFuncOutput:
     latency: float = 0.0
     output_tokens: int = 0
     ttft: float = 0.0  # Time to first token
-    itl: List[float] = field(
-        default_factory=list)  # List of inter-token latencies
+    itl: list[float] = field(
+        default_factory=list)  # list of inter-token latencies
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""

diff --git a/benchmarks/benchmark_guided.py b/benchmarks/benchmark_guided.py
@@ -6,7 +6,6 @@
 import os
 import random
 import time
-from typing import List
 
 import datasets
 import pandas as pd
@@ -39,7 +38,7 @@ class SampleRequest:
     completion: str = None
 
 
-def run_vllm(requests: List[SampleRequest],
+def run_vllm(requests: list[SampleRequest],
              engine_args: EngineArgs,
              n: int,
              guided_decoding_rate: float = 1.0,
@@ -54,8 +53,8 @@ def run_vllm(requests: List[SampleRequest],
             " prompt_len and expected_output_len for all requests.")
 
     # Add the requests to the engine.
-    prompts: List[str] = []
-    sampling_params: List[SamplingParams] = []
+    prompts: list[str] = []
+    sampling_params: list[SamplingParams] = []
     # create a list containing random selected true or false
     guided_decoding_req_idx = random.sample(
         range(len(requests)), int(len(requests) * guided_decoding_rate))
@@ -110,7 +109,7 @@ def run_vllm(requests: List[SampleRequest],
 
 
 async def run_vllm_async(
-        requests: List[SampleRequest],
+        requests: list[SampleRequest],
         engine_args: AsyncEngineArgs,
         n: int,
         guided_decoding_rate: float = 1.0,
@@ -129,8 +128,8 @@ async def run_vllm_async(
                 " prompt_len and expected_output_len for all requests.")
 
         # Add the requests to the engine.
-        prompts: List[str] = []
-        sampling_params: List[SamplingParams] = []
+        prompts: list[str] = []
+        sampling_params: list[SamplingParams] = []
         guided_decoding_req_idx = random.sample(
             range(len(requests)), int(len(requests) * guided_decoding_rate))
 
@@ -203,7 +202,7 @@ async def run_vllm_async(
 
 
 def sample_requests(tokenizer: PreTrainedTokenizerBase,
-                    args: argparse.Namespace) -> List[SampleRequest]:
+                    args: argparse.Namespace) -> list[SampleRequest]:
     if args.dataset == 'json':
         if args.json_schema_path is None:
             dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -287,7 +286,7 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
 
     elif args.dataset == "xgrammar_bench":
         args.warmup = False
-        requests: List[SampleRequest] = []
+        requests: list[SampleRequest] = []
         dataset = datasets.load_dataset("NousResearch/json-mode-eval",
                                         split="train")
         print(f"dataset has {len(dataset)} entries")

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
@@ -7,7 +7,7 @@
 import os
 import time
 from pathlib import Path
-from typing import Any, Dict, List, Optional
+from typing import Any, Optional
 
 import numpy as np
 import torch
@@ -22,7 +22,7 @@
 
 
 def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: Dict[str, Any]) -> None:
+                                     results: dict[str, Any]) -> None:
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={"latency": results["latencies"]},
@@ -57,7 +57,7 @@ def main(args: argparse.Namespace):
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_prompts: list[PromptType] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 

diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
@@ -31,7 +31,7 @@
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 
@@ -77,9 +77,9 @@ def sample_requests_from_dataset(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    input_length_range: Tuple[int, int],
+    input_length_range: tuple[int, int],
     fixed_output_len: Optional[int],
-) -> List[Request]:
+) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -99,7 +99,7 @@ def sample_requests_from_dataset(
     assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
 
     # Filter out sequences that are too long or too short
-    filtered_requests: List[Request] = []
+    filtered_requests: list[Request] = []
 
     for i in range(len(dataset)):
         if len(filtered_requests) == num_requests:
@@ -122,10 +122,10 @@ def sample_requests_from_dataset(
 def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    input_length_range: Tuple[int, int],
+    input_length_range: tuple[int, int],
     fixed_output_len: Optional[int],
     prefix_len: int,
-) -> List[Request]:
+) -> list[Request]:
 
     requests = []
     prefix_token_ids = sample_tokens(tokenizer, prefix_len)
@@ -144,9 +144,9 @@ def sample_requests_from_random(
     return requests
 
 
-def repeat_and_sort_requests(requests: List[Request],
+def repeat_and_sort_requests(requests: list[Request],
                              repeat_count: int,
-                             sort: bool = False) -> List[str]:
+                             sort: bool = False) -> list[str]:
     repeated_requests = requests * repeat_count
     if sort:
         repeated_requests.sort(key=lambda x: x[1])

diff --git a/benchmarks/benchmark_prioritization.py b/benchmarks/benchmark_prioritization.py
@@ -5,7 +5,7 @@
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
@@ -23,7 +23,7 @@ def sample_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> list[tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
 
@@ -40,7 +40,7 @@ def sample_requests(
     random.shuffle(dataset)
 
     # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_dataset: list[tuple[str, int, int]] = []
     for i in range(len(dataset)):
         if len(filtered_dataset) == num_requests:
             break
@@ -68,7 +68,7 @@ def sample_requests(
 
 
 def run_vllm(
-    requests: List[Tuple[str, int, int]],
+    requests: list[tuple[str, int, int]],
     n: int,
     engine_args: EngineArgs,
 ) -> float: