From ad55c7f766b4f5133894e5418670069b3a1246d9 Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Wed, 12 Feb 2025 02:54:04 +0000 Subject: [PATCH 1/2] Update profiling benchmarks --- benchmarks/profiling/benchmark_latency.py | 37 ++- benchmarks/profiling/benchmark_throughput.py | 295 +++++++++++++++---- 2 files changed, 264 insertions(+), 68 deletions(-) diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py index 34b157eb6ab6f..ab1f7dff12cc4 100644 --- a/benchmarks/profiling/benchmark_latency.py +++ b/benchmarks/profiling/benchmark_latency.py @@ -16,6 +16,7 @@ from vllm import LLM, SamplingParams from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams from vllm.utils import FlexibleArgumentParser @@ -87,37 +88,47 @@ def get_profiling_context(profile_result_dir: Optional[str] = None): dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len)) - dummy_inputs: List[PromptType] = [{ + dummy_prompts: List[PromptType] = [{ "prompt_token_ids": batch } for batch in dummy_prompt_token_ids.tolist()] - def run_to_completion(profile_result_dir: Optional[str] = None): - if profile_result_dir: - with get_profiling_context(profile_result_dir): - llm.generate(dummy_inputs, - sampling_params=sampling_params, - use_tqdm=False) - else: - start_time = time.perf_counter() - llm.generate(dummy_inputs, + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + )) + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with get_profiling_context(profile_dir): + llm_generate() + else: + start_time = time.perf_counter() + llm_generate() end_time = time.perf_counter() latency = end_time - start_time return latency print("Warming up...") for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): - run_to_completion(profile_result_dir=None) + run_to_completion(profile_dir=None) if args.profile_torch or args.profile_rpd: - run_to_completion(profile_result_dir=profile_result_dir) + run_to_completion(profile_dir=profile_result_dir) return # Benchmark. 
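For reference, a minimal standalone sketch of the generate vs. beam-search dispatch the latency benchmark now performs; the model name, batch shape, and token counts below are illustrative placeholders, not values taken from this patch.

import numpy as np

from vllm import LLM, SamplingParams
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="facebook/opt-125m")  # assumed small model, for illustration only
dummy_prompts = [{"prompt_token_ids": row.tolist()}
                 for row in np.random.randint(10000, size=(8, 32))]

use_beam_search = False
if not use_beam_search:
    # Sampled decoding path, mirroring llm_generate() above.
    llm.generate(dummy_prompts,
                 sampling_params=SamplingParams(max_tokens=128,
                                                ignore_eos=True),
                 use_tqdm=False)
else:
    # Beam-search path, mirroring the new llm.beam_search() call.
    llm.beam_search(
        dummy_prompts,
        BeamSearchParams(beam_width=4, max_tokens=128, ignore_eos=True))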
latencies = [] for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): - latencies.append(run_to_completion(profile_result_dir=None)) + latencies.append(run_to_completion(profile_dir=None)) latencies = np.array(latencies) percentages = [10, 25, 50, 75, 90, 99] percentiles = np.percentile(latencies, percentages) diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py index dbf689de95257..51dbedf2e00f4 100644 --- a/benchmarks/profiling/benchmark_throughput.py +++ b/benchmarks/profiling/benchmark_throughput.py @@ -6,12 +6,15 @@ import os import random import time +from functools import cache +from typing import Dict, List, Optional, Tuple from contextlib import contextmanager, nullcontext from pathlib import Path -from typing import List, Optional, Tuple + import torch import uvloop +from PIL import Image from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) @@ -19,41 +22,131 @@ from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.entrypoints.openai.api_server import ( build_async_engine_client_from_engine_args) +from vllm.inputs import TextPrompt +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.sampling_params import BeamSearchParams +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.utils import FlexibleArgumentParser, merge_async_iterators -def sample_requests( - dataset_path: str, - num_requests: int, - tokenizer: PreTrainedTokenizerBase, - fixed_output_len: Optional[int], -) -> List[Tuple[str, int, int]]: +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + lora_request: Optional LoRARequest specifying the LoRA to use. + """ + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + lora_request: Optional[LoRARequest] = None + + +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. 
+ + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +lora_tokenizer_cache: Dict[int, AnyTokenizer] = {} + + +def get_random_lora_request( + args: argparse.Namespace +) -> Tuple[LoRARequest, Optional[AnyTokenizer]]: + global lora_tokenizer_cache + lora_id = random.randint(1, args.max_loras) + lora_request = LoRARequest(lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(args.lora_path)) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + return lora_request, lora_tokenizer_cache[lora_id] + + +def sample_requests(tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace) -> List[SampleRequest]: + + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") - + # Load the dataset. with open(dataset_path) as f: dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] - # Only keep the first two turns of each conversation. - dataset = [(data["conversations"][0]["value"], - data["conversations"][1]["value"]) for data in dataset] - # Shuffle the dataset. random.shuffle(dataset) # Filter out sequences that are too long or too short - filtered_dataset: List[Tuple[str, int, int]] = [] - for i in range(len(dataset)): + filtered_dataset: List[SampleRequest] = [] + for data in tqdm(dataset, + total=len(filtered_dataset), + desc="sampling requests"): if len(filtered_dataset) == num_requests: break + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. + assert isinstance(image_path, + str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert( + "RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + # Tokenize the prompts and completions. 
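To make the sampling path concrete, this is roughly what a single ShareGPT-style entry that carries an image turns into; the file path, question text, model name, and token counts are made-up illustrative values, and the helpers are the ones defined above.

data = {
    "conversations": [{"value": "What is shown in this image?"},
                      {"value": "A cat sitting on a laptop keyboard."}],
    "image": "/data/sharegpt4v/images/0001.jpg",  # hypothetical path
}
question = data["conversations"][0]["value"]
prompt = _get_prompt_for_image_model(question=question,
                                     model="mistralai/Pixtral-12B-2409")
image = Image.open(data["image"]).convert("RGB")
request = SampleRequest(
    prompt=prompt,
    prompt_len=42,  # illustrative; really len(tokenizer(prompt).input_ids)
    expected_output_len=128,
    multi_modal_data={"image": image})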
- prompt = dataset[i][0] - prompt_token_ids = tokenizer(prompt).input_ids - completion = dataset[i][1] - completion_token_ids = tokenizer(completion).input_ids + prompt_token_ids = request_tokenizer(prompt).input_ids + completion_token_ids = request_tokenizer(completion).input_ids prompt_len = len(prompt_token_ids) output_len = len(completion_token_ids ) if fixed_output_len is None else fixed_output_len @@ -63,13 +156,18 @@ def sample_requests( if prompt_len > 1024 or prompt_len + output_len > 2048: # Prune too long sequences. continue - filtered_dataset.append((prompt, prompt_len, output_len)) + filtered_dataset.append( + SampleRequest(prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=multi_modal_data, + lora_request=lora_request)) return filtered_dataset def run_vllm( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: EngineArgs, ) -> float: @@ -125,32 +223,56 @@ def get_profiling_context(profile_dir: Optional[str] = None): llm = LLM(**dataclasses.asdict(engine_args)) # Add the requests to the engine. - prompts: List[str] = [] + prompts: List[TextPrompt] = [] sampling_params: List[SamplingParams] = [] - for prompt, _, output_len in requests: - prompts.append(prompt) + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, temperature=1.0, top_p=1.0, ignore_eos=True, - max_tokens=output_len, + max_tokens=request.expected_output_len, )) + lora_requests: Optional[List[LoRARequest]] = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] + + use_beam_search = False + + if not use_beam_search: + execute = lambda: llm.generate(prompts, sampling_params, lora_request=lora_requests, use_tqdm=True) + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] + # output_len should be the same for all requests. + output_len = requests[0][2] + for request in requests: + assert request.expected_output_len == output_len + execute = lambda: llm.beam_search( + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + )) if args.profile_torch or args.profile_rpd: with get_profiling_context(profile_dir): - llm.generate(prompts, sampling_params, use_tqdm=True) + execute() return else: start = time.perf_counter() - llm.generate(prompts, sampling_params, use_tqdm=True) + execute() end = time.perf_counter() - return end - start + return end - start async def run_vllm_async( - requests: List[Tuple[str, int, int]], + requests: List[SampleRequest], n: int, engine_args: AsyncEngineArgs, disable_frontend_multiprocessing: bool = False, @@ -161,23 +283,31 @@ async def run_vllm_async( engine_args, disable_frontend_multiprocessing) as llm: # Add the requests to the engine. 
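A condensed sketch of the two execution paths run_vllm sets up, assuming the SampleRequest list defined earlier and only the APIs already imported in this file.

def build_execute(llm, requests, n, use_beam_search=False):
    """Return a zero-argument callable that runs the whole batch once."""
    if not use_beam_search:
        prompts = [TextPrompt(prompt=r.prompt,
                              multi_modal_data=r.multi_modal_data)
                   for r in requests]
        sampling_params = [SamplingParams(n=n, temperature=1.0, top_p=1.0,
                                          ignore_eos=True,
                                          max_tokens=r.expected_output_len)
                           for r in requests]
        return lambda: llm.generate(prompts, sampling_params, use_tqdm=True)
    # Beam search takes plain prompts and one shared output length, read from
    # the dataclass field since requests are SampleRequest objects.
    output_len = requests[0].expected_output_len
    assert all(r.expected_output_len == output_len for r in requests)
    return lambda: llm.beam_search(
        [r.prompt for r in requests],
        BeamSearchParams(beam_width=n, max_tokens=output_len, ignore_eos=True))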
-    prompts: List[str] = []
+    prompts: List[TextPrompt] = []
     sampling_params: List[SamplingParams] = []
-    for prompt, _, output_len in requests:
-        prompts.append(prompt)
+    lora_requests: List[Optional[LoRARequest]] = []
+    for request in requests:
+        prompts.append(
+            TextPrompt(prompt=request.prompt,
+                       multi_modal_data=request.multi_modal_data))
         sampling_params.append(
             SamplingParams(
                 n=n,
                 temperature=1.0,
                 top_p=1.0,
                 ignore_eos=True,
-                max_tokens=output_len,
+                max_tokens=request.expected_output_len,
             ))
+        lora_requests.append(request.lora_request)
 
     generators = []
     start = time.perf_counter()
-    for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
-        generator = llm.generate(prompt, sp, request_id=f"test{i}")
+    for i, (prompt, sp,
+            lr) in enumerate(zip(prompts, sampling_params, lora_requests)):
+        generator = llm.generate(prompt,
+                                 sp,
+                                 lora_request=lr,
+                                 request_id=f"test{i}")
         generators.append(generator)
     all_gens = merge_async_iterators(*generators)
     async for i, res in all_gens:
@@ -187,15 +317,13 @@ def run_hf(
 
 
 def run_hf(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
     model: str,
     tokenizer: PreTrainedTokenizerBase,
     n: int,
-    use_beam_search: bool,
     max_batch_size: int,
     trust_remote_code: bool,
 ) -> float:
-    assert not use_beam_search
     llm = AutoModelForCausalLM.from_pretrained(
         model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
@@ -227,7 +355,7 @@ def run_hf(
                 padding=True).input_ids
             llm_outputs = llm.generate(
                 input_ids=input_ids.cuda(),
-                do_sample=not use_beam_search,
+                do_sample=True,
                 num_return_sequences=n,
                 temperature=1.0,
                 top_p=1.0,
@@ -247,14 +375,14 @@ def run_hf(
 
 
 def run_mii(
-    requests: List[Tuple[str, int, int]],
+    requests: List[SampleRequest],
     model: str,
     tensor_parallel_size: int,
     output_len: int,
 ) -> float:
     from mii import client, serve
     llm = serve(model, tensor_parallel=tensor_parallel_size)
-    prompts = [prompt for prompt, _, _ in requests]
+    prompts = [request.prompt for request in requests]
 
     start = time.perf_counter()
     llm.generate(prompts, max_new_tokens=output_len)
@@ -272,15 +400,51 @@ def main(args: argparse.Namespace):
     tokenizer = AutoTokenizer.from_pretrained(
         args.tokenizer, trust_remote_code=args.trust_remote_code)
     if args.dataset is None:
-        # Synthesize a prompt with the given input length.
-        prompt = { "prompt_token_ids" : [42] * (args.input_len - 1) } \
-            if args.skip_tokenizer_init else "hi" * (args.input_len - 1)
-        requests = [(prompt, args.input_len, args.output_len)
-                    for _ in range(args.num_prompts)]
+        vocab_size = tokenizer.vocab_size
+        requests = []
+        for _ in range(args.num_prompts):
+
+            request_tokenizer = tokenizer
+            lora_request: Optional[LoRARequest] = None
+            if args.enable_lora:
+                lora_request, lora_tokenizer = get_random_lora_request(args)
+                if lora_tokenizer:
+                    request_tokenizer = lora_tokenizer
+
+            # Synthesize a prompt with the given input length.
+            candidate_ids = [
+                random.randint(0, vocab_size - 1)
+                for _ in range(args.input_len)
+            ]
+            # As tokenizer may add additional tokens like BOS, we need to try
+            # different lengths to get the desired input length.
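The async path fans out one generator per request and drains them through merge_async_iterators; a minimal sketch of that pattern, with engine construction elided (llm stands for the client returned by build_async_engine_client_from_engine_args).

async def drain_all(llm, requests, n):
    generators = []
    for i, request in enumerate(requests):
        sp = SamplingParams(n=n, temperature=1.0, top_p=1.0, ignore_eos=True,
                            max_tokens=request.expected_output_len)
        prompt = TextPrompt(prompt=request.prompt,
                            multi_modal_data=request.multi_modal_data)
        generators.append(
            llm.generate(prompt, sp, lora_request=request.lora_request,
                         request_id=f"test{i}"))
    # Consume results as they finish; only wall-clock time is measured.
    async for _i, _res in merge_async_iterators(*generators):
        pass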
+ for _ in range(5): # Max attempts to correct + candidate_prompt = request_tokenizer.decode(candidate_ids) + tokenized_len = len(request_tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend([ + random.randint(100, vocab_size - 100) + for _ in range(diff) + ]) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest(prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len, + lora_request=lora_request)) else: - requests = sample_requests(args.dataset, args.num_prompts, tokenizer, - args.output_len) + requests = sample_requests(tokenizer, args) + is_multi_modal = any(request.multi_modal_data is not None + for request in requests) + if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -296,22 +460,29 @@ def main(args: argparse.Namespace): elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, - args.use_beam_search, args.hf_max_batch_size, - args.trust_remote_code) + args.hf_max_batch_size, args.trust_remote_code) elif args.backend == "mii": elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size, args.output_len) else: raise ValueError(f"Unknown backend: {args.backend}") - total_num_tokens = sum(prompt_len + output_len - for _, prompt_len, output_len in requests) - + total_num_tokens = sum(request.prompt_len + request.expected_output_len + for request in requests) + total_output_tokens = sum(request.expected_output_len + for request in requests) + if args.profile_torch or args.profile_rpd: # Profiling complete pass else: + if is_multi_modal: + print("\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are not" + " counted. See vllm-project/vllm/issues/9778 for details.") + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " - f"{total_num_tokens / elapsed_time:.2f} tokens/s") + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s") # Output JSON results if specified if args.output_json: @@ -370,6 +541,13 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable decoupled async engine frontend.") + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the lora adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.") parser.add_argument( '--profile-torch', action='store_true', @@ -384,6 +562,7 @@ def main(args: argparse.Namespace): default=None, help=('path to save the profiler output. 
Can be visualized ' 'with ui.perfetto.dev or Tensorboard.')) + parser = AsyncEngineArgs.add_cli_args(parser) args = parser.parse_args() if args.tokenizer is None: @@ -393,6 +572,8 @@ def main(args: argparse.Namespace): assert args.output_len is not None else: assert args.input_len is None + if args.enable_lora: + assert args.lora_path is not None if args.backend == "vllm": if args.hf_max_batch_size is not None: @@ -402,13 +583,14 @@ def main(args: argparse.Namespace): raise ValueError("HF max batch size is required for HF backend.") if args.quantization is not None: raise ValueError("Quantization is only for vLLM backend.") + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM" + " backend") elif args.backend == "mii": if args.dtype != "auto": raise ValueError("dtype must be auto for MII backend.") if args.n != 1: raise ValueError("n must be 1 for MII backend.") - if args.use_beam_search: - raise ValueError("Beam search is not supported for MII backend.") if args.quantization is not None: raise ValueError("Quantization is only for vLLM backend.") if args.hf_max_batch_size is not None: @@ -416,4 +598,7 @@ def main(args: argparse.Namespace): if args.tokenizer != args.model: raise ValueError("Tokenizer must be the same as the model for MII " "backend.") + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM" + " backend") main(args) From 1228eb0bbc357cf7c3cfd3a5053de66f43a8451d Mon Sep 17 00:00:00 2001 From: AdrianAbeyta Date: Wed, 12 Feb 2025 16:59:12 +0000 Subject: [PATCH 2/2] Fix linter errors --- benchmarks/profiling/benchmark_throughput.py | 129 ++++++++++--------- 1 file changed, 66 insertions(+), 63 deletions(-) diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py index 51dbedf2e00f4..6de6132427515 100644 --- a/benchmarks/profiling/benchmark_throughput.py +++ b/benchmarks/profiling/benchmark_throughput.py @@ -6,11 +6,10 @@ import os import random import time -from functools import cache -from typing import Dict, List, Optional, Tuple from contextlib import contextmanager, nullcontext +from functools import cache from pathlib import Path - +from typing import Dict, List, Optional, Tuple import torch import uvloop @@ -93,15 +92,15 @@ def get_random_lora_request( def sample_requests(tokenizer: PreTrainedTokenizerBase, - args: argparse.Namespace) -> List[SampleRequest]: - + args: argparse.Namespace) -> List[SampleRequest]: + dataset_path: str = args.dataset num_requests: int = args.num_prompts fixed_output_len: Optional[int] = args.output_len model: str = args.model if fixed_output_len is not None and fixed_output_len < 4: raise ValueError("output_len too small") - + # Load the dataset. 
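When --enable-lora is set, each request is paired with a random adapter id in [1, max_loras], all resolving to the same --lora-path on disk; a small usage sketch of the helpers defined above (the adapter path is a placeholder).

args = argparse.Namespace(max_loras=4,
                          lora_path="yard1/llama-2-7b-sql-lora-test")  # placeholder
lora_request, lora_tokenizer = get_random_lora_request(args)
print(lora_request.lora_int_id)  # random id in [1, args.max_loras]
print(lora_request.lora_path)    # resolved once and cached by lora_path_on_disk()
# lora_tokenizer is None when the adapter ships no tokenizer; callers fall
# back to the base model tokenizer in that case.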
with open(dataset_path) as f: dataset = json.load(f) @@ -227,8 +226,8 @@ def get_profiling_context(profile_dir: Optional[str] = None): sampling_params: List[SamplingParams] = [] for request in requests: prompts.append( - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -244,7 +243,10 @@ def get_profiling_context(profile_dir: Optional[str] = None): use_beam_search = False if not use_beam_search: - execute = lambda: llm.generate(prompts, sampling_params, lora_request=lora_requests, use_tqdm=True) + execute = lambda: llm.generate(prompts, + sampling_params, + lora_request=lora_requests, + use_tqdm=True) else: assert lora_requests is None, "BeamSearch API does not support LoRA" prompts = [request.prompt for request in requests] @@ -253,12 +255,12 @@ def get_profiling_context(profile_dir: Optional[str] = None): for request in requests: assert request.expected_output_len == output_len execute = lambda: llm.beam_search( - prompts, - BeamSearchParams( - beam_width=n, - max_tokens=output_len, - ignore_eos=True, - )) + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + )) if args.profile_torch or args.profile_rpd: with get_profiling_context(profile_dir): @@ -268,7 +270,7 @@ def get_profiling_context(profile_dir: Optional[str] = None): start = time.perf_counter() execute() end = time.perf_counter() - return end - start + return end - start async def run_vllm_async( @@ -288,8 +290,8 @@ async def run_vllm_async( lora_requests: List[Optional[LoRARequest]] = [] for request in requests: prompts.append( - TextPrompt(prompt=request.prompt, - multi_modal_data=request.multi_modal_data)) + TextPrompt(prompt=request.prompt, + multi_modal_data=request.multi_modal_data)) sampling_params.append( SamplingParams( n=n, @@ -304,7 +306,7 @@ async def run_vllm_async( start = time.perf_counter() for i, (prompt, sp, lr) in enumerate(zip(prompts, sampling_params, lora_requests)): - generator = llm.generate(prompt, + generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") @@ -400,51 +402,51 @@ def main(args: argparse.Namespace): tokenizer = AutoTokenizer.from_pretrained( args.tokenizer, trust_remote_code=args.trust_remote_code) if args.dataset is None: - vocab_size = tokenizer.vocab_size - requests = [] - for _ in range(args.num_prompts): - - request_tokenizer = tokenizer - lora_request: Optional[LoRARequest] = None - if args.enable_lora: - lora_request, lora_tokenizer = get_random_lora_request(args) - if lora_tokenizer: - request_tokenizer = lora_tokenizer - - # Synthesize a prompt with the given input length. - candidate_ids = [ - random.randint(0, vocab_size - 1) - for _ in range(args.input_len) - ] - # As tokenizer may add additional tokens like BOS, we need to try - # different lengths to get the desired input length. 
- for _ in range(5): # Max attempts to correct - candidate_prompt = request_tokenizer.decode(candidate_ids) - tokenized_len = len(request_tokenizer.encode(candidate_prompt)) - - if tokenized_len == args.input_len: - break - - # Adjust length based on difference - diff = args.input_len - tokenized_len - if diff > 0: - candidate_ids.extend([ - random.randint(100, vocab_size - 100) - for _ in range(diff) - ]) - else: - candidate_ids = candidate_ids[:diff] - requests.append( - SampleRequest(prompt=candidate_prompt, - prompt_len=args.input_len, - expected_output_len=args.output_len, - lora_request=lora_request)) + vocab_size = tokenizer.vocab_size + requests = [] + for _ in range(args.num_prompts): + + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + + # Synthesize a prompt with the given input length. + candidate_ids = [ + random.randint(0, vocab_size - 1) + for _ in range(args.input_len) + ] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for _ in range(5): # Max attempts to correct + candidate_prompt = request_tokenizer.decode(candidate_ids) + tokenized_len = len(request_tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend([ + random.randint(100, vocab_size - 100) + for _ in range(diff) + ]) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest(prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len, + lora_request=lora_request)) else: requests = sample_requests(tokenizer, args) is_multi_modal = any(request.multi_modal_data is not None for request in requests) - + if args.backend == "vllm": if args.async_engine: elapsed_time = uvloop.run( @@ -470,15 +472,16 @@ def main(args: argparse.Namespace): for request in requests) total_output_tokens = sum(request.expected_output_len for request in requests) - + if args.profile_torch or args.profile_rpd: # Profiling complete pass else: if is_multi_modal: - print("\033[91mWARNING\033[0m: Multi-modal request detected. The " - "following metrics are not accurate because image tokens are not" - " counted. See vllm-project/vllm/issues/9778 for details.") + print( + "\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are" + " not counted. See vllm-project/vllm/issues/9778 for details.") # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
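For reference, a worked example of the metrics reported at the end of a run, with made-up numbers: 200 requests, each with prompt_len=512 and expected_output_len=128, finishing in 40 seconds.

num_requests = 200
total_num_tokens = num_requests * (512 + 128)  # 128000 prompt+output tokens
total_output_tokens = num_requests * 128       # 25600 output tokens
elapsed_time = 40.0
print(f"Throughput: {num_requests / elapsed_time:.2f} requests/s, "
      f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
      f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
# Throughput: 5.00 requests/s, 3200.00 total tokens/s, 640.00 output tokens/s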