Llama 3.x multimodal support for evaluations and benchmarking #79

Merged (3 commits) on Jan 28, 2025
25 changes: 16 additions & 9 deletions benchmarking/README.md
@@ -1,8 +1,6 @@
# Benchmarking

# Llama 3.1 70B Instruct

### vLLM offline benchmarking
## vLLM offline benchmarking

The vLLM benchmarking script is https://github.com/tenstorrent/vllm/blob/dev/examples/offline_inference_tt.py

@@ -16,7 +14,7 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 --
python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 --perf_prompt_len 2048 --max_tokens 2048
```

#### Command Line Arguments
### Command Line Arguments

- `--prompts_json` (default: `"tt_metal/prompts.json"`):
- **Path to prompts JSON file** used for inference. Prompts should be in a list format. This will not be used if `measure_perf` is set.
@@ -36,14 +34,14 @@ python examples/offline_inference_tt.py --measure_perf --max_seqs_in_batch 32 --
- `--max_seqs_in_batch` (default: `32`):
- **Maximum batch size** for inference, determining the number of prompts processed in parallel.
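For reference, a minimal sketch combining the arguments documented above (paths and values are illustrative, and the script may require additional flags not shown in this excerpt):

```bash
# Run offline inference over a fixed prompts file; --prompts_json is only used
# when --measure_perf is NOT set, so --measure_perf is omitted here.
python examples/offline_inference_tt.py \
  --prompts_json tt_metal/prompts.json \
  --max_tokens 128 \
  --max_seqs_in_batch 32
```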

### Online Benchmarking
## Online Benchmarking

#### single user
### single user

```bash
python utils/prompt_client_cli.py \
--num_prompts 32 \
--batch_size 1 \
--max_concurrent 1 \
--tokenizer_model meta-llama/Llama-3.1-70B-Instruct \
--max_prompt_length 128 \
--input_seq_len 128 \
@@ -52,7 +50,7 @@ python utils/prompt_client_cli.py \
--dataset random
```

#### using vllm/benchmarking/benchmark_serving.py
### using vllm/benchmarking/benchmark_serving.py
Within the Docker container, use the benchmark_serving.patch file:
```
cd ~/app/src
@@ -104,10 +102,19 @@ P99 ITL (ms): 8.05
==================================================
```

#### using tt-inference-server/benchmarking/prompt_client_online_benchmark.py
### using tt-inference-server/benchmarking/prompt_client_online_benchmark.py

```bash
export PYTHONPATH=$PYTHONPATH:$PWD
python benchmarking/prompt_client_online_benchmark.py
```

# Benchmark summary

Generate a markdown table and .csv output file from multiple benchmarking runs:
```bash
# for vllm_online_benchmark.py
python benchmarking/benchmark_summary.py ~/cache_root/vllm_online_benchmark_results/results_2025-01-17_17-19-28 --output-dir ./vllm_results_summary
# or for prompt_client_online_benchmarking.py
python benchmarking/benchmark_summary.py ~/cache_root/online_benchmark_results/results_2025-01-15_20-58-57 --output-dir ./results_summary
```
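Note that with this change, result directories and file names are also suffixed with the MESH_DEVICE (N150, N300, T3K_LINE, T3K_RING, or TG), so a newer run would be summarized with a path along these lines (timestamp and device below are hypothetical):

```bash
# hypothetical results directory from a vllm_online_benchmark.py run on an N300
python benchmarking/benchmark_summary.py \
  ~/cache_root/vllm_online_benchmark_results/results_2025-01-28_12-00-00_N300 \
  --output-dir ./vllm_results_summary
```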
13 changes: 8 additions & 5 deletions benchmarking/benchmark_summary.py
@@ -47,12 +47,12 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]:
pattern = r"""
benchmark_
(?P<timestamp>\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}) # Timestamp
_isl-(?P<isl>\d+) # Input sequence length
_osl-(?P<osl>\d+) # Output sequence length
_bsz-(?P<bsz>\d+) # Batch size
_n-(?P<n>\d+) # Number of requests
(_(?P<mesh_device>N150|N300|T3K_LINE|T3K_RING|TG))? # MESH_DEVICE
_isl-(?P<isl>\d+) # Input sequence length
_osl-(?P<osl>\d+) # Output sequence length
_bsz-(?P<bsz>\d+) # Batch size
_n-(?P<n>\d+) # Number of requests
"""

match = re.search(pattern, filename, re.VERBOSE)
if not match:
raise ValueError(f"Could not extract parameters from filename: {filename}")
@@ -64,6 +64,7 @@ def extract_params_from_filename(filename: str) -> Dict[str, Any]:
# Extract and convert numeric parameters
params = {
"timestamp": timestamp,
"mesh_device": match.group("mesh_device"),
"input_sequence_length": int(match.group("isl")),
"output_sequence_length": int(match.group("osl")),
"batch_size": int(match.group("bsz")),
@@ -132,6 +133,7 @@ def process_benchmark_file(filepath: str) -> Dict[str, Any]:
"timestamp": params["timestamp"],
"model_id": data.get("model_id", ""),
"backend": data.get("backend", ""),
"mesh_device": params.get("mesh_device", ""),
"input_sequence_length": params["input_sequence_length"],
"output_sequence_length": params["output_sequence_length"],
"batch_size": params["batch_size"],
@@ -363,6 +365,7 @@ def main():
metadata = (
f"Model ID: {results[0].get('model_id')}\n"
f"Backend: {results[0].get('backend')}\n"
f"mesh_device: {results[0].get('mesh_device')}\n"
)
display_md_str = get_markdown_table(display_results, metadata=metadata)
print(display_md_str)
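For illustration, a minimal sketch of what the updated pattern extracts from a result file name; the file name below is hypothetical and uses the `_bsz-` field the pattern expects:

```python
import re

# Same verbose pattern as in extract_params_from_filename above, with the
# optional MESH_DEVICE group following the timestamp.
pattern = r"""
benchmark_
(?P<timestamp>\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2})    # Timestamp
(_(?P<mesh_device>N150|N300|T3K_LINE|T3K_RING|TG))?   # MESH_DEVICE (optional)
_isl-(?P<isl>\d+)                                     # Input sequence length
_osl-(?P<osl>\d+)                                     # Output sequence length
_bsz-(?P<bsz>\d+)                                     # Batch size
_n-(?P<n>\d+)                                         # Number of requests
"""

# Hypothetical file name from a run on an N300 device.
filename = "vllm_online_benchmark_2025-01-28_12-00-00_N300_isl-128_osl-128_bsz-32_n-512.json"
match = re.search(pattern, filename, re.VERBOSE)
if match:
    print(match.group("mesh_device"), int(match.group("isl")), int(match.group("bsz")))
    # -> N300 128 32
```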
71 changes: 40 additions & 31 deletions benchmarking/prompt_client_online_benchmark.py
@@ -15,7 +15,7 @@
from utils.prompt_configs import PromptConfig, BatchConfig, EnvironmentConfig
from utils.prompt_client import PromptClient
from utils.batch_processor import BatchProcessor
from utils.prompt_generation import generate_prompts
from utils.prompt_generation import generate_prompts, generate_images
from transformers import AutoTokenizer

logging.basicConfig(
@@ -32,35 +32,37 @@ def run_sequence_length_test(
file_prefix: str,
num_iterations: int = 1,
) -> List[dict]:
# Initialize configurations
env_config = EnvironmentConfig(vllm_model=model)
prompt_client = PromptClient(env_config)
mesh_device = env_config.mesh_device

# Create save directory
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
save_path = Path(result_dir) / f"results_{timestamp}"
save_path = Path(result_dir) / f"results_{timestamp}_{mesh_device}"
save_path.mkdir(parents=True, exist_ok=True)

# Initialize results storage
all_results = []

# Initialize configurations
env_config = EnvironmentConfig(vllm_model=model)
prompt_client = PromptClient(env_config)

# Test all combinations
total_combinations = len(combinations)
for idx, params in enumerate(combinations, 1):
input_len = params["input_len"]
output_len = params["output_len"]
batch_size = params["batch_size"]
max_concurrent = params["max_concurrent"]
num_prompts = params["num_prompts"]
images_per_prompt = params.get("images_per_prompt", 0)
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
results_file = (
save_path
/ f"{file_prefix}_{run_timestamp}_isl-{input_len}_osl-{output_len}_bsz-{batch_size}_n-{num_prompts}.json"
/ f"{file_prefix}_{run_timestamp}_{mesh_device}_isl-{input_len}_osl-{output_len}_maxcon-{max_concurrent}_n-{num_prompts}.json"
)

logger.info(
f"\nTesting combination {idx}/{total_combinations}:\n"
f"input_len={input_len}, output_len={output_len}, "
f"batch_size={batch_size}, num_prompts={num_prompts}"
f"max_concurrent={max_concurrent}, num_prompts={num_prompts}"
)

# Configure prompt generation
@@ -74,32 +76,39 @@
template=None,
save_path=None,
print_prompts=False,
include_images=images_per_prompt > 0,
images_per_prompt=images_per_prompt,
use_chat_api=images_per_prompt > 0,
)

# Generate prompts
prompts, input_seq_lengths = generate_prompts(prompt_config)
images = generate_images(prompt_config)

# Configure batch processing
output_seq_lens = [output_len] * num_prompts
batch_config = BatchConfig(
batch_size=batch_size,
max_concurrent=max_concurrent,
output_seq_lens=output_seq_lens,
num_full_iterations=num_iterations,
vary_batch_size=False,
vary_max_concurrent=False,
inter_batch_delay=0,
stream=True,
use_chat_api=images_per_prompt > 0,
)

# Initialize processor and tokenizer
batch_processor = BatchProcessor(prompt_client, batch_config)
tokenizer = AutoTokenizer.from_pretrained(model)

# pre-capture traces so benchmark does not include 1st run trace capture time
# TODO: add support for image input to capture_traces
prompt_client.capture_traces(context_lens=[(input_len, output_len)])
# Process batches
try:
responses = batch_processor.process_batch(
prompts=prompts,
images=images,
input_seq_lengths=input_seq_lengths,
tokenizer=tokenizer,
)
@@ -111,7 +120,7 @@
"timestamp": timestamp,
"input_sequence_length": input_len,
"output_sequence_length": output_len,
"batch_size": batch_size,
"max_concurrent": max_concurrent,
"num_requests": num_requests,
"mean_tpot_ms": np.mean([r["tpot_ms"] for r in responses]),
"std_tpot_ms": np.std([r["tpot_ms"] for r in responses]),
@@ -120,18 +129,17 @@
"total_input_tokens": sum([r["input_seq_len"] for r in responses]),
"total_output_tokens": sum([r["output_seq_len"] for r in responses]),
"mean_e2el_ms": mean_e2el_ms,
"request_throughput": max_concurrent / (mean_e2el_ms / 1000),
"num_iterations": num_iterations,
"request_throughput": num_requests / mean_e2el_ms,
}

all_results.append(stats)

# Log results
logger.info(
f"Results for combination {idx}/{total_combinations}:\n"
f"Mean TTFT: {stats['mean_ttft_ms']:.4f} ± {stats['std_ttft_ms']:.4f}"
f"Mean TPOT: {stats['mean_tpot_ms']:.4f} ± "
f"{stats['std_tpot_ms']:.4f}\n"
f"Mean TTFT: {stats['mean_ttft_ms']:.4f} ± {stats['std_ttft_ms']:.4f}\n"
f"Mean TPOT: {stats['mean_tpot_ms']:.4f} ± {stats['std_tpot_ms']:.4f}\n"
)

# Save results after each combination
@@ -148,28 +156,29 @@
if __name__ == "__main__":
# fmt: off
combinations = [
# example for image input:
# {"input_len": 128, "output_len": 128, "max_concurrent": 16, "num_prompts": 32, "images_per_prompt": 1},
# sweeps for batch-1
{"input_len": 128, "output_len": 10, "batch_size": 1, "num_prompts": 64},
{"input_len": 128, "output_len": 128, "batch_size": 1, "num_prompts": 64},
{"input_len": 128, "output_len": 1024, "batch_size": 1, "num_prompts": 16},
{"input_len": 128, "output_len": 2048, "batch_size": 1, "num_prompts": 8},
{"input_len": 128, "output_len": 4096, "batch_size": 1, "num_prompts": 8},
{"input_len": 2048, "output_len": 128, "batch_size": 1, "num_prompts": 32},
{"input_len": 2048, "output_len": 2048, "batch_size": 1, "num_prompts": 8},
{"input_len": 128, "output_len": 10, "max_concurrent": 1, "num_prompts": 64},
{"input_len": 128, "output_len": 128, "max_concurrent": 1, "num_prompts": 64},
{"input_len": 128, "output_len": 1024, "max_concurrent": 1, "num_prompts": 16},
{"input_len": 128, "output_len": 2048, "max_concurrent": 1, "num_prompts": 8},
{"input_len": 128, "output_len": 4096, "max_concurrent": 1, "num_prompts": 8},
{"input_len": 2048, "output_len": 128, "max_concurrent": 1, "num_prompts": 32},
{"input_len": 2048, "output_len": 2048, "max_concurrent": 1, "num_prompts": 8},
# sweeps for batch-32
{"input_len": 128, "output_len": 10, "batch_size": 32, "num_prompts": 32 * 16},
{"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 16},
{"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 8},
{"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4},
{"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 4},
{"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 8},
{"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4},
{"input_len": 128, "output_len": 10, "max_concurrent": 32, "num_prompts": 32 * 16},
{"input_len": 128, "output_len": 128, "max_concurrent": 32, "num_prompts": 32 * 16},
{"input_len": 128, "output_len": 1024, "max_concurrent": 32, "num_prompts": 32 * 8},
{"input_len": 128, "output_len": 2048, "max_concurrent": 32, "num_prompts": 32 * 4},
{"input_len": 128, "output_len": 4096, "max_concurrent": 32, "num_prompts": 32 * 4},
{"input_len": 2048, "output_len": 128, "max_concurrent": 32, "num_prompts": 32 * 8},
{"input_len": 2048, "output_len": 2048, "max_concurrent": 32, "num_prompts": 32 * 4},
]
# fmt: on

# Create output directory
cache_dir = Path(os.environ.get("CACHE_ROOT", ""))
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
result_dir = cache_dir / "online_benchmark_results"
result_dir.mkdir(parents=True, exist_ok=True)

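A hedged sketch of how an image-input sweep entry is expected to drive the config flags added above; field names mirror the diff, values are illustrative, and PromptConfig/BatchConfig take further arguments not repeated here:

```python
# Illustrative only: one multimodal entry from the commented example above.
params = {"input_len": 128, "output_len": 128, "max_concurrent": 16,
          "num_prompts": 32, "images_per_prompt": 1}

images_per_prompt = params.get("images_per_prompt", 0)
multimodal = images_per_prompt > 0

# With images enabled, prompt generation also produces images and requests are
# sent through the chat API; text-only sweeps leave all three flags off.
prompt_flags = {
    "include_images": multimodal,
    "images_per_prompt": images_per_prompt,
    "use_chat_api": multimodal,
}
batch_flags = {
    "max_concurrent": params["max_concurrent"],
    "use_chat_api": multimodal,
}
print(prompt_flags, batch_flags)
```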
39 changes: 30 additions & 9 deletions benchmarking/vllm_online_benchmark.py
@@ -63,11 +63,16 @@ def run_benchmark(
def main():
# Configuration
env_config = EnvironmentConfig()
mesh_device = env_config.mesh_device

# Create output directory
cache_dir = Path(os.environ.get("CACHE_ROOT", ""))
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
result_dir = cache_dir / "vllm_online_benchmark_results" / f"results_{timestamp}"
result_dir = (
cache_dir
/ "vllm_online_benchmark_results"
/ f"results_{timestamp}_{mesh_device}"
)
result_dir.mkdir(parents=True, exist_ok=True)

prompt_client = PromptClient(env_config)
@@ -77,12 +82,28 @@ def main():
# Get all benchmark combinations using the original function
# fmt: off
combinations = [
{"input_len": 128, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 32},
{"input_len": 128, "output_len": 1024, "batch_size": 32, "num_prompts": 32 * 16},
{"input_len": 128, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 8},
{"input_len": 128, "output_len": 4096, "batch_size": 32, "num_prompts": 32 * 4},
{"input_len": 2048, "output_len": 128, "batch_size": 32, "num_prompts": 32 * 16},
{"input_len": 2048, "output_len": 2048, "batch_size": 32, "num_prompts": 32 * 4},
# ttft batch-1
{"input_len": 128, "output_len": 128, "max_concurrent": 1, "num_prompts": 1},
{"input_len": 1024, "output_len": 128, "max_concurrent": 1, "num_prompts": 1},
{"input_len": 2048, "output_len": 128, "max_concurrent": 1, "num_prompts": 1},
# ttft batch-32
{"input_len": 128, "output_len": 128, "max_concurrent": 32, "num_prompts": 32},
{"input_len": 1024, "output_len": 128, "max_concurrent": 32, "num_prompts": 32},
{"input_len": 2048, "output_len": 128, "max_concurrent": 32, "num_prompts": 32},
# sweeps for batch-1
{"input_len": 128, "output_len": 128, "max_concurrent": 1, "num_prompts": 64},
{"input_len": 128, "output_len": 1024, "max_concurrent": 1, "num_prompts": 16},
{"input_len": 128, "output_len": 2048, "max_concurrent": 1, "num_prompts": 8},
{"input_len": 128, "output_len": 4096, "max_concurrent": 1, "num_prompts": 8},
{"input_len": 2048, "output_len": 128, "max_concurrent": 1, "num_prompts": 32},
{"input_len": 2048, "output_len": 2048, "max_concurrent": 1, "num_prompts": 8},
# sweeps for batch-32
{"input_len": 128, "output_len": 128, "max_concurrent": 32, "num_prompts": 32 * 16},
{"input_len": 128, "output_len": 1024, "max_concurrent": 32, "num_prompts": 32 * 8},
{"input_len": 128, "output_len": 2048, "max_concurrent": 32, "num_prompts": 32 * 4},
{"input_len": 128, "output_len": 4096, "max_concurrent": 32, "num_prompts": 32 * 4},
{"input_len": 2048, "output_len": 128, "max_concurrent": 32, "num_prompts": 32 * 8},
{"input_len": 2048, "output_len": 2048, "max_concurrent": 32, "num_prompts": 32 * 4},
]
# fmt: on

@@ -98,11 +119,11 @@ def main():
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
isl = params["input_len"]
osl = params["output_len"]
bsz = params["batch_size"]
max_concurrent = params["max_concurrent"]
num_prompts = params["num_prompts"]
result_filename = (
result_dir
/ f"vllm_online_benchmark_{run_timestamp}_isl-{isl}_osl-{osl}_bsz-{bsz}_n-{num_prompts}.json"
/ f"vllm_online_benchmark_{run_timestamp}_{mesh_device}_isl-{isl}_osl-{osl}_maxcon-{max_concurrent}_n-{num_prompts}.json"
)
logger.info(f"\nRunning benchmark {i}/{len(combinations)}")
run_benchmark(
2 changes: 1 addition & 1 deletion evals/README.md
@@ -68,5 +68,5 @@ Running the `run_evals.sh` script will:

```bash
cd ~/app/evals
./run_evals
. run_evals.sh
```

Collaborator (review comment on `. run_evals.sh`): I guess unintentional change?

Contributor Author: I found I needed to source the script to run it, otherwise the shell environment is different.
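A minimal end-to-end invocation reflecting the author's note, assuming the `HF_MODEL_REPO_ID` check added to `run_evals.sh` below (the repo shown is only the example value from that check):

```bash
cd ~/app/evals
# run_evals.sh exits early unless HF_MODEL_REPO_ID is set (example value only).
export HF_MODEL_REPO_ID=meta-llama/Llama-3.3-70B-Instruct
# Source the script rather than executing it, so it runs in (and returns to)
# the current shell environment.
. run_evals.sh
```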
17 changes: 13 additions & 4 deletions evals/run_evals.sh
@@ -3,6 +3,13 @@
#
# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC

original_dir=$PWD

if [[ -z "${HF_MODEL_REPO_ID}" ]]; then
echo "⛔ Error: env var HF_MODEL_REPO_ID is not set. This must be the model HF repo e.g. 'meta-llama/Llama-3.3-70B-Instruct'"
exit 1
fi

# set up lm_eval and evals datasets
cd $HOME
if python -c "import lm_eval" 2>/dev/null; then
@@ -34,8 +41,8 @@ cd $HOME/lm-evaluation-harness/
# GPQA
lm_eval \
--model local-completions \
--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \
--gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \
--model_args model=${HF_MODEL_REPO_ID},base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks meta_gpqa \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
@@ -46,11 +53,13 @@ lm_eval \
# IFEval
lm_eval \
--model local-completions \
--model_args model=meta-llama/Llama-3.1-70B-Instruct,base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \
--gen_kwargs model=meta-llama/Llama-3.1-70B-Instruct,stop="<|eot_id|>",stream=False \
--model_args model=${HF_MODEL_REPO_ID},base_url=http://127.0.0.1:7000/v1/completions,num_concurrent=32,max_retries=4,tokenized_requests=False,add_bos_token=True,timeout=None \
--gen_kwargs model=${HF_MODEL_REPO_ID},stop="<|eot_id|>",stream=False \
--tasks meta_ifeval \
--batch_size auto \
--output_path /home/user/cache_root/eval_output \
--include_path ./work_dir \
--seed 42 \
--log_samples

cd $original_dir
Loading