Add end-to-end performance checks for UNet Shallow with trace+2CQ
esmalTT committed Jan 23, 2025
1 parent cd78741 commit 420066f
Showing 4 changed files with 81 additions and 149 deletions.
14 changes: 14 additions & 0 deletions models/experimental/functional_unet/tests/common.py
@@ -2,6 +2,8 @@

# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass

import ttnn
from loguru import logger

@@ -10,6 +12,18 @@
UNET_FULL_MODEL_PCC = 0.99999


@dataclass
class UNetPerformanceStatistics:
groups: int
batch: int
num_devices: int
inference_and_compile_time: float
inference_time: float

def get_fps(self) -> float:
return round(self.batch * self.groups * self.num_devices / self.inference_time, 4)


def is_n300_with_eth_dispatch_cores(mesh_device) -> bool:
all_devices_using_full_grid = all(
[(8 == device.core_grid.x and 8 == device.core_grid.y) for device in mesh_device.get_devices()]
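
For reference, a minimal usage sketch of the new UNetPerformanceStatistics dataclass (the timing numbers below are hypothetical, chosen only to illustrate how get_fps() derives throughput):

```python
from models.experimental.functional_unet.tests.common import UNetPerformanceStatistics

# Hypothetical values: batch=1, groups=2 on a single device, ~2.2 ms per traced iteration.
stats = UNetPerformanceStatistics(
    groups=2,
    batch=1,
    num_devices=1,
    inference_and_compile_time=24.7,  # seconds, first run including compilation
    inference_time=0.0022,  # seconds per iteration after trace capture
)

# Throughput is samples per second: batch * groups * num_devices / inference_time
print(stats.get_fps())  # 909.0909 for these assumed numbers
```
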
193 changes: 52 additions & 141 deletions models/experimental/functional_unet/tests/test_unet_perf.py
@@ -2,28 +2,13 @@

# SPDX-License-Identifier: Apache-2.0

import ttnn
import pytest

from loguru import logger

from models.experimental.functional_unet.tt.model_preprocessing import (
create_unet_input_tensors,
create_unet_model_parameters,
)
from models.experimental.functional_unet.tt import unet_shallow_torch
from models.experimental.functional_unet.tt import unet_shallow_ttnn
from models.experimental.functional_unet.tests.common import (
verify_with_pcc,
is_n300_with_eth_dispatch_cores,
is_t3k_with_eth_dispatch_cores,
UNET_FULL_MODEL_PCC,
)

from models.perf.perf_utils import prep_perf_report
from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report
from models.utility_functions import (
profiler,
skip_for_grayskull,
)

@@ -59,167 +44,93 @@ def test_unet_perf_device(batch: int, groups: int, expected_device_perf_fps: flo

@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 79104}], indirect=True)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_inference_time_ms",
((1, 2, 16, 25.0, 39.0),),
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_throughput",
((1, 2, 128, 25.0, 830.0),),
)
def test_unet_perf_e2e(
def test_unet_trace_perf(
batch: int,
groups: int,
iterations: int,
expected_compile_time: float,
expected_inference_time_ms: float,
expected_throughput: float,
device,
use_program_cache,
reset_seeds,
):
profiler.clear()

torch_input, ttnn_input = create_unet_input_tensors(batch, groups, channel_order="first", pad=False, fold=False)

profiler.start(f"initialize_ref_model")
model = unet_shallow_torch.UNet.from_random_weights(groups=groups)
profiler.end(f"initialize_ref_model")

profiler.start(f"initialize_model")
parameters = create_unet_model_parameters(model, torch_input, groups=groups, device=device)
ttnn_model = unet_shallow_ttnn.UNet(parameters, device)
profiler.end(f"initialize_model")

torch_output_tensor = model(torch_input)

logger.info(f"Compiling model with warmup run")
profiler.start(f"inference_and_compile_time")
output_tensor = ttnn_model(ttnn_input).cpu()
profiler.end(f"inference_and_compile_time")

inference_and_compile_time = profiler.get("inference_and_compile_time")
logger.info(f"Model compiled with warmup run in {(inference_and_compile_time):.2f} s")

logger.info(f"Running inference for {iterations} iterations")
for idx in range(iterations):
profiler.start("inference_time")
profiler.start(f"inference_time_{idx}")
output_tensor = ttnn_model(ttnn_input).cpu()
profiler.end(f"inference_time_{idx}")
profiler.end("inference_time")

mean_inference_time = profiler.get("inference_time")
inference_time = profiler.get(f"inference_time_{iterations - 1}")
compile_time = inference_and_compile_time - inference_time
logger.info(f"Model compilation took {compile_time:.1f} s")
logger.info(f"Inference time on last iterations was completed in {(inference_time * 1000.0):.2f} ms")
logger.info(
f"Mean inference time for {batch} (batch) images was {(mean_inference_time * 1000.0):.2f} ms ({batch / mean_inference_time:.2f} fps)"
from models.experimental.functional_unet.tests.test_unet_trace import (
test_unet_trace_2cq_same_io,
)

expected_inference_time = expected_inference_time_ms * 1e-3
logger.info(f"Invoking underlying model test for {iterations} iterations...")
result = test_unet_trace_2cq_same_io(batch, groups, iterations, device, use_program_cache, reset_seeds)

total_num_samples = result.batch * result.groups * result.num_devices
expected_inference_time = total_num_samples / expected_throughput
prep_perf_report(
model_name=f"unet_shallow",
batch_size=batch,
inference_and_compile_time=inference_and_compile_time,
inference_time=inference_time,
model_name="unet_shallow-trace_2cq_same_io",
batch_size=total_num_samples,
inference_and_compile_time=result.inference_and_compile_time,
inference_time=result.inference_time,
expected_compile_time=expected_compile_time,
expected_inference_time=expected_inference_time,
comments="",
comments=f"batch_{result.batch}-groups_{result.groups}-num_devices_{result.num_devices}",
)

logger.info(f"Running sanity check against reference model output")
B, C, H, W = torch_output_tensor.shape
ttnn_output_tensor = ttnn.to_torch(output_tensor).reshape(B, C, H, W)
verify_with_pcc(torch_output_tensor, ttnn_output_tensor, UNET_FULL_MODEL_PCC)
assert (
result.get_fps() >= expected_throughput
), f"Expected end-to-end performance to exceed {expected_throughput:.2f} fps but was {result.get_fps():.2f} fps"


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("enable_async_mode", (True,), indirect=True)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 79104}], indirect=True)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_inference_time_ms",
((1, 2, 16, 25.0, 61.0),),
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_throughput, use_async_mode",
(
(1, 2, 128, 25.0, 1220.0, True),
(1, 2, 128, 25.0, 1650.0, False),
),
)
def test_unet_data_parallel_perf_e2e(
def test_unet_trace_perf_multi_device(
batch: int,
groups: int,
iterations: int,
expected_compile_time: float,
expected_inference_time_ms: float,
expected_throughput: float,
use_async_mode: bool,
mesh_device,
use_program_cache,
reset_seeds,
enable_async_mode,
):
if not is_n300_with_eth_dispatch_cores(mesh_device) and not is_t3k_with_eth_dispatch_cores(mesh_device):
pytest.skip("Test is only valid for N300 or T3000")

profiler.clear()

inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0)
weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device)
output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0)

torch_input, ttnn_input = create_unet_input_tensors(batch, groups)

profiler.start(f"initialize_ref_model")
model = unet_shallow_torch.UNet.from_random_weights(groups=groups)
profiler.end(f"initialize_ref_model")

profiler.start(f"initialize_model")
parameters = create_unet_model_parameters(model, torch_input, groups=groups, device=mesh_device)
ttnn_model = unet_shallow_ttnn.UNet(parameters, device=mesh_device, mesh_mapper=weights_mesh_mapper)
profiler.end(f"initialize_model")

num_devices = len(mesh_device.get_device_ids())
total_batch = num_devices * batch
torch_input, ttnn_input = create_unet_input_tensors(
total_batch, groups, channel_order="first", pad=False, fold=False, mesh_mapper=inputs_mesh_mapper
)
logger.info(f"Created reference input tensors: {list(torch_input.shape)}")
logger.info(
f"Created multi-device input tensors: shape={list(ttnn_input.shape)} on devices={mesh_device.get_device_ids()}"
from models.experimental.functional_unet.tests.test_unet_trace import (
test_unet_trace_2cq_same_io_multi_device,
)

torch_output_tensor = model(torch_input)

logger.info(f"Compiling model with warmup run")
profiler.start(f"inference_and_compile_time")
output_tensor = ttnn.from_device(ttnn_model(ttnn_input), blocking=True)
profiler.end(f"inference_and_compile_time")

inference_and_compile_time = profiler.get("inference_and_compile_time")
logger.info(f"Model compiled with warmup run in {(inference_and_compile_time):.2f} s")

logger.info(f"Running inference for {iterations} iterations")
for idx in range(iterations):
profiler.start("inference_time")
profiler.start(f"inference_time_{idx}")
output_tensor = ttnn.from_device(ttnn_model(ttnn_input), blocking=False)
profiler.end(f"inference_time_{idx}")
profiler.end("inference_time")
ttnn.synchronize_devices(mesh_device)

mean_inference_time = profiler.get("inference_time")
inference_time = profiler.get(f"inference_time_{iterations - 1}")
compile_time = inference_and_compile_time - inference_time
logger.info(f"Model compilation took {compile_time:.1f} s")
logger.info(f"Inference time on last iterations was completed in {(inference_time * 1000.0):.2f} ms")
logger.info(
f"Mean inference time for {total_batch} (batch) images was {(mean_inference_time * 1000.0):.2f} ms ({total_batch / mean_inference_time:.2f} fps)"
mesh_device.enable_async(use_async_mode)
model_name = "unet_shallow-trace_2cq_same_io-multi_device"
model_name += "-async" if use_async_mode else "-no_async"

logger.info(f"Invoking underlying model test for {iterations} iterations...")
result = test_unet_trace_2cq_same_io_multi_device(
batch, groups, iterations, mesh_device, use_async_mode, use_program_cache, reset_seeds
)

expected_inference_time = expected_inference_time_ms * 1e-3
total_num_samples = result.batch * result.groups * result.num_devices
expected_inference_time = total_num_samples / expected_throughput
prep_perf_report(
model_name=f"unet_shallow-data_parallel",
batch_size=total_batch,
inference_and_compile_time=inference_and_compile_time,
inference_time=inference_time,
model_name="unet_shallow-trace_2cq_same_io-multi_device",
batch_size=total_num_samples,
inference_and_compile_time=result.inference_and_compile_time,
inference_time=result.inference_time,
expected_compile_time=expected_compile_time,
expected_inference_time=expected_inference_time,
comments=f"batch_{total_batch}-num_devices_{num_devices}",
comments=f"batch_{result.batch}-groups_{result.groups}-num_devices_{result.num_devices}",
)

logger.info(f"Running sanity check against reference model output")
B, C, H, W = torch_output_tensor.shape
ttnn_output_tensor = ttnn.to_torch(output_tensor, mesh_composer=output_mesh_composer).reshape(B, C, H, W)
verify_with_pcc(torch_output_tensor, ttnn_output_tensor, UNET_FULL_MODEL_PCC)
assert (
result.get_fps() >= expected_throughput
), f"Expected end-to-end performance to exceed {expected_throughput:.2f} fps but was {result.get_fps():.2f} fps"
21 changes: 14 additions & 7 deletions models/experimental/functional_unet/tests/test_unet_trace.py
@@ -20,13 +20,13 @@
is_n300_with_eth_dispatch_cores,
is_t3k_with_eth_dispatch_cores,
UNET_FULL_MODEL_PCC,
UNetPerformanceStatistics,
)

from models.utility_functions import skip_for_grayskull, divup


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 444416}], indirect=True)
@pytest.mark.parametrize(
"batch, groups, iterations",
@@ -107,7 +107,6 @@ def test_unet_trace(


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
)
@@ -221,7 +220,6 @@ def buffer_address(tensor):


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("enable_async_mode", (True,), indirect=True)
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
@@ -344,7 +342,6 @@ def test_unet_trace_2cq_multi_device(


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
)
@@ -395,6 +392,7 @@ def test_unet_trace_2cq_same_io(
ttnn.record_event(1, read_event)

logger.info(f"Compiling model with warmup run")
start = time.time()
ttnn.copy_host_to_device_tensor(ttnn_input, input_tensor, cq_id=1)

ttnn.record_event(1, write_event)
@@ -414,6 +412,7 @@
ttnn.TensorMemoryLayout.WIDTH_SHARDED, ttnn.BufferType.DRAM, output_dram_shard_spec
)
dram_output_tensor = ttnn.reshard(output_tensor, output_dram_memory_config)
inference_and_compile_time = time.time() - start
logger.info(f"Done compile run")

logger.info(f"Capturing trace")
@@ -468,17 +467,19 @@
outputs.append(dram_output_tensor.cpu(blocking=False, cq_id=1))
ttnn.synchronize_device(device)
end = time.time()
logger.info(f"Average model time={1000.0 * (end-start) / iterations : .2f} ms")
inference_time = (end - start) / iterations
logger.info(f"Average model time={1000.0 * inference_time : .2f} ms")
logger.info(f"Average model performance={iterations * groups * batch / (end-start) : .2f} fps")

logger.info(f"Running sanity check against reference model output")
B, C, H, W = torch_output_tensor.shape
verify_with_pcc(torch_output_tensor, ttnn.to_torch(outputs[-1]).reshape(B, C, H, W), pcc=UNET_FULL_MODEL_PCC)
ttnn.release_trace(device, tid)

return UNetPerformanceStatistics(groups, batch, 1, inference_and_compile_time, inference_time)


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("enable_async_mode", (True, False), indirect=True)
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
@@ -551,6 +552,7 @@ def test_unet_trace_2cq_same_io_multi_device(
ttnn.record_event(1, read_event)

logger.info(f"Compiling model with warmup run")
start = time.time()
ttnn.copy_host_to_device_tensor(ttnn_input, input_tensor, cq_id=1)

ttnn.record_event(1, write_event)
@@ -570,6 +572,7 @@
ttnn.TensorMemoryLayout.WIDTH_SHARDED, ttnn.BufferType.DRAM, output_dram_shard_spec
)
dram_output_tensor = ttnn.reshard(output_tensor, output_dram_memory_config)
inference_and_compile_time = time.time() - start
logger.info(f"Done compile run")

logger.info(f"Capturing trace")
@@ -628,7 +631,9 @@ def test_unet_trace_2cq_same_io_multi_device(
outputs.append(dram_output_tensor.cpu(blocking=False, cq_id=1))
ttnn.synchronize_devices(mesh_device)
end = time.time()
logger.info(f"Average model time={1000.0 * (end-start) / iterations : .2f} ms")

inference_time = (end - start) / iterations
logger.info(f"Average model time={1000.0 * inference_time : .2f} ms")
logger.info(f"Average model performance={iterations * groups * total_batch / (end-start) : .2f} fps")

logger.info(f"Running sanity check against reference model output")
@@ -639,3 +644,5 @@
pcc=UNET_FULL_MODEL_PCC,
)
ttnn.release_trace(mesh_device, tid)

return UNetPerformanceStatistics(groups, batch, num_devices, inference_and_compile_time, inference_time)
2 changes: 1 addition & 1 deletion tests/scripts/run_performance.sh
@@ -76,7 +76,7 @@ run_perf_models_cnn_javelin() {
local test_marker=$2

# Run tests
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests/test_unet_perf.py -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/experimental/functional_unet/tests -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=480

## Merge all the generated reports
