Add end-to-end performance checks for UNet Shallow with trace+2CQ
esmalTT committed Jan 23, 2025
1 parent cd78741 commit 420066f
Showing 4 changed files with 81 additions and 149 deletions.
14 changes: 14 additions & 0 deletions models/experimental/functional_unet/tests/common.py
@@ -2,6 +2,8 @@

# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass

import ttnn
from loguru import logger

@@ -10,6 +12,18 @@
UNET_FULL_MODEL_PCC = 0.99999


@dataclass
class UNetPerformanceStatistics:
groups: int
batch: int
num_devices: int
inference_and_compile_time: float
inference_time: float

def get_fps(self) -> float:
return round(self.batch * self.groups * self.num_devices / self.inference_time, 4)


def is_n300_with_eth_dispatch_cores(mesh_device) -> bool:
all_devices_using_full_grid = all(
[(8 == device.core_grid.x and 8 == device.core_grid.y) for device in mesh_device.get_devices()]
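
For reference, a minimal usage sketch of the new UNetPerformanceStatistics dataclass (the timing numbers below are hypothetical, chosen only to illustrate how get_fps() derives throughput):

```python
from models.experimental.functional_unet.tests.common import UNetPerformanceStatistics

# Hypothetical values: batch=1, groups=2 on a single device, ~2.2 ms per traced iteration.
stats = UNetPerformanceStatistics(
    groups=2,
    batch=1,
    num_devices=1,
    inference_and_compile_time=24.7,  # seconds, first run including compilation
    inference_time=0.0022,  # seconds per iteration after trace capture
)

# Throughput is samples per second: batch * groups * num_devices / inference_time
print(stats.get_fps())  # 909.0909 for these assumed numbers
```
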
193 changes: 52 additions & 141 deletions models/experimental/functional_unet/tests/test_unet_perf.py
@@ -2,28 +2,13 @@

# SPDX-License-Identifier: Apache-2.0

import ttnn
import pytest

from loguru import logger

from models.experimental.functional_unet.tt.model_preprocessing import (
create_unet_input_tensors,
create_unet_model_parameters,
)
from models.experimental.functional_unet.tt import unet_shallow_torch
from models.experimental.functional_unet.tt import unet_shallow_ttnn
from models.experimental.functional_unet.tests.common import (
verify_with_pcc,
is_n300_with_eth_dispatch_cores,
is_t3k_with_eth_dispatch_cores,
UNET_FULL_MODEL_PCC,
)

from models.perf.perf_utils import prep_perf_report
from models.perf.device_perf_utils import run_device_perf, check_device_perf, prep_device_perf_report
from models.utility_functions import (
profiler,
skip_for_grayskull,
)

@@ -59,167 +44,93 @@ def test_unet_perf_device(batch: int, groups: int, expected_device_perf_fps: flo

@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 79104}], indirect=True)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_inference_time_ms",
((1, 2, 16, 25.0, 39.0),),
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_throughput",
((1, 2, 128, 25.0, 830.0),),
)
def test_unet_perf_e2e(
def test_unet_trace_perf(
batch: int,
groups: int,
iterations: int,
expected_compile_time: float,
expected_inference_time_ms: float,
expected_throughput: float,
device,
use_program_cache,
reset_seeds,
):
profiler.clear()

torch_input, ttnn_input = create_unet_input_tensors(batch, groups, channel_order="first", pad=False, fold=False)

profiler.start(f"initialize_ref_model")
model = unet_shallow_torch.UNet.from_random_weights(groups=groups)
profiler.end(f"initialize_ref_model")

profiler.start(f"initialize_model")
parameters = create_unet_model_parameters(model, torch_input, groups=groups, device=device)
ttnn_model = unet_shallow_ttnn.UNet(parameters, device)
profiler.end(f"initialize_model")

torch_output_tensor = model(torch_input)

logger.info(f"Compiling model with warmup run")
profiler.start(f"inference_and_compile_time")
output_tensor = ttnn_model(ttnn_input).cpu()
profiler.end(f"inference_and_compile_time")

inference_and_compile_time = profiler.get("inference_and_compile_time")
logger.info(f"Model compiled with warmup run in {(inference_and_compile_time):.2f} s")

logger.info(f"Running inference for {iterations} iterations")
for idx in range(iterations):
profiler.start("inference_time")
profiler.start(f"inference_time_{idx}")
output_tensor = ttnn_model(ttnn_input).cpu()
profiler.end(f"inference_time_{idx}")
profiler.end("inference_time")

mean_inference_time = profiler.get("inference_time")
inference_time = profiler.get(f"inference_time_{iterations - 1}")
compile_time = inference_and_compile_time - inference_time
logger.info(f"Model compilation took {compile_time:.1f} s")
logger.info(f"Inference time on last iterations was completed in {(inference_time * 1000.0):.2f} ms")
logger.info(
f"Mean inference time for {batch} (batch) images was {(mean_inference_time * 1000.0):.2f} ms ({batch / mean_inference_time:.2f} fps)"
from models.experimental.functional_unet.tests.test_unet_trace import (
test_unet_trace_2cq_same_io,
)

expected_inference_time = expected_inference_time_ms * 1e-3
logger.info(f"Invoking underlying model test for {iterations} iterations...")
result = test_unet_trace_2cq_same_io(batch, groups, iterations, device, use_program_cache, reset_seeds)

total_num_samples = result.batch * result.groups * result.num_devices
expected_inference_time = total_num_samples / expected_throughput
prep_perf_report(
model_name=f"unet_shallow",
batch_size=batch,
inference_and_compile_time=inference_and_compile_time,
inference_time=inference_time,
model_name="unet_shallow-trace_2cq_same_io",
batch_size=total_num_samples,
inference_and_compile_time=result.inference_and_compile_time,
inference_time=result.inference_time,
expected_compile_time=expected_compile_time,
expected_inference_time=expected_inference_time,
comments="",
comments=f"batch_{result.batch}-groups_{result.groups}-num_devices_{result.num_devices}",
)

logger.info(f"Running sanity check against reference model output")
B, C, H, W = torch_output_tensor.shape
ttnn_output_tensor = ttnn.to_torch(output_tensor).reshape(B, C, H, W)
verify_with_pcc(torch_output_tensor, ttnn_output_tensor, UNET_FULL_MODEL_PCC)
assert (
result.get_fps() >= expected_throughput
), f"Expected end-to-end performance to exceed {expected_throughput:.2f} fps but was {result.get_fps():.2f} fps"


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("enable_async_mode", (True,), indirect=True)
@pytest.mark.parametrize("device_params", [{"l1_small_size": 79104}], indirect=True)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_inference_time_ms",
((1, 2, 16, 25.0, 61.0),),
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
)
@pytest.mark.parametrize(
"batch, groups, iterations, expected_compile_time, expected_throughput, use_async_mode",
(
(1, 2, 128, 25.0, 1220.0, True),
(1, 2, 128, 25.0, 1650.0, False),
),
)
def test_unet_data_parallel_perf_e2e(
def test_unet_trace_perf_multi_device(
batch: int,
groups: int,
iterations: int,
expected_compile_time: float,
expected_inference_time_ms: float,
expected_throughput: float,
use_async_mode: bool,
mesh_device,
use_program_cache,
reset_seeds,
enable_async_mode,
):
if not is_n300_with_eth_dispatch_cores(mesh_device) and not is_t3k_with_eth_dispatch_cores(mesh_device):
pytest.skip("Test is only valid for N300 or T3000")

profiler.clear()

inputs_mesh_mapper = ttnn.ShardTensorToMesh(mesh_device, dim=0)
weights_mesh_mapper = ttnn.ReplicateTensorToMesh(mesh_device)
output_mesh_composer = ttnn.ConcatMeshToTensor(mesh_device, dim=0)

torch_input, ttnn_input = create_unet_input_tensors(batch, groups)

profiler.start(f"initialize_ref_model")
model = unet_shallow_torch.UNet.from_random_weights(groups=groups)
profiler.end(f"initialize_ref_model")

profiler.start(f"initialize_model")
parameters = create_unet_model_parameters(model, torch_input, groups=groups, device=mesh_device)
ttnn_model = unet_shallow_ttnn.UNet(parameters, device=mesh_device, mesh_mapper=weights_mesh_mapper)
profiler.end(f"initialize_model")

num_devices = len(mesh_device.get_device_ids())
total_batch = num_devices * batch
torch_input, ttnn_input = create_unet_input_tensors(
total_batch, groups, channel_order="first", pad=False, fold=False, mesh_mapper=inputs_mesh_mapper
)
logger.info(f"Created reference input tensors: {list(torch_input.shape)}")
logger.info(
f"Created multi-device input tensors: shape={list(ttnn_input.shape)} on devices={mesh_device.get_device_ids()}"
from models.experimental.functional_unet.tests.test_unet_trace import (
test_unet_trace_2cq_same_io_multi_device,
)

torch_output_tensor = model(torch_input)

logger.info(f"Compiling model with warmup run")
profiler.start(f"inference_and_compile_time")
output_tensor = ttnn.from_device(ttnn_model(ttnn_input), blocking=True)
profiler.end(f"inference_and_compile_time")

inference_and_compile_time = profiler.get("inference_and_compile_time")
logger.info(f"Model compiled with warmup run in {(inference_and_compile_time):.2f} s")

logger.info(f"Running inference for {iterations} iterations")
for idx in range(iterations):
profiler.start("inference_time")
profiler.start(f"inference_time_{idx}")
output_tensor = ttnn.from_device(ttnn_model(ttnn_input), blocking=False)
profiler.end(f"inference_time_{idx}")
profiler.end("inference_time")
ttnn.synchronize_devices(mesh_device)

mean_inference_time = profiler.get("inference_time")
inference_time = profiler.get(f"inference_time_{iterations - 1}")
compile_time = inference_and_compile_time - inference_time
logger.info(f"Model compilation took {compile_time:.1f} s")
logger.info(f"Inference time on last iterations was completed in {(inference_time * 1000.0):.2f} ms")
logger.info(
f"Mean inference time for {total_batch} (batch) images was {(mean_inference_time * 1000.0):.2f} ms ({total_batch / mean_inference_time:.2f} fps)"
mesh_device.enable_async(use_async_mode)
model_name = "unet_shallow-trace_2cq_same_io-multi_device"
model_name += "-async" if use_async_mode else "-no_async"

logger.info(f"Invoking underlying model test for {iterations} iterations...")
result = test_unet_trace_2cq_same_io_multi_device(
batch, groups, iterations, mesh_device, use_async_mode, use_program_cache, reset_seeds
)

expected_inference_time = expected_inference_time_ms * 1e-3
total_num_samples = result.batch * result.groups * result.num_devices
expected_inference_time = total_num_samples / expected_throughput
prep_perf_report(
model_name=f"unet_shallow-data_parallel",
batch_size=total_batch,
inference_and_compile_time=inference_and_compile_time,
inference_time=inference_time,
model_name="unet_shallow-trace_2cq_same_io-multi_device",
batch_size=total_num_samples,
inference_and_compile_time=result.inference_and_compile_time,
inference_time=result.inference_time,
expected_compile_time=expected_compile_time,
expected_inference_time=expected_inference_time,
comments=f"batch_{total_batch}-num_devices_{num_devices}",
comments=f"batch_{result.batch}-groups_{result.groups}-num_devices_{result.num_devices}",
)

logger.info(f"Running sanity check against reference model output")
B, C, H, W = torch_output_tensor.shape
ttnn_output_tensor = ttnn.to_torch(output_tensor, mesh_composer=output_mesh_composer).reshape(B, C, H, W)
verify_with_pcc(torch_output_tensor, ttnn_output_tensor, UNET_FULL_MODEL_PCC)
assert (
result.get_fps() >= expected_throughput
), f"Expected end-to-end performance to exceed {expected_throughput:.2f} fps but was {result.get_fps():.2f} fps"
21 changes: 14 additions & 7 deletions models/experimental/functional_unet/tests/test_unet_trace.py
@@ -20,13 +20,13 @@
is_n300_with_eth_dispatch_cores,
is_t3k_with_eth_dispatch_cores,
UNET_FULL_MODEL_PCC,
UNetPerformanceStatistics,
)

from models.utility_functions import skip_for_grayskull, divup


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("device_params", [{"l1_small_size": 68864, "trace_region_size": 444416}], indirect=True)
@pytest.mark.parametrize(
"batch, groups, iterations",
@@ -107,7 +107,6 @@ def test_unet_trace(


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
)
@@ -221,7 +220,6 @@ def buffer_address(tensor):


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("enable_async_mode", (True,), indirect=True)
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 442368, "num_command_queues": 2}], indirect=True
@@ -344,7 +342,6 @@ def test_unet_trace_2cq_multi_device(


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
)
@@ -395,6 +392,7 @@ def test_unet_trace_2cq_same_io(
ttnn.record_event(1, read_event)

logger.info(f"Compiling model with warmup run")
start = time.time()
ttnn.copy_host_to_device_tensor(ttnn_input, input_tensor, cq_id=1)

ttnn.record_event(1, write_event)
@@ -414,6 +412,7 @@
ttnn.TensorMemoryLayout.WIDTH_SHARDED, ttnn.BufferType.DRAM, output_dram_shard_spec
)
dram_output_tensor = ttnn.reshard(output_tensor, output_dram_memory_config)
inference_and_compile_time = time.time() - start
logger.info(f"Done compile run")

logger.info(f"Capturing trace")
@@ -468,17 +467,19 @@
outputs.append(dram_output_tensor.cpu(blocking=False, cq_id=1))
ttnn.synchronize_device(device)
end = time.time()
logger.info(f"Average model time={1000.0 * (end-start) / iterations : .2f} ms")
inference_time = (end - start) / iterations
logger.info(f"Average model time={1000.0 * inference_time : .2f} ms")
logger.info(f"Average model performance={iterations * groups * batch / (end-start) : .2f} fps")

logger.info(f"Running sanity check against reference model output")
B, C, H, W = torch_output_tensor.shape
verify_with_pcc(torch_output_tensor, ttnn.to_torch(outputs[-1]).reshape(B, C, H, W), pcc=UNET_FULL_MODEL_PCC)
ttnn.release_trace(device, tid)

return UNetPerformanceStatistics(groups, batch, 1, inference_and_compile_time, inference_time)


@skip_for_grayskull("UNet not currently supported on GS")
@pytest.mark.models_performance_bare_metal
@pytest.mark.parametrize("enable_async_mode", (True, False), indirect=True)
@pytest.mark.parametrize(
"device_params", [{"l1_small_size": 68864, "trace_region_size": 424960, "num_command_queues": 2}], indirect=True
@@ -551,6 +552,7 @@ def test_unet_trace_2cq_same_io_multi_device(
ttnn.record_event(1, read_event)

logger.info(f"Compiling model with warmup run")
start = time.time()
ttnn.copy_host_to_device_tensor(ttnn_input, input_tensor, cq_id=1)

ttnn.record_event(1, write_event)
@@ -570,6 +572,7 @@
ttnn.TensorMemoryLayout.WIDTH_SHARDED, ttnn.BufferType.DRAM, output_dram_shard_spec
)
dram_output_tensor = ttnn.reshard(output_tensor, output_dram_memory_config)
inference_and_compile_time = time.time() - start
logger.info(f"Done compile run")

logger.info(f"Capturing trace")
@@ -628,7 +631,9 @@ def test_unet_trace_2cq_same_io_multi_device(
outputs.append(dram_output_tensor.cpu(blocking=False, cq_id=1))
ttnn.synchronize_devices(mesh_device)
end = time.time()
logger.info(f"Average model time={1000.0 * (end-start) / iterations : .2f} ms")

inference_time = (end - start) / iterations
logger.info(f"Average model time={1000.0 * inference_time : .2f} ms")
logger.info(f"Average model performance={iterations * groups * total_batch / (end-start) : .2f} fps")

logger.info(f"Running sanity check against reference model output")
@@ -639,3 +644,5 @@
pcc=UNET_FULL_MODEL_PCC,
)
ttnn.release_trace(mesh_device, tid)

return UNetPerformanceStatistics(groups, batch, num_devices, inference_and_compile_time, inference_time)
2 changes: 1 addition & 1 deletion tests/scripts/run_performance.sh
@@ -76,7 +76,7 @@ run_perf_models_cnn_javelin() {
local test_marker=$2

# Run tests
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest models/experimental/functional_unet/tests/test_unet_perf.py -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/experimental/functional_unet/tests -m $test_marker
env WH_ARCH_YAML=wormhole_b0_80_arch_eth_dispatch.yaml pytest -n auto models/demos/wormhole/stable_diffusion/tests -m $test_marker --timeout=480

## Merge all the generated reports
