Merge branch 'main' into blozano-docker

tenstorrent · Feb 4, 2025 · 183052c · 183052c
2 parents 9446f0f + 0e07413
commit 183052c
Show file tree

Hide file tree

Showing 14 changed files with 850 additions and 53 deletions.
diff --git a/.github/workflows/build-docker-artifact.yaml b/.github/workflows/build-docker-artifact.yaml
@@ -39,7 +39,7 @@ on:
             - "amd64"
 jobs:
   build-docker-image:
-    name: "🐳️ Build ${{ inputs.distro }} {inputs.version }} image"
+    name: "🐳️ Build ${{ inputs.distro }} ${{inputs.version }} image"
     timeout-minutes: 30
     env:
       CONFIG: ci

diff --git a/.github/workflows/pr-gate.yaml b/.github/workflows/pr-gate.yaml
@@ -37,4 +37,4 @@ jobs:
     if: github.event_name != 'pull_request' || !github.event.pull_request.draft
     uses: ./.github/workflows/build-artifact.yaml
     with:
-      os: "ubuntu-22.04-amd64"
+      version: "22.04"
diff --git a/tech_reports/data_formats/data_formats.md b/tech_reports/data_formats/data_formats.md
@@ -8,3 +8,13 @@
 
 <img width="961" alt="image" src="https://github.com/user-attachments/assets/e1f54311-e6a6-48f3-9030-192de985b2ce">
 
+## Mantissa Rounding
+When converting from a higher-precision to a lower-precision data format, the mantissa is rounded to the nearest representable value. If the value to round is exactly tied between two candidates, it is rounded to the one whose mantissa is even. For example, when converting from float32 to bfloat8, we want to round the 23 bits of mantissa in float32 to the 7 bits of mantissa in bfloat8. However, bfloat8 also explicitly stores the hidden bit of 1, so we are really rounding to 6 bits total. Consider the following 23 bits of mantissa:
+
+<img width="803" alt="image" src="https://github.com/user-attachments/assets/d8d17ad0-8679-406c-9587-1661f2319965" />
+
+To get the 7 bits of mantissa for bfloat8, we keep 6 bits of the original 23-bit mantissa and store the additional hidden bit as the most significant bit (MSB). The least significant bit (LSB) of the 6-bit mantissa that is kept is known as the guard bit, which we use to round to the nearest even value when there is a tie. In other implementations or literature, the MSB of the round value is also known as the round bit, with the remaining bits denoted as the sticky bit(s), but the result is the same. In host code, the rounding is done with the following process:
+
+<img width="1041" alt="image" src="https://github.com/user-attachments/assets/9adaf40a-750c-4c5c-8ec6-c7ff2fbb2bf9" />
+
+To handle exponent sharing, the mantissa is first normalized prior to rounding if its exponent differs from the shared exponent. If the mantissa overflows when we round up, we do not recompute the max shared exponent and re-normalize across the 16 numbers. Instead, the mantissa is set to the max value (i.e., all 1's). For the other block float formats, the same process applies, but with the corresponding number of bits for the mantissa and round value.
diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -23,9 +23,9 @@ run_t3000_ttmetal_tests() {
   ./build/test/tt_metal/unit_tests_debug_tools_${ARCH_NAME} --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$?
 
   # Programming examples
-  ./build/test/tt_metal/programming_examples/distributed/distributed_program_dispatch
-  ./build/test/tt_metal/programming_examples/distributed/distributed_buffer_rw
-  ./build/test/tt_metal/programming_examples/distributed/distributed_eltwise_add
+  ./build/programming_examples/distributed/distributed_program_dispatch
+  ./build/programming_examples/distributed/distributed_buffer_rw
+  ./build/programming_examples/distributed/distributed_eltwise_add
 
   # Record the end time
   end_time=$(date +%s)

diff --git a/.../tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py b/.../tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import sys
+
+from loguru import logger
+import pytest
+import csv
+from tt_metal.tools.profiler.process_device_log import import_log_run_stats
+import tt_metal.tools.profiler.device_post_proc_config as device_post_proc_config
+
+from models.utility_functions import is_grayskull
+
+from tt_metal.tools.profiler.common import PROFILER_LOGS_DIR, PROFILER_DEVICE_SIDE_LOG
+
+# Device-side profiler log that get_device_freq() and profile_results() feed to the post-processor.
+profiler_log_path = PROFILER_LOGS_DIR / PROFILER_DEVICE_SIDE_LOG
+
+# CSV file that profile_results() appends one (sample size, bandwidth) row to per call.
+FILE_NAME = PROFILER_LOGS_DIR / "test_ethernet_link_write_worker_latency.csv"
+
+# Executed at import time: delete a results CSV left on disk before this module was loaded.
+# NOTE(review): presumably so rows from an earlier run are not mixed with this run's rows - confirm.
+if os.path.exists(FILE_NAME):
+    os.remove(FILE_NAME)
+
+
+def append_to_csv(file_path, header, data, write_header=True):
+    """Append ``data`` as one CSV row to ``file_path``, optionally preceded by ``header``.
+
+    The header row is written when ``write_header`` is truthy or when the file
+    did not exist before this call. The file is opened in append mode, so it is
+    created on demand and earlier rows are preserved.
+    """
+    # Probe before opening: mode "a" creates the file, which would hide that it was missing.
+    already_present = os.path.isfile(file_path)
+    with open(file_path, "a", newline="") as out:
+        rows = csv.writer(out)
+        if write_header or not already_present:
+            rows.writerow(header)
+        rows.writerow(data)
+
+
+def get_device_freq():
+    """Return the ``freq`` entry of ``deviceInfo`` parsed from the device profiler log."""
+    post_proc_setup = device_post_proc_config.default_setup()
+    post_proc_setup.deviceInputLog = profiler_log_path
+    return import_log_run_stats(post_proc_setup)["deviceInfo"]["freq"]
+
+
+def profile_results(sample_size, sample_count, channel_count):
+    """Compute the per-sample latency from the profiler log and record the bandwidth.
+
+    The device profiler log is post-processed with a single "MAIN-TEST-BODY"
+    timer analysis (adjacent ERISC zones, across the device). The "Average"
+    statistic of the first device listed in the results is divided by the
+    scaled device frequency, ``sample_count`` and ``channel_count`` to obtain
+    the latency. One ``[sample_size, bandwidth]`` row is appended to
+    ``FILE_NAME``; the header row is written only if that file does not exist yet.
+
+    Args:
+        sample_size: Value stored in the SAMPLE_SIZE column and used as the
+            numerator of the bandwidth figure.
+        sample_count: Divisor applied to the averaged duration.
+        channel_count: Divisor applied to the averaged duration.
+
+    Returns:
+        The computed latency (averaged duration / (freq / 1000) / sample_count / channel_count).
+    """
+    freq = get_device_freq() / 1000.0
+    setup = device_post_proc_config.default_setup()
+    setup.deviceInputLog = profiler_log_path
+    main_test_body_string = "MAIN-TEST-BODY"
+    setup.timerAnalysis = {
+        main_test_body_string: {
+            "across": "device",
+            "type": "adjacent",
+            "start": {"core": "ANY", "risc": "ERISC", "zone_name": main_test_body_string},
+            "end": {"core": "ANY", "risc": "ERISC", "zone_name": main_test_body_string},
+        },
+    }
+    devices_data = import_log_run_stats(setup)
+    # Only the first device in the log is analysed. The former, unused lookup of a
+    # second device was dropped because it raised IndexError on single-device logs.
+    device_0 = list(devices_data["devices"])[0]
+
+    # MAIN-TEST-BODY
+    main_loop_cycle = devices_data["devices"][device_0]["cores"]["DEVICE"]["analysis"][main_test_body_string]["stats"][
+        "Average"
+    ]
+    main_loop_latency = main_loop_cycle / freq / sample_count / channel_count
+    bw = sample_size / main_loop_latency
+
+    header = [
+        "SAMPLE_SIZE",
+        "BW (B/c)",
+    ]
+    # Decide on the header before append_to_csv() creates the file.
+    write_header = not os.path.exists(FILE_NAME)
+    append_to_csv(
+        FILE_NAME,
+        header,
+        [sample_size, bw],
+        write_header,
+    )
+    return main_loop_latency
+
+
+@pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS")
+@pytest.mark.parametrize("sample_count", [256])
+@pytest.mark.parametrize("channel_count", [16])
+@pytest.mark.parametrize(
+    "sample_size_expected_latency",
+    [(16, 86.2), (128, 86.2), (256, 86.4), (512, 86.5), (1024, 87.2), (2048, 172.9), (4096, 339.9), (8192, 678.4)],
+)
+def test_erisc_write_worker_latency(sample_count, sample_size_expected_latency, channel_count):
+    """Run the ERISC write-worker latency benchmark and check the measured latency.
+
+    Each ``sample_size_expected_latency`` entry is a ``(sample_size, expected_latency)``
+    pair. The benchmark binary is launched with the device profiler enabled, the
+    resulting log is post-processed by ``profile_results`` and the latency must
+    fall within +/- 0.5 of the expected value.
+    """
+    # Drop the device profiler log of any previous run before launching the benchmark.
+    os.system(f"rm -rf {os.environ['TT_METAL_HOME']}/generated/profiler/.logs/profile_log_device.csv")
+
+    sample_size, expected_latency = sample_size_expected_latency
+    expected_latency_lower_bound = expected_latency - 0.5
+    expected_latency_upper_bound = expected_latency + 0.5
+
+    ARCH_NAME = os.getenv("ARCH_NAME")
+    # Without ARCH_NAME the binary path below would end in "_None"; fail with a clear message instead.
+    assert ARCH_NAME, "ARCH_NAME environment variable must be set to locate the benchmark binary"
+    cmd = f"TT_METAL_DEVICE_PROFILER=1 \
+            {os.environ['TT_METAL_HOME']}/build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_{ARCH_NAME} \
+                {sample_count} \
+                {sample_size} \
+                {channel_count} "
+    rc = os.system(cmd)
+    if rc != 0:
+        logger.error("Error in running the test")
+    # Carry the exit status and command in the failure message instead of a bare `assert False`.
+    assert rc == 0, f"benchmark command exited with status {rc}: {cmd}"
+
+    main_loop_latency = profile_results(sample_size, sample_count, channel_count)
+    logger.info(f"sender_loop_latency {main_loop_latency}")
+    logger.info(f"result BW (B/c): {sample_size / main_loop_latency}")
+
+    assert expected_latency_lower_bound <= main_loop_latency <= expected_latency_upper_bound, (
+        f"latency {main_loop_latency} outside "
+        f"[{expected_latency_lower_bound}, {expected_latency_upper_bound}] for sample size {sample_size}"
+    )
diff --git a/...metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/...metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
@@ -1013,9 +1013,9 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestReadWriteShardedSubBufferForL1)
     const std::vector<ShardedSubBufferStressTestConfig>& configs =
         local_test_functions::generate_sharded_sub_buffer_test_configs(max_buffer_size);
     for (IDevice* device : devices_) {
-        tt::log_info("Running on Device {}", device->id());
+        tt::log_debug("Running on Device {}", device->id());
         for (const ShardedSubBufferStressTestConfig& config : configs) {
-            tt::log_info(
+            tt::log_debug(
                 tt::LogTest,
                 "Device: {} buffer_size: {} page_size: {} region_offset: {} region_size: {} shard_shape: [{}, {}] "
                 "page_shape: [{}, {}] tensor2d_shape: [{}, {}] layout: {} orientation: {} cores: {}",

diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -7,6 +7,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
     ethernet/test_workers_and_erisc_datamover_unidirectional.cpp
     ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
     ethernet/test_ethernet_link_ping_latency_no_edm.cpp
+    ethernet/test_ethernet_write_worker_latency_no_edm.cpp
     ethernet/test_ethernet_hop_latencies_no_edm.cpp
     routing/test_tx_rx.cpp
     routing/test_mux_demux.cpp