diff --git a/.github/workflows/build-docker-artifact.yaml b/.github/workflows/build-docker-artifact.yaml
index a048a0b09f9..402fcfc678a 100644
--- a/.github/workflows/build-docker-artifact.yaml
+++ b/.github/workflows/build-docker-artifact.yaml
@@ -39,7 +39,7 @@ on:
- "amd64"
jobs:
build-docker-image:
- name: "🐳️ Build ${{ inputs.distro }} {inputs.version }} image"
+ name: "🐳️ Build ${{ inputs.distro }} ${{inputs.version }} image"
timeout-minutes: 30
env:
CONFIG: ci
diff --git a/.github/workflows/pr-gate.yaml b/.github/workflows/pr-gate.yaml
index 83d993ffbbb..0e99878ae77 100644
--- a/.github/workflows/pr-gate.yaml
+++ b/.github/workflows/pr-gate.yaml
@@ -37,4 +37,4 @@ jobs:
if: github.event_name != 'pull_request' || !github.event.pull_request.draft
uses: ./.github/workflows/build-artifact.yaml
with:
- os: "ubuntu-22.04-amd64"
+ version: "22.04"
diff --git a/tech_reports/data_formats/data_formats.md b/tech_reports/data_formats/data_formats.md
index b03426eaa5d..656e587dfe7 100644
--- a/tech_reports/data_formats/data_formats.md
+++ b/tech_reports/data_formats/data_formats.md
@@ -8,3 +8,13 @@
+## Mantissa Rounding
+When converting from a higher precision to a lower precision data format, the mantissa is rounded to the nearest representable value. If the value to round is exactly halfway between two representable values (a tie), then it rounds to the nearest even value for the mantissa. For example, when converting from float32 to bfloat8, we want to round 23 bits of mantissa for float32 to 7 bits of mantissa for bfloat8. However, we also explicitly store the hidden bit of 1 for bfloat8, so we are really rounding to 6 bits total. Consider the following 23 bits of mantissa:
+
+
+
+To get the 7 bits of mantissa for bfloat8, we want to keep 6 bits of the original 23-bit mantissa and store the additional hidden bit at the most significant bit (MSB). The least significant bit (LSB) of the 6-bit mantissa to keep is known as the guard bit, which we use to round to the nearest even (if there is a tie). In other implementations or literature, the MSB of the round value is also known as the round bit with the remaining bits denoted as the sticky bit(s), but the result is the same. In host code, the rounding is done with the following process:
+
+
+
+To handle exponent sharing, the mantissa is first normalized prior to rounding if the exponent is different from the shared exponent. If there is an overflow in the mantissa when we round up, we do not recompute the max shared exponent and re-normalize across the 16 numbers. Instead, the mantissa is set to the max value (i.e., all 1's). For the other block float formats, the same process applies but with the corresponding number of bits for the mantissa and round value.
diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh
index 993c621f315..b2112c7493e 100755
--- a/tests/scripts/t3000/run_t3000_unit_tests.sh
+++ b/tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -23,9 +23,9 @@ run_t3000_ttmetal_tests() {
./build/test/tt_metal/unit_tests_debug_tools_${ARCH_NAME} --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$?
# Programming examples
- ./build/test/tt_metal/programming_examples/distributed/distributed_program_dispatch
- ./build/test/tt_metal/programming_examples/distributed/distributed_buffer_rw
- ./build/test/tt_metal/programming_examples/distributed/distributed_eltwise_add
+ ./build/programming_examples/distributed/distributed_program_dispatch
+ ./build/programming_examples/distributed/distributed_buffer_rw
+ ./build/programming_examples/distributed/distributed_eltwise_add
# Record the end time
end_time=$(date +%s)
diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py
new file mode 100644
index 00000000000..b532a5bc6e8
--- /dev/null
+++ b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import sys
+
+from loguru import logger
+import pytest
+import csv
+from tt_metal.tools.profiler.process_device_log import import_log_run_stats
+import tt_metal.tools.profiler.device_post_proc_config as device_post_proc_config
+
+from models.utility_functions import is_grayskull
+
+from tt_metal.tools.profiler.common import PROFILER_LOGS_DIR, PROFILER_DEVICE_SIDE_LOG
+
+# Device-side profiler log that the post-processing helpers in this module parse.
+profiler_log_path = PROFILER_LOGS_DIR / PROFILER_DEVICE_SIDE_LOG
+
+# CSV file to which profile_results() appends one result row per call.
+FILE_NAME = PROFILER_LOGS_DIR / "test_ethernet_link_write_worker_latency.csv"
+
+# Executed at import time: remove a results CSV left over from an earlier
+# session so rows from different sessions are not mixed in one file.
+if os.path.exists(FILE_NAME):
+ os.remove(FILE_NAME)
+
+
+def append_to_csv(file_path, header, data, write_header=True):
+ file_exists = os.path.isfile(file_path)
+ with open(file_path, "a", newline="") as csvfile:
+ writer = csv.writer(csvfile)
+ if not file_exists or write_header:
+ writer.writerow(header)
+ writer.writerows([data])
+
+
+def get_device_freq():
+ setup = device_post_proc_config.default_setup()
+ setup.deviceInputLog = profiler_log_path
+ deviceData = import_log_run_stats(setup)
+ freq = deviceData["deviceInfo"]["freq"]
+ return freq
+
+
+def profile_results(sample_size, sample_count, channel_count):
+ freq = get_device_freq() / 1000.0
+ setup = device_post_proc_config.default_setup()
+ setup.deviceInputLog = profiler_log_path
+ main_test_body_string = "MAIN-TEST-BODY"
+ setup.timerAnalysis = {
+ main_test_body_string: {
+ "across": "device",
+ "type": "adjacent",
+ "start": {"core": "ANY", "risc": "ERISC", "zone_name": main_test_body_string},
+ "end": {"core": "ANY", "risc": "ERISC", "zone_name": main_test_body_string},
+ },
+ }
+ devices_data = import_log_run_stats(setup)
+ device_0 = list(devices_data["devices"].keys())[0]
+ device_1 = list(devices_data["devices"].keys())[1]
+
+ # MAIN-TEST-BODY
+ main_loop_cycle = devices_data["devices"][device_0]["cores"]["DEVICE"]["analysis"][main_test_body_string]["stats"][
+ "Average"
+ ]
+ main_loop_latency = main_loop_cycle / freq / sample_count / channel_count
+ bw = sample_size / main_loop_latency
+
+ header = [
+ "SAMPLE_SIZE",
+ "BW (B/c)",
+ ]
+ write_header = not os.path.exists(FILE_NAME)
+ append_to_csv(
+ FILE_NAME,
+ header,
+ [sample_size, bw],
+ write_header,
+ )
+ return main_loop_latency
+
+
+@pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS")
+@pytest.mark.parametrize("sample_count", [256])
+@pytest.mark.parametrize("channel_count", [16])
+@pytest.mark.parametrize(
+ "sample_size_expected_latency",
+ [(16, 86.2), (128, 86.2), (256, 86.4), (512, 86.5), (1024, 87.2), (2048, 172.9), (4096, 339.9), (8192, 678.4)],
+)
+def test_erisc_write_worker_latency(sample_count, sample_size_expected_latency, channel_count):
+ os.system(f"rm -rf {os.environ['TT_METAL_HOME']}/generated/profiler/.logs/profile_log_device.csv")
+
+ sample_size = sample_size_expected_latency[0]
+ expected_latency = sample_size_expected_latency[1]
+ expected_latency_lower_bound = expected_latency - 0.5
+ expected_latency_upper_bound = expected_latency + 0.5
+
+ ARCH_NAME = os.getenv("ARCH_NAME")
+ cmd = f"TT_METAL_DEVICE_PROFILER=1 \
+ {os.environ['TT_METAL_HOME']}/build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_{ARCH_NAME} \
+ {sample_count} \
+ {sample_size} \
+ {channel_count} "
+ rc = os.system(cmd)
+ if rc != 0:
+ logger.info("Error in running the test")
+ assert False
+
+ main_loop_latency = profile_results(sample_size, sample_count, channel_count)
+ logger.info(f"sender_loop_latency {main_loop_latency}")
+ logger.info(f"result BW (B/c): {sample_size / main_loop_latency}")
+
+ assert expected_latency_lower_bound <= main_loop_latency <= expected_latency_upper_bound
diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
index 8ef873c9f97..4b5b1826c97 100644
--- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
+++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
@@ -1013,9 +1013,9 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestReadWriteShardedSubBufferForL1)
const std::vector& configs =
local_test_functions::generate_sharded_sub_buffer_test_configs(max_buffer_size);
for (IDevice* device : devices_) {
- tt::log_info("Running on Device {}", device->id());
+ tt::log_debug("Running on Device {}", device->id());
for (const ShardedSubBufferStressTestConfig& config : configs) {
- tt::log_info(
+ tt::log_debug(
tt::LogTest,
"Device: {} buffer_size: {} page_size: {} region_offset: {} region_size: {} shard_shape: [{}, {}] "
"page_shape: [{}, {}] tensor2d_shape: [{}, {}] layout: {} orientation: {} cores: {}",
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
index 31e3648d336..7573ef25f91 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -7,6 +7,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
ethernet/test_workers_and_erisc_datamover_unidirectional.cpp
ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
ethernet/test_ethernet_link_ping_latency_no_edm.cpp
+ ethernet/test_ethernet_write_worker_latency_no_edm.cpp
ethernet/test_ethernet_hop_latencies_no_edm.cpp
routing/test_tx_rx.cpp
routing/test_mux_demux.cpp
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp
new file mode 100644
index 00000000000..95109747866
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp
@@ -0,0 +1,267 @@
+
+// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include