diff --git a/.github/workflows/build-docker-artifact.yaml b/.github/workflows/build-docker-artifact.yaml
index a048a0b09f9..402fcfc678a 100644
--- a/.github/workflows/build-docker-artifact.yaml
+++ b/.github/workflows/build-docker-artifact.yaml
@@ -39,7 +39,7 @@ on:
             - "amd64"
 jobs:
   build-docker-image:
-    name: "🐳️ Build ${{ inputs.distro }} {inputs.version }} image"
+    name: "🐳️ Build ${{ inputs.distro }} ${{inputs.version }} image"
     timeout-minutes: 30
     env:
       CONFIG: ci
diff --git a/.github/workflows/pr-gate.yaml b/.github/workflows/pr-gate.yaml
index 83d993ffbbb..0e99878ae77 100644
--- a/.github/workflows/pr-gate.yaml
+++ b/.github/workflows/pr-gate.yaml
@@ -37,4 +37,4 @@ jobs:
     if: github.event_name != 'pull_request' || !github.event.pull_request.draft
     uses: ./.github/workflows/build-artifact.yaml
     with:
-      os: "ubuntu-22.04-amd64"
+      version: "22.04"
diff --git a/tech_reports/data_formats/data_formats.md b/tech_reports/data_formats/data_formats.md
index b03426eaa5d..656e587dfe7 100644
--- a/tech_reports/data_formats/data_formats.md
+++ b/tech_reports/data_formats/data_formats.md
@@ -8,3 +8,13 @@
 
 <img width="961" alt="image" src="https://github.com/user-attachments/assets/e1f54311-e6a6-48f3-9030-192de985b2ce">
 
+## Mantissa Rounding
+When converting from a higher precision to lower precision data format, the mantissa is rounded to the nearest. If the value to round is tied, then it rounds to the nearest even value for the mantissa. For example, when converting from float32 to bfloat8, we want to round 23 bits of mantissa for float32 to 7 bits of mantissa for bfloat8. However, we also explicitly store the hidden bit of 1 for bfloat8, so we are really rounding to 6 bits total. Consider the following 23 bits of mantissa:
+
+<img width="803" alt="image" src="https://github.com/user-attachments/assets/d8d17ad0-8679-406c-9587-1661f2319965" />
+
+To get the 7 bits of mantissa for bfloat8, we want to keep 6 bits of the original 23-bit mantissa and store the additional hidden bit at the most significant bit (MSB). The least significant bit (LSB) of the 6-bit mantissa to keep is known as the guard bit, which we use to round to the nearest even (if there is a tie). In other implementations or literature, the MSB of the round value is also known as the round bit with the remaining bits denoted as the sticky bit(s), but the result is the same. In host code, the rounding is done with the following process:
+
+<img width="1041" alt="image" src="https://github.com/user-attachments/assets/9adaf40a-750c-4c5c-8ec6-c7ff2fbb2bf9" />
+
+To handle exponent sharing, the mantissa is first normalized prior to rounding if the exponent is different from the shared exponent. If there is an overflow in the mantissa when we round up, we do not recompute the max shared exponent and re-normalize across the 16 numbers. Instead, the mantissa is set to the max value (ie. all 1's). For the other block float formats, the same process applies but with the corresponding number of bits for the mantissa and round value.
diff --git a/tests/scripts/t3000/run_t3000_unit_tests.sh b/tests/scripts/t3000/run_t3000_unit_tests.sh
index 993c621f315..b2112c7493e 100755
--- a/tests/scripts/t3000/run_t3000_unit_tests.sh
+++ b/tests/scripts/t3000/run_t3000_unit_tests.sh
@@ -23,9 +23,9 @@ run_t3000_ttmetal_tests() {
   ./build/test/tt_metal/unit_tests_debug_tools_${ARCH_NAME} --gtest_filter="DPrintFixture.*:WatcherFixture.*" ; fail+=$?
 
   # Programming examples
-  ./build/test/tt_metal/programming_examples/distributed/distributed_program_dispatch
-  ./build/test/tt_metal/programming_examples/distributed/distributed_buffer_rw
-  ./build/test/tt_metal/programming_examples/distributed/distributed_eltwise_add
+  ./build/programming_examples/distributed/distributed_program_dispatch
+  ./build/programming_examples/distributed/distributed_buffer_rw
+  ./build/programming_examples/distributed/distributed_eltwise_add
 
   # Record the end time
   end_time=$(date +%s)
diff --git a/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py
new file mode 100644
index 00000000000..b532a5bc6e8
--- /dev/null
+++ b/tests/tt_metal/microbenchmarks/ethernet/test_ethernet_link_write_worker_with_transaction_id.py
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import sys
+
+from loguru import logger
+import pytest
+import csv
+from tt_metal.tools.profiler.process_device_log import import_log_run_stats
+import tt_metal.tools.profiler.device_post_proc_config as device_post_proc_config
+
+from models.utility_functions import is_grayskull
+
+from tt_metal.tools.profiler.common import PROFILER_LOGS_DIR, PROFILER_DEVICE_SIDE_LOG
+
+profiler_log_path = PROFILER_LOGS_DIR / PROFILER_DEVICE_SIDE_LOG
+
+FILE_NAME = PROFILER_LOGS_DIR / "test_ethernet_link_write_worker_latency.csv"
+
+if os.path.exists(FILE_NAME):
+    os.remove(FILE_NAME)
+
+
+def append_to_csv(file_path, header, data, write_header=True):
+    file_exists = os.path.isfile(file_path)
+    with open(file_path, "a", newline="") as csvfile:
+        writer = csv.writer(csvfile)
+        if not file_exists or write_header:
+            writer.writerow(header)
+        writer.writerows([data])
+
+
+def get_device_freq():
+    setup = device_post_proc_config.default_setup()
+    setup.deviceInputLog = profiler_log_path
+    deviceData = import_log_run_stats(setup)
+    freq = deviceData["deviceInfo"]["freq"]
+    return freq
+
+
+def profile_results(sample_size, sample_count, channel_count):
+    freq = get_device_freq() / 1000.0
+    setup = device_post_proc_config.default_setup()
+    setup.deviceInputLog = profiler_log_path
+    main_test_body_string = "MAIN-TEST-BODY"
+    setup.timerAnalysis = {
+        main_test_body_string: {
+            "across": "device",
+            "type": "adjacent",
+            "start": {"core": "ANY", "risc": "ERISC", "zone_name": main_test_body_string},
+            "end": {"core": "ANY", "risc": "ERISC", "zone_name": main_test_body_string},
+        },
+    }
+    devices_data = import_log_run_stats(setup)
+    device_0 = list(devices_data["devices"].keys())[0]
+    device_1 = list(devices_data["devices"].keys())[1]
+
+    # MAIN-TEST-BODY
+    main_loop_cycle = devices_data["devices"][device_0]["cores"]["DEVICE"]["analysis"][main_test_body_string]["stats"][
+        "Average"
+    ]
+    main_loop_latency = main_loop_cycle / freq / sample_count / channel_count
+    bw = sample_size / main_loop_latency
+
+    header = [
+        "SAMPLE_SIZE",
+        "BW (B/c)",
+    ]
+    write_header = not os.path.exists(FILE_NAME)
+    append_to_csv(
+        FILE_NAME,
+        header,
+        [sample_size, bw],
+        write_header,
+    )
+    return main_loop_latency
+
+
+@pytest.mark.skipif(is_grayskull(), reason="Unsupported on GS")
+@pytest.mark.parametrize("sample_count", [256])
+@pytest.mark.parametrize("channel_count", [16])
+@pytest.mark.parametrize(
+    "sample_size_expected_latency",
+    [(16, 86.2), (128, 86.2), (256, 86.4), (512, 86.5), (1024, 87.2), (2048, 172.9), (4096, 339.9), (8192, 678.4)],
+)
+def test_erisc_write_worker_latency(sample_count, sample_size_expected_latency, channel_count):
+    os.system(f"rm -rf {os.environ['TT_METAL_HOME']}/generated/profiler/.logs/profile_log_device.csv")
+
+    sample_size = sample_size_expected_latency[0]
+    expected_latency = sample_size_expected_latency[1]
+    expected_latency_lower_bound = expected_latency - 0.5
+    expected_latency_upper_bound = expected_latency + 0.5
+
+    ARCH_NAME = os.getenv("ARCH_NAME")
+    cmd = f"TT_METAL_DEVICE_PROFILER=1 \
+            {os.environ['TT_METAL_HOME']}/build/test/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm_{ARCH_NAME} \
+                {sample_count} \
+                {sample_size} \
+                {channel_count} "
+    rc = os.system(cmd)
+    if rc != 0:
+        logger.info("Error in running the test")
+        assert False
+
+    main_loop_latency = profile_results(sample_size, sample_count, channel_count)
+    logger.info(f"sender_loop_latency {main_loop_latency}")
+    logger.info(f"result BW (B/c): {sample_size / main_loop_latency}")
+
+    assert expected_latency_lower_bound <= main_loop_latency <= expected_latency_upper_bound
diff --git a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
index 8ef873c9f97..4b5b1826c97 100644
--- a/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
+++ b/tests/tt_metal/tt_metal/dispatch/dispatch_buffer/test_EnqueueWriteBuffer_and_EnqueueReadBuffer.cpp
@@ -1013,9 +1013,9 @@ TEST_F(CommandQueueSingleCardBufferFixture, TestReadWriteShardedSubBufferForL1)
     const std::vector<ShardedSubBufferStressTestConfig>& configs =
         local_test_functions::generate_sharded_sub_buffer_test_configs(max_buffer_size);
     for (IDevice* device : devices_) {
-        tt::log_info("Running on Device {}", device->id());
+        tt::log_debug("Running on Device {}", device->id());
         for (const ShardedSubBufferStressTestConfig& config : configs) {
-            tt::log_info(
+            tt::log_debug(
                 tt::LogTest,
                 "Device: {} buffer_size: {} page_size: {} region_offset: {} region_size: {} shard_shape: [{}, {}] "
                 "page_shape: [{}, {}] tensor2d_shape: [{}, {}] layout: {} orientation: {} cores: {}",
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
index 31e3648d336..7573ef25f91 100644
--- a/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/CMakeLists.txt
@@ -7,6 +7,7 @@ set(PERF_MICROBENCH_TESTS_SRCS
     ethernet/test_workers_and_erisc_datamover_unidirectional.cpp
     ethernet/test_ethernet_bidirectional_bandwidth_no_edm.cpp
     ethernet/test_ethernet_link_ping_latency_no_edm.cpp
+    ethernet/test_ethernet_write_worker_latency_no_edm.cpp
     ethernet/test_ethernet_hop_latencies_no_edm.cpp
     routing/test_tx_rx.cpp
     routing/test_mux_demux.cpp
diff --git a/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp
new file mode 100644
index 00000000000..95109747866
--- /dev/null
+++ b/tests/tt_metal/tt_metal/perf_microbenchmark/ethernet/test_ethernet_write_worker_latency_no_edm.cpp
@@ -0,0 +1,267 @@
+
+// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <algorithm>
+#include <functional>
+#include <limits>
+#include <random>
+#include <tuple>
+#include <map>
+
+#include "umd/device/types/arch.h"
+#include <tt-metalium/device_impl.hpp>
+#include <tt-metalium/kernel_types.hpp>
+#include "tt_backend_api_types.hpp"
+#include <tt-metalium/core_coord.hpp>
+#include <tt-metalium/math.hpp>
+#include <tt-metalium/tt_metal.hpp>
+#include <tt-metalium/host_api.hpp>
+#include <tt-metalium/kernel.hpp>
+#include "tt_metal/test_utils/comparison.hpp"
+#include "tt_metal/test_utils/df/df.hpp"
+#include "tt_metal/test_utils/env_vars.hpp"
+#include "tt_metal/test_utils/print_helpers.hpp"
+#include "tt_metal/test_utils/stimulus.hpp"
+
+#include <tt-metalium/persistent_kernel_cache.hpp>
+
+// TODO: ARCH_NAME specific, must remove
+#include "eth_l1_address_map.h"
+
+using namespace tt;
+using namespace tt::test_utils;
+using namespace tt::test_utils::df;
+
+class N300TestDevice {
+public:
+    N300TestDevice() : device_open(false) {
+        arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
+
+        num_devices_ = tt::tt_metal::GetNumAvailableDevices();
+        if (arch_ == tt::ARCH::WORMHOLE_B0 and tt::tt_metal::GetNumAvailableDevices() >= 2 and
+            tt::tt_metal::GetNumPCIeDevices() >= 1) {
+            std::vector<chip_id_t> ids(num_devices_, 0);
+            std::iota(ids.begin(), ids.end(), 0);
+            devices_ = tt::tt_metal::detail::CreateDevices(ids);
+
+        } else {
+            TT_THROW("This suite can only be run on N300 Wormhole devices");
+        }
+        device_open = true;
+    }
+    ~N300TestDevice() {
+        if (device_open) {
+            TearDown();
+        }
+    }
+
+    void TearDown() {
+        device_open = false;
+        for (auto [device_id, device_ptr] : devices_) {
+            tt::tt_metal::CloseDevice(device_ptr);
+        }
+    }
+
+    std::map<chip_id_t, IDevice*> devices_;
+    tt::ARCH arch_;
+    size_t num_devices_;
+
+private:
+    bool device_open;
+};
+
+void validation(const std::shared_ptr<tt::tt_metal::Buffer>& worker_buffer) {
+    std::vector<uint8_t> golden_vec(worker_buffer->size(), 0);
+    std::vector<uint8_t> result_vec(worker_buffer->size(), 0);
+
+    for (int i = 0; i < worker_buffer->size(); ++i) {
+        golden_vec[i] = i;
+    }
+    tt::tt_metal::detail::ReadFromBuffer(worker_buffer, result_vec);
+
+    bool pass = golden_vec == result_vec;
+    TT_FATAL(pass, "validation failed");
+}
+
+std::vector<Program> build(
+    IDevice* device0,
+    IDevice* device1,
+    CoreCoord eth_sender_core,
+    CoreCoord eth_receiver_core,
+    CoreCoord worker_core,
+    std::size_t num_samples,
+    std::size_t sample_page_size,
+    std::size_t num_buffer_slots,
+    KernelHandle& local_kernel,
+    KernelHandle& remote_kernel,
+    std::shared_ptr<Buffer>& worker_buffer) {
+    Program program0;
+    Program program1;
+
+    // worker core coords
+    uint32_t worker_noc_x = device1->worker_core_from_logical_core(worker_core).x;
+    uint32_t worker_noc_y = device1->worker_core_from_logical_core(worker_core).y;
+
+    uint32_t worker_buffer_addr = worker_buffer->address();
+
+    // eth core ct args
+    const std::vector<uint32_t>& eth_sender_ct_args = {num_buffer_slots};
+    const std::vector<uint32_t>& eth_receiver_ct_args = {
+        num_buffer_slots, worker_noc_x, worker_noc_y, worker_buffer_addr};
+
+    // eth core rt args
+    const std::vector<uint32_t>& eth_sender_receiver_rt_args = {
+        tt_metal::hal.get_dev_addr(HalProgrammableCoreType::ACTIVE_ETH, HalL1MemAddrType::UNRESERVED),
+        static_cast<uint32_t>(num_samples),
+        static_cast<uint32_t>(sample_page_size)};
+
+    local_kernel = tt_metal::CreateKernel(
+        program0,
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/"
+        "ethernet_write_worker_latency_ubench_sender.cpp",
+        eth_sender_core,
+        tt_metal::EthernetConfig{.noc = tt_metal::NOC::RISCV_0_default, .compile_args = eth_sender_ct_args});
+    tt_metal::SetRuntimeArgs(program0, local_kernel, eth_sender_core, eth_sender_receiver_rt_args);
+
+    remote_kernel = tt_metal::CreateKernel(
+        program1,
+        "tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/"
+        "ethernet_write_worker_latency_ubench_receiver.cpp",
+        eth_receiver_core,
+        tt_metal::EthernetConfig{.noc = tt_metal::NOC::RISCV_0_default, .compile_args = eth_receiver_ct_args});
+    tt_metal::SetRuntimeArgs(program1, remote_kernel, eth_receiver_core, eth_sender_receiver_rt_args);
+
+    // Launch
+    try {
+        tt::tt_metal::detail::CompileProgram(device0, program0);
+        tt::tt_metal::detail::CompileProgram(device1, program1);
+    } catch (std::exception& e) {
+        log_error(tt::LogTest, "Failed compile: {}", e.what());
+        throw e;
+    }
+
+    std::vector<Program> programs;
+    programs.push_back(std::move(program0));
+    programs.push_back(std::move(program1));
+    return programs;
+}
+
+void run(
+    IDevice* device0, IDevice* device1, Program& program0, Program& program1, std::shared_ptr<Buffer>& worker_buffer) {
+    if (std::getenv("TT_METAL_SLOW_DISPATCH_MODE")) {
+        std::thread th2 = std::thread([&] { tt_metal::detail::LaunchProgram(device0, program0); });
+        std::thread th1 = std::thread([&] { tt_metal::detail::LaunchProgram(device1, program1); });
+
+        th2.join();
+        th1.join();
+    } else {
+        tt_metal::EnqueueProgram(device0->command_queue(), program0, false);
+        tt_metal::EnqueueProgram(device1->command_queue(), program1, false);
+
+        log_info(tt::LogTest, "Calling Finish");
+        tt_metal::Finish(device0->command_queue());
+        tt_metal::Finish(device1->command_queue());
+    }
+    tt::tt_metal::detail::DumpDeviceProfileResults(device0);
+    tt::tt_metal::detail::DumpDeviceProfileResults(device1);
+
+    validation(worker_buffer);
+}
+
+int main(int argc, char** argv) {
+    std::size_t arg_idx = 1;
+    std::size_t num_samples = std::stoi(argv[arg_idx++]);
+    std::size_t sample_page_size = std::stoi(argv[arg_idx++]);
+    std::size_t num_buffer_slots = std::stoi(argv[arg_idx++]);
+
+    auto arch = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name());
+    auto num_devices = tt::tt_metal::GetNumAvailableDevices();
+    if (num_devices < 2) {
+        log_info(tt::LogTest, "Need at least 2 devices to run this test");
+        return 0;
+    }
+    if (arch == tt::ARCH::GRAYSKULL) {
+        log_info(tt::LogTest, "Test must be run on WH");
+        return 0;
+    }
+
+    log_info(tt::LogTest, "setting up test fixture");
+    N300TestDevice test_fixture;
+    log_info(tt::LogTest, "done setting up test fixture");
+
+    const auto& device_0 = test_fixture.devices_.at(0);
+    const auto& active_eth_cores = device_0->get_active_ethernet_cores(true);
+    auto eth_sender_core_iter = active_eth_cores.begin();
+    auto eth_sender_core_iter_end = active_eth_cores.end();
+    chip_id_t device_id = std::numeric_limits<chip_id_t>::max();
+    tt_xy_pair eth_receiver_core;
+    bool initialized = false;
+    tt_xy_pair eth_sender_core;
+    do {
+        TT_ASSERT(eth_sender_core_iter != eth_sender_core_iter_end);
+        std::tie(device_id, eth_receiver_core) = device_0->get_connected_ethernet_core(*eth_sender_core_iter);
+        eth_sender_core = *eth_sender_core_iter;
+        eth_sender_core_iter++;
+    } while (device_id != 1);
+
+    log_info(tt::LogTest, "eth_sender_core: {}", eth_sender_core);
+    log_info(tt::LogTest, "eth_receiver_core: {}", eth_receiver_core);
+
+    TT_ASSERT(device_id == 1);
+    const auto& device_1 = test_fixture.devices_.at(device_id);
+    // worker
+    auto worker_core = CoreCoord(0, 0);
+    // Add more configurations here until proper argc parsing added
+    bool success = false;
+    success = true;
+    log_info(tt::LogTest, "STARTING");
+    try {
+        log_info(
+            tt::LogTest,
+            "num_samples: {}, sample_page_size: {}, num_buffer_slots: {}",
+            num_samples,
+            sample_page_size,
+            num_buffer_slots);
+        KernelHandle local_kernel;
+        KernelHandle remote_kernel;
+        try {
+            ShardSpecBuffer shard_spec = ShardSpecBuffer(
+                CoreRangeSet(std::set<CoreRange>({CoreRange(worker_core)})),
+                {1, sample_page_size},
+                ShardOrientation::ROW_MAJOR,
+                {1, sample_page_size},
+                {1, sample_page_size});
+            auto worker_buffer = CreateBuffer(tt::tt_metal::ShardedBufferConfig{
+                .device = device_1,
+                .size = sample_page_size,
+                .page_size = sample_page_size,
+                .buffer_layout = TensorMemoryLayout::HEIGHT_SHARDED,
+                .shard_parameters = shard_spec});
+
+            auto programs = build(
+                device_0,
+                device_1,
+                eth_sender_core,
+                eth_receiver_core,
+                worker_core,
+                num_samples,
+                sample_page_size,
+                num_buffer_slots,
+                local_kernel,
+                remote_kernel,
+                worker_buffer);
+            run(device_0, device_1, programs[0], programs[1], worker_buffer);
+        } catch (std::exception& e) {
+            log_error(tt::LogTest, "Caught exception: {}", e.what());
+            test_fixture.TearDown();
+            return -1;
+        }
+    } catch (std::exception& e) {
+        test_fixture.TearDown();
+        return -1;
+    }
+
+    return success ? 0 : -1;
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp
new file mode 100644
index 00000000000..23826835c81
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_common.hpp
@@ -0,0 +1,42 @@
+// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include <cstdint>
+#include <array>
+#include "eth_l1_address_map.h"
+#include "dataflow_api.h"
+#include "ethernet/dataflow_api.h"
+#include "debug/assert.h"
+#include "debug/dprint.h"
+
+// #define ENABLE_DEBUG 1
+
+struct eth_buffer_slot_sync_t {
+    volatile uint32_t bytes_sent;
+    volatile uint32_t receiver_ack;
+    volatile uint32_t src_id;
+
+    uint32_t reserved_2;
+};
+
+FORCE_INLINE void eth_setup_handshake(std::uint32_t handshake_register_address, bool is_sender) {
+    if (is_sender) {
+        eth_send_bytes(handshake_register_address, handshake_register_address, 16);
+        eth_wait_for_receiver_done();
+    } else {
+        eth_wait_for_bytes(16);
+        eth_receiver_channel_done(0);
+    }
+}
+
+FORCE_INLINE void switch_context_if_debug() {
+#if ENABLE_DEBUG
+    internal_::risc_context_switch();
+#endif
+}
+
+template <typename T>
+bool is_power_of_two(T val) {
+    return (val & (val - 1)) == T(0);
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_receiver.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_receiver.cpp
new file mode 100644
index 00000000000..5f241b1b48d
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_receiver.cpp
@@ -0,0 +1,154 @@
+// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "ethernet_write_worker_latency_ubench_common.hpp"
+
+static constexpr uint32_t NUM_BUFFER_SLOTS = get_compile_time_arg_val(0);
+static constexpr uint32_t MAX_NUM_TRANSACTION_ID =
+    NUM_BUFFER_SLOTS / 2;  // the algorithm only works for NUM_BUFFER_SLOTS divisible by MAX_NUM_TRANSACTION_ID
+static constexpr uint32_t worker_noc_x = get_compile_time_arg_val(1);
+static constexpr uint32_t worker_noc_y = get_compile_time_arg_val(2);
+static constexpr uint32_t worker_buffer_addr = get_compile_time_arg_val(3);
+
+FORCE_INLINE uint32_t advance_buffer_slot_ptr(uint32_t curr_ptr) { return (curr_ptr + 1) % NUM_BUFFER_SLOTS; }
+
+FORCE_INLINE uint32_t get_buffer_slot_trid(uint32_t curr_ptr) { return curr_ptr % MAX_NUM_TRANSACTION_ID + 1; }
+
+FORCE_INLINE bool has_incoming_packet(volatile eth_buffer_slot_sync_t* buffer_slot_sync_addr) {
+    return buffer_slot_sync_addr->bytes_sent != 0;
+}
+
+FORCE_INLINE bool write_worker_done(uint32_t trid) {
+    return ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc_index, trid);
+}
+
+FORCE_INLINE void ack_complete(volatile eth_buffer_slot_sync_t* buffer_slot_sync_addr) {
+    buffer_slot_sync_addr->bytes_sent = 0;
+
+    eth_send_bytes_over_channel_payload_only_unsafe(
+        reinterpret_cast<uint32_t>(buffer_slot_sync_addr),
+        reinterpret_cast<uint32_t>(buffer_slot_sync_addr),
+        sizeof(eth_buffer_slot_sync_t),
+        sizeof(eth_buffer_slot_sync_t),
+        sizeof(eth_buffer_slot_sync_t) >> 4);
+}
+
+FORCE_INLINE void write_worker(
+    uint32_t buffer_slot_addr,
+    volatile eth_buffer_slot_sync_t* buffer_slot_sync_addr,
+    uint64_t worker_noc_addr,
+    uint32_t message_size,
+    uint32_t curr_trid_to_write) {
+    // write to local
+    noc_async_write_one_packet_with_trid(buffer_slot_addr, worker_noc_addr, message_size, curr_trid_to_write);
+
+    // reset sync
+    buffer_slot_sync_addr->bytes_sent = 0;
+}
+
+FORCE_INLINE void check_incomping_packet_and_write_worker(
+    const std::array<uint32_t, NUM_BUFFER_SLOTS>& buffer_slot_addrs,
+    const std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS>& buffer_slot_sync_addrs,
+    uint32_t read_ptr,
+    uint32_t& write_ptr,
+    uint64_t worker_noc_addr,
+    uint32_t message_size) {
+    uint32_t next_write_ptr = advance_buffer_slot_ptr(write_ptr);
+    bool buffer_not_full = next_write_ptr != read_ptr;
+
+    if (buffer_not_full && has_incoming_packet(buffer_slot_sync_addrs[write_ptr])) {
+        uint32_t curr_trid = get_buffer_slot_trid(write_ptr);
+        write_worker(
+            buffer_slot_addrs[write_ptr], buffer_slot_sync_addrs[write_ptr], worker_noc_addr, message_size, curr_trid);
+
+        write_ptr = next_write_ptr;
+    }
+}
+
+FORCE_INLINE void check_write_worker_done_and_send_ack(
+    const std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS>& buffer_slot_sync_addrs,
+    uint32_t& read_ptr,
+    uint32_t write_ptr,
+    uint32_t& num_messages_ack) {
+    bool buffer_not_empty = read_ptr != write_ptr;
+    uint32_t curr_trid = get_buffer_slot_trid(read_ptr);
+
+    if (buffer_not_empty && write_worker_done(curr_trid) && !eth_txq_is_busy()) {
+        ack_complete(buffer_slot_sync_addrs[read_ptr]);
+
+        read_ptr = advance_buffer_slot_ptr(read_ptr);
+
+        num_messages_ack++;
+    }
+}
+
+FORCE_INLINE void receiver_main_loop(
+    const std::array<uint32_t, NUM_BUFFER_SLOTS>& buffer_slot_addrs,
+    const std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS>& buffer_slot_sync_addrs,
+    uint64_t worker_noc_addr,
+    uint32_t message_size,
+    uint32_t num_messages) {
+    uint32_t total_msgs = num_messages * NUM_BUFFER_SLOTS;
+
+    DPRINT << "RECEIVER MAIN LOOP" << ENDL();
+
+    uint32_t buffer_read_ptr = 0;
+    uint32_t buffer_write_ptr = 0;
+
+    uint32_t num_messages_ack = 0;
+    while (num_messages_ack < total_msgs) {
+        // Check if there's an incoming packet for current buffer slot and write to worker if there's new packet
+        check_incomping_packet_and_write_worker(
+            buffer_slot_addrs,
+            buffer_slot_sync_addrs,
+            buffer_read_ptr,
+            buffer_write_ptr,
+            worker_noc_addr,
+            message_size);
+        // Check if the write for trid is done, and ack sender if the current buffer slot is done
+        check_write_worker_done_and_send_ack(
+            buffer_slot_sync_addrs, buffer_read_ptr, buffer_write_ptr, num_messages_ack);
+
+        // not called in normal execution mode
+        switch_context_if_debug();
+    }
+}
+
+void kernel_main() {
+    uint32_t arg_idx = 0;
+    const uint32_t handshake_addr = get_arg_val<uint32_t>(arg_idx++);
+    const uint32_t num_messages = get_arg_val<uint32_t>(arg_idx++);
+    const uint32_t message_size = get_arg_val<uint32_t>(arg_idx++);
+
+    ASSERT(is_power_of_two(NUM_BUFFER_SLOTS));
+
+    std::array<uint32_t, NUM_BUFFER_SLOTS> buffer_slot_addrs;
+    std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS> buffer_slot_sync_addrs;
+    {
+        uint32_t buffer_slot_addr = handshake_addr + sizeof(eth_buffer_slot_sync_t);
+        for (uint8_t i = 0; i < NUM_BUFFER_SLOTS; i++) {
+            buffer_slot_addrs[i] = buffer_slot_addr;
+            buffer_slot_addr += message_size;
+            buffer_slot_sync_addrs[i] = reinterpret_cast<volatile eth_buffer_slot_sync_t*>(buffer_slot_addr);
+            buffer_slot_sync_addrs[i]->bytes_sent = 0;
+            buffer_slot_sync_addrs[i]->receiver_ack = 0;
+            buffer_slot_addr += sizeof(eth_buffer_slot_sync_t);
+        }
+    }
+
+    // Avoids hang in issue https://github.com/tenstorrent/tt-metal/issues/9963
+    for (uint32_t i = 0; i < 2000000000; i++) {
+        asm volatile("nop");
+    }
+
+    // worker noc address
+    uint64_t worker_noc_addr = get_noc_addr(worker_noc_x, worker_noc_y, worker_buffer_addr);
+
+    eth_setup_handshake(handshake_addr, false);
+
+    {
+        DeviceZoneScopedN("MAIN-TEST-BODY");
+        receiver_main_loop(buffer_slot_addrs, buffer_slot_sync_addrs, worker_noc_addr, message_size, num_messages);
+    }
+}
diff --git a/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_sender.cpp b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_sender.cpp
new file mode 100644
index 00000000000..cdf37185e7a
--- /dev/null
+++ b/tests/tt_metal/tt_metal/test_kernels/dataflow/unit_tests/erisc/ethernet_write_worker_latency_ubench_sender.cpp
@@ -0,0 +1,137 @@
+// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "ethernet_write_worker_latency_ubench_common.hpp"
+
+static constexpr uint32_t NUM_BUFFER_SLOTS = get_compile_time_arg_val(0);
+
+FORCE_INLINE uint32_t advance_buffer_slot_ptr(uint32_t curr_ptr) { return (curr_ptr + 1) % NUM_BUFFER_SLOTS; }
+
+FORCE_INLINE void write_receiver(
+    uint32_t buffer_slot_addr,
+    volatile eth_buffer_slot_sync_t* buffer_slot_sync_addr,
+    uint32_t full_payload_size,
+    uint32_t full_payload_size_eth_words) {
+    buffer_slot_sync_addr->bytes_sent = 1;
+
+    eth_send_bytes_over_channel_payload_only_unsafe(
+        buffer_slot_addr, buffer_slot_addr, full_payload_size, full_payload_size, full_payload_size_eth_words);
+}
+
+FORCE_INLINE bool has_receiver_ack(volatile eth_buffer_slot_sync_t* buffer_slot_sync_addr) {
+    return buffer_slot_sync_addr->bytes_sent == 0;
+}
+
+FORCE_INLINE void check_buffer_full_and_send_packet(
+    const std::array<uint32_t, NUM_BUFFER_SLOTS>& buffer_slot_addrs,
+    const std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS>& buffer_slot_sync_addrs,
+    uint32_t read_ptr,
+    uint32_t& write_ptr,
+    uint64_t full_payload_size,
+    uint32_t full_payload_size_eth_words) {
+    uint32_t next_write_ptr = advance_buffer_slot_ptr(write_ptr);
+    bool buffer_not_full = next_write_ptr != read_ptr;
+
+    if (buffer_not_full && !eth_txq_is_busy()) {
+        write_receiver(
+            buffer_slot_addrs[write_ptr],
+            buffer_slot_sync_addrs[write_ptr],
+            full_payload_size,
+            full_payload_size_eth_words);
+
+        write_ptr = next_write_ptr;
+    }
+}
+
+FORCE_INLINE void check_receiver_done(
+    const std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS>& buffer_slot_sync_addrs,
+    uint32_t& read_ptr,
+    uint32_t& num_messages_ack) {
+    if (has_receiver_ack(buffer_slot_sync_addrs[read_ptr])) {
+        read_ptr = advance_buffer_slot_ptr(read_ptr);
+        num_messages_ack++;
+    }
+}
+
+FORCE_INLINE void sender_main_loop(
+    const std::array<uint32_t, NUM_BUFFER_SLOTS>& buffer_slot_addrs,
+    const std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS>& buffer_slot_sync_addrs,
+    uint32_t full_payload_size,
+    uint32_t num_messages) {
+    uint32_t full_payload_size_eth_words = full_payload_size >> 4;
+    uint32_t total_msgs = num_messages * NUM_BUFFER_SLOTS;
+
+    DPRINT << "SENDER MAIN LOOP" << ENDL();
+
+    uint32_t buffer_read_ptr = 0;
+    uint32_t buffer_write_ptr = 0;
+
+    uint32_t num_messages_ack = 0;
+    while (num_messages_ack < total_msgs) {
+        // Check if current buffer slot is ready and send packet to receiver
+        check_buffer_full_and_send_packet(
+            buffer_slot_addrs,
+            buffer_slot_sync_addrs,
+            buffer_read_ptr,
+            buffer_write_ptr,
+            full_payload_size,
+            full_payload_size_eth_words);
+        // Check if the write for trid is done, and ack sender if the current buffer slot is done
+        check_receiver_done(buffer_slot_sync_addrs, buffer_read_ptr, num_messages_ack);
+
+        // not called in normal execution mode
+        switch_context_if_debug();
+    }
+}
+
+void kernel_main() {
+    uint32_t arg_idx = 0;
+    const uint32_t handshake_addr = get_arg_val<uint32_t>(arg_idx++);
+    const uint32_t num_messages = get_arg_val<uint32_t>(arg_idx++);
+    const uint32_t message_size = get_arg_val<uint32_t>(arg_idx++);
+    bool is_sender_offset_0 = get_arg_val<uint32_t>(arg_idx++) == 1;
+
+    ASSERT(is_power_of_two(NUM_BUFFER_SLOTS));
+
+    const uint32_t message_size_eth_words = message_size >> 4;
+
+    const uint32_t full_payload_size = message_size + sizeof(eth_buffer_slot_sync_t);
+    const uint32_t full_payload_size_eth_words = full_payload_size >> 4;
+
+    std::array<uint32_t, NUM_BUFFER_SLOTS> buffer_slot_addrs;
+    std::array<volatile eth_buffer_slot_sync_t*, NUM_BUFFER_SLOTS> buffer_slot_sync_addrs;
+    {
+        uint32_t channel_addr = handshake_addr + sizeof(eth_buffer_slot_sync_t);
+        for (uint8_t i = 0; i < NUM_BUFFER_SLOTS; i++) {
+            buffer_slot_addrs[i] = channel_addr;
+            channel_addr += message_size;
+            buffer_slot_sync_addrs[i] = reinterpret_cast<volatile eth_buffer_slot_sync_t*>(channel_addr);
+            channel_addr += sizeof(eth_buffer_slot_sync_t);
+        }
+    }
+
+    // reset bytes_sent to 0s so first iter it won't block
+    for (uint32_t i = 0; i < NUM_BUFFER_SLOTS; i++) {
+        buffer_slot_sync_addrs[i]->bytes_sent = 0;
+    }
+
+    // assemble a packet filled with values
+    for (uint32_t i = 0; i < NUM_BUFFER_SLOTS; i++) {
+        tt_l1_ptr uint8_t* ptr = reinterpret_cast<tt_l1_ptr uint8_t*>(buffer_slot_addrs[i]);
+        for (uint32_t j = 0; j < message_size; j++) {
+            ptr[j] = j;
+        }
+    }
+
+    // Avoids hang in issue https://github.com/tenstorrent/tt-metal/issues/9963
+    for (uint32_t i = 0; i < 2000000000; i++) {
+        asm volatile("nop");
+    }
+    eth_setup_handshake(handshake_addr, true);
+
+    {
+        DeviceZoneScopedN("MAIN-TEST-BODY");
+        sender_main_loop(buffer_slot_addrs, buffer_slot_sync_addrs, full_payload_size, num_messages);
+    }
+}
diff --git a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h
index 6f2cea1128d..fb9cd78cb14 100644
--- a/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h
+++ b/tt_metal/hw/inc/blackhole/noc_nonblocking_api.h
@@ -139,7 +139,7 @@ inline __attribute__((always_inline)) bool ncrisc_noc_read_with_transaction_id_f
     return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0);
 }
 
-template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC>
+template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC, bool use_trid = false>
 inline __attribute__((always_inline)) void ncrisc_noc_fast_write(
     uint32_t noc,
     uint32_t cmd_buf,
@@ -151,12 +151,17 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write(
     bool linked,
     uint32_t num_dests,
     bool multicast_path_reserve,
-    bool posted = false) {
+    bool posted = false,
+    uint32_t trid = 0) {
     uint32_t noc_cmd_field =
         NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | (linked ? NOC_CMD_VC_LINKED : 0x0) |
         (mcast ? ((multicast_path_reserve ? NOC_CMD_PATH_RESERVE : 0) | NOC_CMD_BRCST_PACKET) : 0x0) |
         (posted ? 0 : NOC_CMD_RESP_MARKED);
 
+    if constexpr (use_trid) {
+        NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(trid));
+    }
+
     NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_cmd_field);
     NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_addr);
     NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr);
@@ -287,6 +292,11 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_writes_flushed(u
     return (NOC_STATUS_READ_REG(noc, NIU_MST_WR_ACK_RECEIVED) == noc_nonposted_writes_acked[noc]);
 }
 
+inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_flushed(
+    uint32_t noc, uint32_t transcation_id) {
+    return (NOC_STATUS_READ_REG(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(transcation_id)) == 0);
+}
+
 inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed(uint32_t noc) {
     return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]);
 }
@@ -439,7 +449,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_read_any_len(
     ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes);
 }
 
-template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC>
+template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC, bool use_trid = false, bool one_packet = false>
 inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len(
     uint32_t noc,
     uint32_t cmd_buf,
@@ -451,28 +461,43 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len(
     bool linked,
     uint32_t num_dests,
     bool multicast_path_reserve,
-    bool posted = false) {
-    while (len_bytes > NOC_MAX_BURST_SIZE) {
-        while (!noc_cmd_buf_ready(noc, cmd_buf));
-        ncrisc_noc_fast_write<proc_type, noc_mode>(
-            noc,
-            cmd_buf,
-            src_addr,
-            dest_addr,
-            NOC_MAX_BURST_SIZE,
-            vc,
-            mcast,
-            linked,
-            num_dests,
-            multicast_path_reserve,
-            posted);
-        src_addr += NOC_MAX_BURST_SIZE;
-        dest_addr += NOC_MAX_BURST_SIZE;
-        len_bytes -= NOC_MAX_BURST_SIZE;
+    bool posted = false,
+    uint32_t trid = 0) {
+    if constexpr (!one_packet) {
+        while (len_bytes > NOC_MAX_BURST_SIZE) {
+            while (!noc_cmd_buf_ready(noc, cmd_buf));
+            ncrisc_noc_fast_write<proc_type, noc_mode, use_trid>(
+                noc,
+                cmd_buf,
+                src_addr,
+                dest_addr,
+                NOC_MAX_BURST_SIZE,
+                vc,
+                mcast,
+                linked,
+                num_dests,
+                multicast_path_reserve,
+                posted,
+                trid);
+            src_addr += NOC_MAX_BURST_SIZE;
+            dest_addr += NOC_MAX_BURST_SIZE;
+            len_bytes -= NOC_MAX_BURST_SIZE;
+        }
     }
     while (!noc_cmd_buf_ready(noc, cmd_buf));
-    ncrisc_noc_fast_write<proc_type, noc_mode>(
-        noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve, posted);
+    ncrisc_noc_fast_write<proc_type, noc_mode, use_trid>(
+        noc,
+        cmd_buf,
+        src_addr,
+        dest_addr,
+        len_bytes,
+        vc,
+        mcast,
+        linked,
+        num_dests,
+        multicast_path_reserve,
+        posted,
+        trid);
 }
 
 template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC>
diff --git a/tt_metal/hw/inc/dataflow_api.h b/tt_metal/hw/inc/dataflow_api.h
index 89c6c85d3e3..8d1d95dec80 100644
--- a/tt_metal/hw/inc/dataflow_api.h
+++ b/tt_metal/hw/inc/dataflow_api.h
@@ -2022,6 +2022,31 @@ void noc_async_read_barrier_with_trid(uint32_t trid, uint8_t noc = noc_index) {
     WAYPOINT("NBTD");
 }
 
+inline void noc_async_write_one_packet_with_trid(
+    std::uint32_t src_local_l1_addr,
+    std::uint64_t dst_noc_addr,
+    std::uint32_t size,
+    std::uint32_t trid,
+    uint8_t noc = noc_index) {
+    WAYPOINT("NAWW");
+    DEBUG_SANITIZE_NOC_WRITE_TRANSACTION(noc, dst_noc_addr, src_local_l1_addr, size);
+#ifndef ARCH_GRAYSKULL
+    ncrisc_noc_fast_write_any_len<proc_type, noc_mode, true, true>(
+        noc, write_cmd_buf, src_local_l1_addr, dst_noc_addr, size, NOC_UNICAST_WRITE_VC, false, false, 1, true, trid);
+#endif
+    WAYPOINT("NAWD");
+}
+
+FORCE_INLINE
+void noc_async_write_barrier_with_trid(uint32_t trid, uint8_t noc = noc_index) {
+    WAYPOINT("NWTW");
+#ifndef ARCH_GRAYSKULL
+    while (!ncrisc_noc_nonposted_write_with_transaction_id_flushed(noc, trid));
+#endif
+    invalidate_l1_cache();
+    WAYPOINT("NWTD");
+}
+
 template <bool DRAM>
 FORCE_INLINE uint64_t
 get_noc_addr_from_bank_id(uint32_t bank_id, uint32_t bank_address_offset, uint8_t noc = noc_index) {
diff --git a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h
index 58eb1be5f66..9b763f44fcf 100644
--- a/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h
+++ b/tt_metal/hw/inc/wormhole/noc_nonblocking_api.h
@@ -138,7 +138,7 @@ inline __attribute__((always_inline)) bool ncrisc_noc_read_with_transaction_id_f
     return (NOC_STATUS_READ_REG(noc, NIU_MST_REQS_OUTSTANDING_ID(transcation_id)) == 0);
 }
 
-template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC>
+template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC, bool use_trid = false>
 inline __attribute__((always_inline)) void ncrisc_noc_fast_write(
     uint32_t noc,
     uint32_t cmd_buf,
@@ -150,12 +150,17 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write(
     bool linked,
     uint32_t num_dests,
     bool multicast_path_reserve,
-    bool posted = false) {
+    bool posted = false,
+    uint32_t trid = 0) {
     uint32_t noc_cmd_field =
         NOC_CMD_CPY | NOC_CMD_WR | NOC_CMD_VC_STATIC | NOC_CMD_STATIC_VC(vc) | (linked ? NOC_CMD_VC_LINKED : 0x0) |
         (mcast ? ((multicast_path_reserve ? NOC_CMD_PATH_RESERVE : 0) | NOC_CMD_BRCST_PACKET) : 0x0) |
         (posted ? 0 : NOC_CMD_RESP_MARKED);
 
+    if constexpr (use_trid) {
+        NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_PACKET_TAG, NOC_PACKET_TAG_TRANSACTION_ID(trid));
+    }
+
     NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_CTRL, noc_cmd_field);
     NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_TARG_ADDR_LO, src_addr);
     NOC_CMD_BUF_WRITE_REG(noc, cmd_buf, NOC_RET_ADDR_LO, (uint32_t)dest_addr);
@@ -243,6 +248,11 @@ inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_writes_flushed(u
     return (NOC_STATUS_READ_REG(noc, NIU_MST_WR_ACK_RECEIVED) == noc_nonposted_writes_acked[noc]);
 }
 
+inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_write_with_transaction_id_flushed(
+    uint32_t noc, uint32_t transcation_id) {
+    return (NOC_STATUS_READ_REG(noc, NIU_MST_WRITE_REQS_OUTGOING_ID(transcation_id)) == 0);
+}
+
 inline __attribute__((always_inline)) bool ncrisc_noc_nonposted_atomics_flushed(uint32_t noc) {
     return (NOC_STATUS_READ_REG(noc, NIU_MST_ATOMIC_RESP_RECEIVED) == noc_nonposted_atomics_acked[noc]);
 }
@@ -379,7 +389,7 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_read_any_len(
     ncrisc_noc_fast_read(noc, cmd_buf, src_addr, dest_addr, len_bytes);
 }
 
-template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC>
+template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC, bool use_trid = false, bool one_packet = false>
 inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len(
     uint32_t noc,
     uint32_t cmd_buf,
@@ -391,28 +401,43 @@ inline __attribute__((always_inline)) void ncrisc_noc_fast_write_any_len(
     bool linked,
     uint32_t num_dests,
     bool multicast_path_reserve,
-    bool posted = false) {
-    while (len_bytes > NOC_MAX_BURST_SIZE) {
-        while (!noc_cmd_buf_ready(noc, cmd_buf));
-        ncrisc_noc_fast_write<proc_type, noc_mode>(
-            noc,
-            cmd_buf,
-            src_addr,
-            dest_addr,
-            NOC_MAX_BURST_SIZE,
-            vc,
-            mcast,
-            linked,
-            num_dests,
-            multicast_path_reserve,
-            posted);
-        src_addr += NOC_MAX_BURST_SIZE;
-        dest_addr += NOC_MAX_BURST_SIZE;
-        len_bytes -= NOC_MAX_BURST_SIZE;
+    bool posted = false,
+    uint32_t trid = 0) {
+    if constexpr (!one_packet) {
+        while (len_bytes > NOC_MAX_BURST_SIZE) {
+            while (!noc_cmd_buf_ready(noc, cmd_buf));
+            ncrisc_noc_fast_write<proc_type, noc_mode, use_trid>(
+                noc,
+                cmd_buf,
+                src_addr,
+                dest_addr,
+                NOC_MAX_BURST_SIZE,
+                vc,
+                mcast,
+                linked,
+                num_dests,
+                multicast_path_reserve,
+                posted,
+                trid);
+            src_addr += NOC_MAX_BURST_SIZE;
+            dest_addr += NOC_MAX_BURST_SIZE;
+            len_bytes -= NOC_MAX_BURST_SIZE;
+        }
     }
     while (!noc_cmd_buf_ready(noc, cmd_buf));
-    ncrisc_noc_fast_write<proc_type, noc_mode>(
-        noc, cmd_buf, src_addr, dest_addr, len_bytes, vc, mcast, linked, num_dests, multicast_path_reserve, posted);
+    ncrisc_noc_fast_write<proc_type, noc_mode, use_trid>(
+        noc,
+        cmd_buf,
+        src_addr,
+        dest_addr,
+        len_bytes,
+        vc,
+        mcast,
+        linked,
+        num_dests,
+        multicast_path_reserve,
+        posted,
+        trid);
 }
 
 template <uint32_t proc_type, uint8_t noc_mode = DM_DEDICATED_NOC>