#16469 Add sharding to vecadd example #17011

Merged · 1 commit · Jan 23, 2025
2 changes: 1 addition & 1 deletion tt_metal/programming_examples/CMakeLists.txt
@@ -13,7 +13,7 @@ set(PROGRAMMING_EXAMPLES_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/matmul_single_core/matmul_single_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/pad/pad_multi_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sharding/shard_data_rm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vecadd_multi_core/vecadd_multi_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vecadd_sharding/vecadd_sharding.cpp
)

include(${PROJECT_SOURCE_DIR}/cmake/helper_functions.cmake)
74 changes: 74 additions & 0 deletions tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp
@@ -0,0 +1,74 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#include "compute_kernel_api.h"
#include "compute_kernel_api/common.h"
#include "compute_kernel_api/eltwise_binary.h"
#include "compute_kernel_api/tile_move_copy.h"
#include <cstdint>

namespace NAMESPACE {
void MAIN {
// We are going to read from these two circular buffers
constexpr auto cb_in0 = get_compile_time_arg_val(0);
constexpr auto cb_in1 = get_compile_time_arg_val(1);
// and write to the output circular buffer
constexpr auto cb_out0 = get_compile_time_arg_val(2);

uint32_t num_tile = get_arg_val<uint32_t>(0);

// The destination register.
// Quoting the docs: "This register is an array of 16 tiles of 32x32 elements
// each." If you are familiar with the concept of a rotating register file
// from computer architecture, think of it like that. Later on we will ensure
// that registers are free, then submit compute to the FPU/SFPU that writes
// to the register.
constexpr uint32_t dst_reg = 0;

// Tell the SFPU that we will be using circular buffers c_in0, c_in1 and
// c_out0 to perform the computation.
binary_op_init_common(cb_in0, cb_in1, cb_out0);
// And we are going to add tiles. This function would only need to be called
// again if we switched to a different operation. Since we are only adding
// tiles, it is called once, before the loop.
add_tiles_init();

// Loop over the assigned tiles and perform the computation
for (uint32_t i = 0; i < num_tile; i++) {
// IMPORTANT: since there is no reader kernel and the data is already in the
// circular buffers, do not call cb_wait_front() here. Nothing ever pushes to
// these buffers, so the call would block and the kernel would hang forever.

// Make sure there is a valid MATH thread register we can use.
tile_regs_acquire();

// Add the tiles from the input circular buffers and write the result to
// the destination register
add_tiles(cb_in0, cb_in1, 0, 0, dst_reg);

// release lock on DST register by MATH thread
tile_regs_commit();

cb_pop_front(cb_in0, 1);
cb_pop_front(cb_in1, 1);

// acquire an exclusive lock on the DST register for the PACK thread.
// make sure MATH thread has committed the DST register earlier
tile_regs_wait();

// Copy the result from adding the tiles to the output circular buffer
pack_tile(dst_reg, cb_out0);

// release lock on DST register by PACK thread
tile_regs_release();

// There is no need to call cb_reserve_back(cb_out0, 1) because the output
// circular buffer points at an already-allocated L1 buffer, although
// calling it would do no harm.

// Mark the output tile as ready.
cb_push_back(cb_out0, 1);
}
}
} // namespace NAMESPACE
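For contrast, this is roughly what the same compute loop looks like in the conventional (non-sharded) examples, where a reader kernel streams tiles from DRAM into the circular buffers and a writer kernel drains the output. This is a hypothetical sketch for comparison, not part of this PR; the synchronization calls are exactly the ones the sharded kernel above deliberately omits:

    // Hypothetical non-sharded variant: a reader kernel feeds cb_in0/cb_in1,
    // so the compute kernel must synchronize with it.
    for (uint32_t i = 0; i < num_tile; i++) {
        cb_wait_front(cb_in0, 1);    // block until the reader pushes a tile
        cb_wait_front(cb_in1, 1);
        cb_reserve_back(cb_out0, 1); // block until the writer frees a slot

        tile_regs_acquire();
        add_tiles(cb_in0, cb_in1, 0, 0, dst_reg);
        tile_regs_commit();

        cb_pop_front(cb_in0, 1);
        cb_pop_front(cb_in1, 1);

        tile_regs_wait();
        pack_tile(dst_reg, cb_out0);
        tile_regs_release();

        cb_push_back(cb_out0, 1);    // signal the writer kernel
    }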
211 changes: 211 additions & 0 deletions tt_metal/programming_examples/vecadd_sharding/vecadd_sharding.cpp
@@ -0,0 +1,211 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

// This is an advanced example compared to the single-core vector-add example
// in the contributed folder. It illustrates sharding tensor inputs directly
// into the L1 memory of multiple cores, then performing the vector addition
// tile by tile. Because the data is sharded into L1, DRAM is not involved:
// no data copies are needed and no reader or writer kernels are required.

#include <tt-metalium/bfloat16.hpp>
#include <tt-metalium/core_coord.hpp>
#include <tt-metalium/host_api.hpp>
#include <tt-metalium/device_impl.hpp>
#include <tt-metalium/tt_metal.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <set>
#include <string>
#include <string_view>
#include <unordered_map>
#include <variant>
#include <vector>

using namespace tt;
using namespace tt::tt_metal;

using CoreSpec = std::variant<CoreCoord, CoreRange, CoreRangeSet>;

// The sharding configuration is defined by the following struct.
struct L1Config {
L1Config(TensorMemoryLayout layout, uint32_t cores_height, uint32_t cores_width) :
buffer_layout(layout), num_cores_height(cores_height), num_cores_width(cores_width) {}

TensorMemoryLayout buffer_layout;
uint32_t num_cores_height;
uint32_t num_cores_width;

// The following sharding parameters are hardcoded for this example.
tt::DataFormat l1_data_format = tt::DataFormat::Float16_b;
uint32_t element_size = 2;
uint32_t num_tiles_per_core_height = 2;
uint32_t num_tiles_per_core_width = 2;

// The following parameters are derived from the configuration above.
uint32_t num_cores = num_cores_height * num_cores_width;
uint32_t num_tiles_per_core = num_tiles_per_core_height * num_tiles_per_core_width;
uint32_t size_bytes = num_cores_height * num_tiles_per_core_height * tt::constants::TILE_HEIGHT * num_cores_width *
num_tiles_per_core_width * tt::constants::TILE_WIDTH * element_size;
uint32_t page_size_bytes = tt::constants::TILE_HW * element_size;
CoreRange cores = CoreRange(CoreCoord(0, 0), CoreCoord(0, num_cores - 1));
ShardSpecBuffer shard_spec() const {
return ShardSpecBuffer(
CoreRangeSet(std::set<CoreRange>({cores})),
{(uint32_t)num_tiles_per_core_height * tt::constants::TILE_HEIGHT,
(uint32_t)num_tiles_per_core_width * tt::constants::TILE_WIDTH},
ShardOrientation::ROW_MAJOR,
{tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
// Tensor shape in tiles: (total tile rows, total tile columns).
{num_cores_height * num_tiles_per_core_height,
num_cores_width * num_tiles_per_core_width});
}
};
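// To make the numbers concrete: with the "height" configuration used in
// main() below (4x1 cores, 2x2 tiles of 32x32 bfloat16 elements per core),
// the full tensor is (4 * 2 * 32) x (1 * 2 * 32) = 256 x 64 elements, so
// size_bytes = 256 * 64 * 2 = 32768 and page_size_bytes = 1024 * 2 = 2048.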

std::shared_ptr<Buffer> MakeShardedL1BufferBFP16(IDevice* device, const L1Config& test_config) {
return CreateBuffer(tt::tt_metal::ShardedBufferConfig{
.device = device,
.size = test_config.size_bytes,
.page_size = test_config.page_size_bytes,
.buffer_layout = test_config.buffer_layout,
.shard_parameters = test_config.shard_spec()});
}

CBHandle MakeCircularBufferBFP16(
Program& program, const CoreSpec& core, tt::CBIndex cb, uint32_t n_tiles, const std::shared_ptr<Buffer>& l1_buf) {
constexpr uint32_t tile_size = sizeof(bfloat16) * tt::constants::TILE_HW;
CircularBufferConfig cb_src0_config = CircularBufferConfig(n_tiles * tile_size, {{cb, tt::DataFormat::Float16_b}})
.set_page_size(cb, tile_size)
// IMPORTANT: assign the L1 buffer's address to the circular buffer directly
// so that no extra allocation or data copy takes place
.set_globally_allocated_address(*l1_buf);
return CreateCircularBuffer(program, core, cb_src0_config);
}
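// For contrast (hypothetical, not part of this example): a conventional
// circular buffer omits set_globally_allocated_address(), so the allocator
// reserves fresh L1 space for it and a reader kernel must then copy data in:
//
//   CircularBufferConfig cb_config =
//       CircularBufferConfig(n_tiles * tile_size, {{cb, tt::DataFormat::Float16_b}})
//           .set_page_size(cb, tile_size);
//   // No set_globally_allocated_address(): the CB owns its own allocation.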

std::string next_arg(int& i, int argc, char** argv) {
if (i + 1 >= argc) {
std::cerr << "Expected argument after " << argv[i] << std::endl;
exit(1);
}
return argv[++i];
}

void help(std::string_view program_name) {
std::cout << "Usage: " << program_name << " [options]\n";
std::cout << "This program demonstrates how to add two vectors using "
"tt-Metalium.\n";
std::cout << "\n";
std::cout << "Options:\n";
std::cout << " --device, -d <device_id> Specify the device to run the "
"program on. Default is 0.\n";
std::cout << " --sharding_type, -s <sharding> Specify the sharding type "
"options are height, width, or block. Default is height.\n";
exit(0);
}

int main(int argc, char** argv) {
// Use a fixed seed for reproducibility and deterministic results.
int seed = 0x1234567;
int device_id = 0;
std::string sharding_type = "height";

// Sharding configurations: a 4x4 grid of bfloat16 tiles, with each core holding 2x2 tiles, sharded across 4 cores.
const std::unordered_map<std::string_view, L1Config> test_configs{
{"height", {TensorMemoryLayout::HEIGHT_SHARDED, 4, 1}},
{"width", {TensorMemoryLayout::WIDTH_SHARDED, 1, 4}},
{"block", {TensorMemoryLayout::BLOCK_SHARDED, 2, 2}},
};

// Quick and dirty argument parsing.
for (int i = 1; i < argc; i++) {
std::string_view arg = argv[i];
if (arg == "--device" || arg == "-d") {
device_id = std::stoi(next_arg(i, argc, argv));
} else if (arg == "--help" || arg == "-h") {
help(argv[0]);
return 0;
} else if (arg == "--sharding_type" || arg == "-s") {
sharding_type = next_arg(i, argc, argv);
if (not test_configs.contains(sharding_type)) {
std::cout << "Invalid sharding type: " << sharding_type << std::endl;
help(argv[0]);
return 1;
}
} else {
std::cout << "Unknown argument: " << arg << std::endl;
help(argv[0]);
}
}

IDevice* device = CreateDevice(device_id);
Program program = CreateProgram();

std::cout << "Sharding type: " << sharding_type << std::endl;
const auto& test_config = test_configs.at(sharding_type);

// Create the input and output buffers.
auto a = MakeShardedL1BufferBFP16(device, test_config);
auto b = MakeShardedL1BufferBFP16(device, test_config);
auto c = MakeShardedL1BufferBFP16(device, test_config);

std::mt19937 rng(seed);
auto a_data = create_random_vector_of_bfloat16_native(test_config.size_bytes, 10, rng());
auto b_data = create_random_vector_of_bfloat16_native(test_config.size_bytes, 10, rng());

auto cb_a =
MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_0, test_config.num_tiles_per_core, a);
auto cb_b =
MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_1, test_config.num_tiles_per_core, b);
auto cb_c =
MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_2, test_config.num_tiles_per_core, c);

auto compute = CreateKernel(
program,
"tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp",
test_config.cores,
ComputeConfig{
.math_approx_mode = false,
// pass in compile time arguments
.compile_args = {tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_2},
.defines = {}});

// copy data from host to L1 directly
detail::WriteToBuffer(a, a_data);
detail::WriteToBuffer(b, b_data);

for (int i = 0; i < test_config.num_cores; ++i) {
// Set runtime arguments for each core.
CoreCoord core = {0, i};
SetRuntimeArgs(program, compute, core, {test_config.num_tiles_per_core});
}

CommandQueue& cq = device->command_queue();
// Enqueue the program and block until it finishes (blocking = true).
EnqueueProgram(cq, program, true);

std::cout << "Kernel execution finished" << std::endl;

// Read the output buffer.
std::vector<bfloat16> c_data;
detail::ReadFromBuffer(c, c_data);

// Print partial results so we can see the output is correct (plus or minus
// some error due to BFP16 precision)
std::cout << "Partial results: (note we are running under BFP16. It's going "
"to be less accurate)\n";
size_t element_per_core = constants::TILE_HW * test_config.num_tiles_per_core;
size_t print_per_core = std::min((size_t)10, element_per_core);

for (int core = 0; core < test_config.num_cores; ++core) {
const auto core_offset = core * element_per_core;
std::cout << "Core (0, " << core << "):\n";
for (int index = 0; index < print_per_core; index++) {
const auto i = core_offset + index;
std::cout << "index " << i << " " << a_data[i].to_float() << " + " << b_data[i].to_float() << " = "
<< c_data[i].to_float() << "\n";
}
std::cout << std::endl;
}
std::cout << std::flush;

// Finally, we close the device.
CloseDevice(device);
return 0;
}
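The example prints a few values per core rather than asserting correctness. A minimal host-side check, not part of this PR, could recompute the sums and compare within bfloat16 tolerance (this sketch assumes <cmath> and <algorithm> are available; all other names come from the program above):

    // Hypothetical verification: recompute the sum on the host and compare.
    bool ok = true;
    for (size_t i = 0; i < c_data.size(); i++) {
        float expected = a_data[i].to_float() + b_data[i].to_float();
        float got = c_data[i].to_float();
        // bfloat16 carries ~3 decimal digits, so use a loose relative tolerance.
        if (std::abs(got - expected) > 0.1f * std::max(1.0f, std::abs(expected))) {
            std::cout << "Mismatch at index " << i << ": expected " << expected
                      << ", got " << got << "\n";
            ok = false;
        }
    }
    std::cout << (ok ? "All results match" : "Verification failed") << std::endl;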