diff --git a/tt_metal/programming_examples/CMakeLists.txt b/tt_metal/programming_examples/CMakeLists.txt
index 4de55fdd190..dfbd663921f 100644
--- a/tt_metal/programming_examples/CMakeLists.txt
+++ b/tt_metal/programming_examples/CMakeLists.txt
@@ -13,7 +13,7 @@ set(PROGRAMMING_EXAMPLES_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/matmul_single_core/matmul_single_core.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/pad/pad_multi_core.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/sharding/shard_data_rm.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/vecadd_multi_core/vecadd_multi_core.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/vecadd_sharding/vecadd_sharding.cpp
 )
 
 include(${PROJECT_SOURCE_DIR}/cmake/helper_functions.cmake)
diff --git a/tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp b/tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp
new file mode 100644
index 00000000000..f607877abfa
--- /dev/null
+++ b/tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp
@@ -0,0 +1,74 @@
+// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "compute_kernel_api.h"
+#include "compute_kernel_api/common.h"
+#include "compute_kernel_api/eltwise_binary.h"
+#include "compute_kernel_api/tile_move_copy.h"
+#include <cstdint>
+
+namespace NAMESPACE {
+void MAIN {
+    // We are going to read from these two circular buffers
+    constexpr auto cb_in0 = get_compile_time_arg_val(0);
+    constexpr auto cb_in1 = get_compile_time_arg_val(1);
+    // and write to the output circular buffer
+    constexpr auto cb_out0 = get_compile_time_arg_val(2);
+
+    uint32_t num_tile = get_arg_val<uint32_t>(0);
+
+    // The destination register.
+    // Quoting the docs: "This register is an array of 16 tiles of 32x32 elements
+    // each." If you are familiar with the concept of a rotating register file
+    // from computer architecture, think of it like that. Later on we will ensure
+    // that the registers are free, and then we will submit compute to the FPU/SFPU
+    // that writes to the register.
+    constexpr uint32_t dst_reg = 0;
+
+    // Tell the SFPU that we will be using circular buffers c_in0, c_in1 and
+    // c_out0 to perform the computation.
+    binary_op_init_common(cb_in0, cb_in1, cb_out0);
+    // And we are going to add tiles. This function only needs to be called again
+    // if we ever switch the operation to something else. Since we are only adding
+    // tiles, it is called once before the loop.
+    add_tiles_init();
+
+    // Loop over the assigned tiles and perform the computation
+    for (uint32_t i = 0; i < num_tile; i++) {
+        // IMPORTANT: since there is no reader kernel and the data is already in the
+        // circular buffers, do not call cb_wait_front(); nothing will ever signal it,
+        // so calling cb_wait_front() here would hang the kernel forever.
+
+        // Make sure there is a valid MATH thread register we can use.
+        tile_regs_acquire();
+
+        // Add the tiles from the input circular buffers and write the result to
+        // the destination register
+        add_tiles(cb_in0, cb_in1, 0, 0, dst_reg);
+
+        // Release the lock on the DST register held by the MATH thread
+        tile_regs_commit();
+
+        cb_pop_front(cb_in0, 1);
+        cb_pop_front(cb_in1, 1);
+
+        // Acquire an exclusive lock on the DST register for the PACK thread;
+        // make sure the MATH thread has committed the DST register earlier.
+        tile_regs_wait();
+
+        // Copy the result of adding the tiles to the output circular buffer
+        pack_tile(dst_reg, cb_out0);
+
+        // Release the lock on the DST register held by the PACK thread
+        tile_regs_release();
+
+        // There is no need to call cb_reserve_back(cb_out0, 1), because the output
+        // circular buffer points to an already allocated L1 buffer,
+        // but it does not hurt to call it.
+
+        // Mark the output tile as ready
+        cb_push_back(cb_out0, 1);
+    }
+}
+}  // namespace NAMESPACE
diff --git a/tt_metal/programming_examples/vecadd_sharding/vecadd_sharding.cpp b/tt_metal/programming_examples/vecadd_sharding/vecadd_sharding.cpp
new file mode 100644
index 00000000000..3918ec83986
--- /dev/null
+++ b/tt_metal/programming_examples/vecadd_sharding/vecadd_sharding.cpp
@@ -0,0 +1,211 @@
+// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+
+// This is a more advanced programming example than the single-core vecadd example in the
+// contributed folder. It illustrates sharding tensor inputs directly into the L1 memory of
+// multiple cores, then performing vector addition tile by tile. Because the data is sharded
+// into L1, DRAM is not involved, data copies are avoided, and reader and writer kernels are
+// not needed.
+
+#include <algorithm>
+#include <iostream>
+#include <random>
+#include <unordered_map>
+#include <variant>
+
+#include <tt-metalium/bfloat16.hpp>
+#include <tt-metalium/buffer.hpp>
+#include <tt-metalium/constants.hpp>
+#include <tt-metalium/core_coord.hpp>
+#include <tt-metalium/device.hpp>
+#include <tt-metalium/host_api.hpp>
+
+using namespace tt;
+using namespace tt::tt_metal;
+
+using CoreSpec = std::variant<CoreCoord, CoreRange, CoreRangeSet>;
+
+// The sharding configuration is defined by the following struct.
+struct L1Config {
+    L1Config(TensorMemoryLayout layout, uint32_t cores_height, uint32_t cores_width) :
+        buffer_layout(layout), num_cores_height(cores_height), num_cores_width(cores_width) {}
+
+    TensorMemoryLayout buffer_layout;
+    uint32_t num_cores_height;
+    uint32_t num_cores_width;
+
+    // The following sharding parameters are hardcoded for this example.
+    tt::DataFormat l1_data_format = tt::DataFormat::Float16_b;
+    uint32_t element_size = 2;
+    uint32_t num_tiles_per_core_height = 2;
+    uint32_t num_tiles_per_core_width = 2;
+
+    // The following sharding parameters are derived from the configuration above.
+    uint32_t num_cores = num_cores_height * num_cores_width;
+    uint32_t num_tiles_per_core = num_tiles_per_core_height * num_tiles_per_core_width;
+    uint32_t size_bytes = num_cores_height * num_tiles_per_core_height * tt::constants::TILE_HEIGHT *
+                          num_cores_width * num_tiles_per_core_width * tt::constants::TILE_WIDTH * element_size;
+    uint32_t page_size_bytes = tt::constants::TILE_HW * element_size;
+    CoreRange cores = CoreRange(CoreCoord(0, 0), CoreCoord(0, num_cores - 1));
+
+    ShardSpecBuffer shard_spec() const {
+        return ShardSpecBuffer(
+            CoreRangeSet(std::set<CoreRange>({cores})),
+            {(uint32_t)num_tiles_per_core_height * tt::constants::TILE_HEIGHT,
+             (uint32_t)num_tiles_per_core_width * tt::constants::TILE_WIDTH},
+            ShardOrientation::ROW_MAJOR,
+            {tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
+            {num_cores_height * num_tiles_per_core_height,
+             num_tiles_per_core_width * num_cores_width});
+    }
+};
+
+std::shared_ptr<Buffer> MakeShardedL1BufferBFP16(IDevice* device, const L1Config& test_config) {
+    return CreateBuffer(tt::tt_metal::ShardedBufferConfig{
+        .device = device,
+        .size = test_config.size_bytes,
+        .page_size = test_config.page_size_bytes,
+        .buffer_layout = test_config.buffer_layout,
+        .shard_parameters = test_config.shard_spec()});
+}
+
+CBHandle MakeCircularBufferBFP16(
+    Program& program, const CoreSpec& core, tt::CBIndex cb, uint32_t n_tiles, const std::shared_ptr<Buffer>& l1_buf) {
+    constexpr uint32_t tile_size = sizeof(bfloat16) * tt::constants::TILE_HW;
+    CircularBufferConfig cb_src0_config = CircularBufferConfig(n_tiles * tile_size, {{cb, tt::DataFormat::Float16_b}})
+                                              .set_page_size(cb, tile_size)
+                                              // IMPORTANT: assign the L1 buffer address to the circular buffer
+                                              // directly so that no extra allocation or data copy is needed
+                                              .set_globally_allocated_address(*l1_buf);
+    return CreateCircularBuffer(program, core, cb_src0_config);
+}
+
+std::string next_arg(int& i, int argc, char** argv) {
+    if (i + 1 >= argc) {
+        std::cerr << "Expected argument after " << argv[i] << std::endl;
+        exit(1);
+    }
+    return argv[++i];
+}
+
+void help(std::string_view program_name) {
+    std::cout << "Usage: " << program_name << " [options]\n";
+    std::cout << "This program demonstrates how to add two vectors using "
+                 "tt-Metalium.\n";
+    std::cout << "\n";
+    std::cout << "Options:\n";
+    std::cout << "  --device, -d <device_id>       Specify the device to run the "
+                 "program on. Default is 0.\n";
+    std::cout << "  --sharding_type, -s <type>     Specify the sharding type; "
+                 "options are height, width, or block. Default is height.\n";
+    exit(0);
+}
+
+int main(int argc, char** argv) {
+    // Use a fixed seed for reproducible, deterministic results.
+    int seed = 0x1234567;
+    int device_id = 0;
+    std::string sharding_type = "height";
+
+    // Sharding configurations: 16 bfloat16 tiles in total, 2x2 tiles per core, sharded across 4 cores.
+    const std::unordered_map<std::string, L1Config> test_configs{
+        {"height", {TensorMemoryLayout::HEIGHT_SHARDED, 4, 1}},
+        {"width", {TensorMemoryLayout::WIDTH_SHARDED, 1, 4}},
+        {"block", {TensorMemoryLayout::BLOCK_SHARDED, 2, 2}},
+    };
+
+    // Quick and dirty argument parsing.
+    for (int i = 1; i < argc; i++) {
+        std::string_view arg = argv[i];
+        if (arg == "--device" || arg == "-d") {
+            device_id = std::stoi(next_arg(i, argc, argv));
+        } else if (arg == "--help" || arg == "-h") {
+            help(argv[0]);
+            return 0;
+        } else if (arg == "--sharding_type" || arg == "-s") {
+            sharding_type = next_arg(i, argc, argv);
+            if (not test_configs.contains(sharding_type)) {
+                std::cout << "Invalid sharding type: " << sharding_type << std::endl;
+                help(argv[0]);
+                return 1;
+            }
+        } else {
+            std::cout << "Unknown argument: " << arg << std::endl;
+            help(argv[0]);
+        }
+    }
+
+    IDevice* device = CreateDevice(device_id);
+    Program program = CreateProgram();
+
+    std::cout << "Sharding type: " << sharding_type << std::endl;
+    const auto& test_config = test_configs.at(sharding_type);
+
+    // Create the input and output buffers.
+    auto a = MakeShardedL1BufferBFP16(device, test_config);
+    auto b = MakeShardedL1BufferBFP16(device, test_config);
+    auto c = MakeShardedL1BufferBFP16(device, test_config);
+
+    std::mt19937 rng(seed);
+    auto a_data = create_random_vector_of_bfloat16_native(test_config.size_bytes, 10, rng());
+    auto b_data = create_random_vector_of_bfloat16_native(test_config.size_bytes, 10, rng());
+
+    auto cb_a =
+        MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_0, test_config.num_tiles_per_core, a);
+    auto cb_b =
+        MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_1, test_config.num_tiles_per_core, b);
+    auto cb_c =
+        MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_2, test_config.num_tiles_per_core, c);
+
+    auto compute = CreateKernel(
+        program,
+        "tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp",
+        test_config.cores,
+        ComputeConfig{
+            .math_approx_mode = false,
+            // Pass in the compile-time arguments
+            .compile_args = {tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_2},
+            .defines = {}});
+
+    // Copy the data from the host to L1 directly.
+    detail::WriteToBuffer(a, a_data);
+    detail::WriteToBuffer(b, b_data);
+
+    for (int i = 0; i < test_config.num_cores; ++i) {
+        // Set the runtime arguments for each core.
+        CoreCoord core = {0, i};
+        SetRuntimeArgs(program, compute, core, {test_config.num_tiles_per_core});
+    }
+
+    CommandQueue& cq = device->command_queue();
+    // Enqueue the program and block until it completes.
+    EnqueueProgram(cq, program, true);
+
+    std::cout << "Kernel execution finished" << std::endl;
+
+    // Read the output buffer.
+    std::vector<bfloat16> c_data;
+    detail::ReadFromBuffer(c, c_data);
+
+    // Print partial results so we can see the output is correct (plus or minus
+    // some error due to BFP16 precision).
+    std::cout << "Partial results: (note we are running under BFP16; it's going "
+                 "to be less accurate)\n";
+    size_t element_per_core = constants::TILE_HW * test_config.num_tiles_per_core;
+    size_t print_per_core = std::min((size_t)10, element_per_core);
+
+    for (int core = 0; core < test_config.num_cores; ++core) {
+        const auto core_offset = core * element_per_core;
+        std::cout << "Core (0, " << core << "):\n";
+        for (int index = 0; index < print_per_core; index++) {
+            const auto i = core_offset + index;
+            std::cout << "index " << i << " " << a_data[i].to_float() << " + " << b_data[i].to_float() << " = "
+                      << c_data[i].to_float() << "\n";
+        }
+        std::cout << std::endl;
+    }
+    std::cout << std::flush;
+
+    // Finally, close the device.
+    CloseDevice(device);
+    return 0;
+}
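
Note (not part of the diff): the example only prints the first few values per core. A full host-side golden check could be appended in main() after detail::ReadFromBuffer(c, c_data); this is a minimal sketch, assuming the example's a_data/b_data/c_data vectors, an assumed tolerance sized for bfloat16 rounding, and <cmath> for std::fabs.

    // Sketch only: compare every device result against a host reference.
    // The 0.05f relative + 0.02f absolute tolerance is an assumption for bfloat16 precision.
    bool all_close = true;
    for (size_t idx = 0; idx < c_data.size(); idx++) {
        const float expected = a_data[idx].to_float() + b_data[idx].to_float();
        const float actual = c_data[idx].to_float();
        if (std::fabs(expected - actual) > 0.05f * std::fabs(expected) + 0.02f) {
            std::cout << "Mismatch at index " << idx << ": expected " << expected << ", got " << actual << "\n";
            all_close = false;
        }
    }
    std::cout << (all_close ? "All results match the host reference." : "Some results differ.") << std::endl;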
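Note (not part of the diff): the shard geometry computed in L1Config can be checked by hand. A small compile-time sketch for the default "height" configuration (4x1 cores, 2x2 tiles of 32x32 bfloat16 elements per core), using the same constants as the example; the variable names are illustrative only.

    // Sketch only: verifies that the total buffer size equals the sum of the per-core shards.
    constexpr uint32_t TILE_H = 32, TILE_W = 32, ELEM_BYTES = 2;        // bfloat16 is 2 bytes
    constexpr uint32_t cores_h = 4, cores_w = 1;                        // "height" sharding
    constexpr uint32_t tiles_per_core_h = 2, tiles_per_core_w = 2;
    constexpr uint32_t tensor_rows = cores_h * tiles_per_core_h * TILE_H;   // 256
    constexpr uint32_t tensor_cols = cores_w * tiles_per_core_w * TILE_W;   // 64
    constexpr uint32_t total_bytes = tensor_rows * tensor_cols * ELEM_BYTES;                              // 32 KiB (L1Config::size_bytes)
    constexpr uint32_t shard_bytes = tiles_per_core_h * TILE_H * tiles_per_core_w * TILE_W * ELEM_BYTES;  // 8 KiB per core
    static_assert(total_bytes == cores_h * cores_w * shard_bytes, "shards must tile the whole tensor");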