#16469 Add sharding to vecadd example #17011

Merged · 1 commit · Jan 23, 2025
2 changes: 1 addition & 1 deletion tt_metal/programming_examples/CMakeLists.txt
@@ -13,7 +13,7 @@ set(PROGRAMMING_EXAMPLES_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/matmul_single_core/matmul_single_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/pad/pad_multi_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sharding/shard_data_rm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vecadd_multi_core/vecadd_multi_core.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vecadd_sharding/vecadd_sharding.cpp
)

include(${PROJECT_SOURCE_DIR}/cmake/helper_functions.cmake)
74 changes: 74 additions & 0 deletions tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp
@@ -0,0 +1,74 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

#include "compute_kernel_api.h"
#include "compute_kernel_api/common.h"
#include "compute_kernel_api/eltwise_binary.h"
#include "compute_kernel_api/tile_move_copy.h"
#include <cstdint>

namespace NAMESPACE {
void MAIN {
// We are going to read from these two circular buffers
constexpr auto cb_in0 = get_compile_time_arg_val(0);
constexpr auto cb_in1 = get_compile_time_arg_val(1);
// and write to the output circular buffer
constexpr auto cb_out0 = get_compile_time_arg_val(2);

uint32_t num_tile = get_arg_val<uint32_t>(0);

// The destination register.
// Quoting the docs: "This register is an array of 16 tiles of 32x32 elements
// each." If you are familiar with the concept of a rotating register file
// from computer architecture, think of it like that. Later on we will ensure
// that registers are free, then submit compute to the FPU/SFPU that writes
// to the register.
constexpr uint32_t dst_reg = 0;

// Tell the SFPU that we will be using circular buffers c_in0, c_in1 and
// c_out0 to perform the computation.
binary_op_init_common(cb_in0, cb_in1, cb_out0);
// And we are going to add tiles. This function would only need to be called
// again if we switched to a different operation. Since we are only adding
// tiles, it is called once, before the loop.
add_tiles_init();

// Loop over the assigned tiles and perform the computation
for (uint32_t i = 0; i < num_tile; i++) {
// IMPORTANT: since there is no reader kernel and the data is already in the
// circular buffers, do not call cb_wait_front() here. Nothing ever pushes to
// these buffers, so the call would block and the kernel would hang forever.

// Make sure there is a valid MATH thread register we can use.
tile_regs_acquire();

// Add the tiles from the input circular buffers and write the result to
// the destination register
add_tiles(cb_in0, cb_in1, 0, 0, dst_reg);

// release lock on DST register by MATH thread
tile_regs_commit();

cb_pop_front(cb_in0, 1);
cb_pop_front(cb_in1, 1);

// acquire an exclusive lock on the DST register for the PACK thread.
// make sure MATH thread has committed the DST register earlier
tile_regs_wait();

// Copy the result from adding the tiles to the output circular buffer
pack_tile(dst_reg, cb_out0);

// release lock on DST register by PACK thread
tile_regs_release();

// There is no need to call cb_reserve_back(cb_out0, 1) because the output
// circular buffer points at an already-allocated L1 buffer, although
// calling it would do no harm.

// Mark the output tile as ready.
cb_push_back(cb_out0, 1);
}
}
} // namespace NAMESPACE
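For contrast, this is roughly what the same compute loop looks like in the conventional (non-sharded) examples, where a reader kernel streams tiles from DRAM into the circular buffers and a writer kernel drains the output. This is a hypothetical sketch for comparison, not part of this PR; the synchronization calls are exactly the ones the sharded kernel above deliberately omits:

    // Hypothetical non-sharded variant: a reader kernel feeds cb_in0/cb_in1,
    // so the compute kernel must synchronize with it.
    for (uint32_t i = 0; i < num_tile; i++) {
        cb_wait_front(cb_in0, 1);    // block until the reader pushes a tile
        cb_wait_front(cb_in1, 1);
        cb_reserve_back(cb_out0, 1); // block until the writer frees a slot

        tile_regs_acquire();
        add_tiles(cb_in0, cb_in1, 0, 0, dst_reg);
        tile_regs_commit();

        cb_pop_front(cb_in0, 1);
        cb_pop_front(cb_in1, 1);

        tile_regs_wait();
        pack_tile(dst_reg, cb_out0);
        tile_regs_release();

        cb_push_back(cb_out0, 1);    // signal the writer kernel
    }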
211 changes: 211 additions & 0 deletions tt_metal/programming_examples/vecadd_sharding/vecadd_sharding.cpp
@@ -0,0 +1,211 @@
// SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0

// This is an advanced example compared to the single-core vector-add example
// in the contributed folder. It illustrates sharding tensor inputs directly
// into the L1 memory of multiple cores, then performing the vector addition
// tile by tile. Because the data is sharded into L1, DRAM is not involved:
// no data copies are needed and no reader or writer kernels are required.

#include <tt-metalium/bfloat16.hpp>
#include <tt-metalium/core_coord.hpp>
#include <tt-metalium/host_api.hpp>
#include <tt-metalium/device_impl.hpp>
#include <tt-metalium/tt_metal.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <random>
#include <set>
#include <string>
#include <string_view>
#include <unordered_map>
#include <variant>
#include <vector>

using namespace tt;
using namespace tt::tt_metal;

using CoreSpec = std::variant<CoreCoord, CoreRange, CoreRangeSet>;

// The sharding configuration is defined by the following struct.
struct L1Config {
L1Config(TensorMemoryLayout layout, uint32_t cores_height, uint32_t cores_width) :
buffer_layout(layout), num_cores_height(cores_height), num_cores_width(cores_width) {}

TensorMemoryLayout buffer_layout;
uint32_t num_cores_height;
uint32_t num_cores_width;

// The following sharding parameters are hardcoded for this example.
tt::DataFormat l1_data_format = tt::DataFormat::Float16_b;
uint32_t element_size = 2;
uint32_t num_tiles_per_core_height = 2;
uint32_t num_tiles_per_core_width = 2;

// The following parameters are derived from the configuration above.
uint32_t num_cores = num_cores_height * num_cores_width;
uint32_t num_tiles_per_core = num_tiles_per_core_height * num_tiles_per_core_width;
uint32_t size_bytes = num_cores_height * num_tiles_per_core_height * tt::constants::TILE_HEIGHT * num_cores_width *
num_tiles_per_core_width * tt::constants::TILE_WIDTH * element_size;
uint32_t page_size_bytes = tt::constants::TILE_HW * element_size;
CoreRange cores = CoreRange(CoreCoord(0, 0), CoreCoord(0, num_cores - 1));
ShardSpecBuffer shard_spec() const {
return ShardSpecBuffer(
CoreRangeSet(std::set<CoreRange>({cores})),
{(uint32_t)num_tiles_per_core_height * tt::constants::TILE_HEIGHT,
(uint32_t)num_tiles_per_core_width * tt::constants::TILE_WIDTH},
ShardOrientation::ROW_MAJOR,
{tt::constants::TILE_HEIGHT, tt::constants::TILE_WIDTH},
// Tensor shape in tiles: (total tile rows, total tile columns).
{num_cores_height * num_tiles_per_core_height,
num_cores_width * num_tiles_per_core_width});
}
};
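// To make the numbers concrete: with the "height" configuration used in
// main() below (4x1 cores, 2x2 tiles of 32x32 bfloat16 elements per core),
// the full tensor is (4 * 2 * 32) x (1 * 2 * 32) = 256 x 64 elements, so
// size_bytes = 256 * 64 * 2 = 32768 and page_size_bytes = 1024 * 2 = 2048.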

std::shared_ptr<Buffer> MakeShardedL1BufferBFP16(IDevice* device, const L1Config& test_config) {
return CreateBuffer(tt::tt_metal::ShardedBufferConfig{
.device = device,
.size = test_config.size_bytes,
.page_size = test_config.page_size_bytes,
.buffer_layout = test_config.buffer_layout,
.shard_parameters = test_config.shard_spec()});
}

CBHandle MakeCircularBufferBFP16(
Program& program, const CoreSpec& core, tt::CBIndex cb, uint32_t n_tiles, const std::shared_ptr<Buffer>& l1_buf) {
constexpr uint32_t tile_size = sizeof(bfloat16) * tt::constants::TILE_HW;
CircularBufferConfig cb_src0_config = CircularBufferConfig(n_tiles * tile_size, {{cb, tt::DataFormat::Float16_b}})
.set_page_size(cb, tile_size)
// IMPORTANT: assign the L1 buffer's address to the circular buffer directly
// so that no extra allocation or data copy takes place
.set_globally_allocated_address(*l1_buf);
return CreateCircularBuffer(program, core, cb_src0_config);
}
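// For contrast (hypothetical, not part of this example): a conventional
// circular buffer omits set_globally_allocated_address(), so the allocator
// reserves fresh L1 space for it and a reader kernel must then copy data in:
//
//   CircularBufferConfig cb_config =
//       CircularBufferConfig(n_tiles * tile_size, {{cb, tt::DataFormat::Float16_b}})
//           .set_page_size(cb, tile_size);
//   // No set_globally_allocated_address(): the CB owns its own allocation.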

std::string next_arg(int& i, int argc, char** argv) {
if (i + 1 >= argc) {
std::cerr << "Expected argument after " << argv[i] << std::endl;
exit(1);
}
return argv[++i];
}

void help(std::string_view program_name) {
std::cout << "Usage: " << program_name << " [options]\n";
std::cout << "This program demonstrates how to add two vectors using "
"tt-Metalium.\n";
std::cout << "\n";
std::cout << "Options:\n";
std::cout << " --device, -d <device_id> Specify the device to run the "
"program on. Default is 0.\n";
std::cout << " --sharding_type, -s <sharding> Specify the sharding type "
"options are height, width, or block. Default is height.\n";
exit(0);
}

int main(int argc, char** argv) {
// Use a fixed seed for reproducibility and deterministic results.
int seed = 0x1234567;
int device_id = 0;
std::string sharding_type = "height";

// Sharding configurations: a 4x4 grid of bfloat16 tiles, with each core holding 2x2 tiles, sharded across 4 cores.
const std::unordered_map<std::string_view, L1Config> test_configs{
{"height", {TensorMemoryLayout::HEIGHT_SHARDED, 4, 1}},
{"width", {TensorMemoryLayout::WIDTH_SHARDED, 1, 4}},
{"block", {TensorMemoryLayout::BLOCK_SHARDED, 2, 2}},
};

// Quick and dirty argument parsing.
for (int i = 1; i < argc; i++) {
std::string_view arg = argv[i];
if (arg == "--device" || arg == "-d") {
device_id = std::stoi(next_arg(i, argc, argv));
} else if (arg == "--help" || arg == "-h") {
help(argv[0]);
return 0;
} else if (arg == "--sharding_type" || arg == "-s") {
sharding_type = next_arg(i, argc, argv);
if (not test_configs.contains(sharding_type)) {
std::cout << "Invalid sharding type: " << sharding_type << std::endl;
help(argv[0]);
return 1;
}
} else {
std::cout << "Unknown argument: " << arg << std::endl;
help(argv[0]);
}
}

IDevice* device = CreateDevice(device_id);
Program program = CreateProgram();

std::cout << "Sharding type: " << sharding_type << std::endl;
const auto& test_config = test_configs.at(sharding_type);

// Create the input and output buffers.
auto a = MakeShardedL1BufferBFP16(device, test_config);
auto b = MakeShardedL1BufferBFP16(device, test_config);
auto c = MakeShardedL1BufferBFP16(device, test_config);

std::mt19937 rng(seed);
auto a_data = create_random_vector_of_bfloat16_native(test_config.size_bytes, 10, rng());
auto b_data = create_random_vector_of_bfloat16_native(test_config.size_bytes, 10, rng());

auto cb_a =
MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_0, test_config.num_tiles_per_core, a);
auto cb_b =
MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_1, test_config.num_tiles_per_core, b);
auto cb_c =
MakeCircularBufferBFP16(program, test_config.cores, tt::CBIndex::c_2, test_config.num_tiles_per_core, c);

auto compute = CreateKernel(
program,
"tt_metal/programming_examples/vecadd_sharding/kernels/add_sharding.cpp",
test_config.cores,
ComputeConfig{
.math_approx_mode = false,
// pass in compile time arguments
.compile_args = {tt::CBIndex::c_0, tt::CBIndex::c_1, tt::CBIndex::c_2},
.defines = {}});

// copy data from host to L1 directly
detail::WriteToBuffer(a, a_data);
detail::WriteToBuffer(b, b_data);

for (int i = 0; i < test_config.num_cores; ++i) {
// Set runtime arguments for each core.
CoreCoord core = {0, i};
SetRuntimeArgs(program, compute, core, {test_config.num_tiles_per_core});
}

CommandQueue& cq = device->command_queue();
// Enqueue the program and block until it finishes (blocking = true).
EnqueueProgram(cq, program, true);

std::cout << "Kernel execution finished" << std::endl;

// Read the output buffer.
std::vector<bfloat16> c_data;
detail::ReadFromBuffer(c, c_data);

// Print partial results so we can see the output is correct (plus or minus
// some error due to BFP16 precision)
std::cout << "Partial results: (note we are running under BFP16. It's going "
"to be less accurate)\n";
size_t element_per_core = constants::TILE_HW * test_config.num_tiles_per_core;
size_t print_per_core = std::min((size_t)10, element_per_core);

for (int core = 0; core < test_config.num_cores; ++core) {
const auto core_offset = core * element_per_core;
std::cout << "Core (0, " << core << "):\n";
for (int index = 0; index < print_per_core; index++) {
const auto i = core_offset + index;
std::cout << "index " << i << " " << a_data[i].to_float() << " + " << b_data[i].to_float() << " = "
<< c_data[i].to_float() << "\n";
}
std::cout << std::endl;
}
std::cout << std::flush;

// Finally, we close the device.
CloseDevice(device);
return 0;
}
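The example prints a few values per core rather than asserting correctness. A minimal host-side check, not part of this PR, could recompute the sums and compare within bfloat16 tolerance (this sketch assumes <cmath> and <algorithm> are available; all other names come from the program above):

    // Hypothetical verification: recompute the sum on the host and compare.
    bool ok = true;
    for (size_t i = 0; i < c_data.size(); i++) {
        float expected = a_data[i].to_float() + b_data[i].to_float();
        float got = c_data[i].to_float();
        // bfloat16 carries ~3 decimal digits, so use a loose relative tolerance.
        if (std::abs(got - expected) > 0.1f * std::max(1.0f, std::abs(expected))) {
            std::cout << "Mismatch at index " << i << ": expected " << expected
                      << ", got " << got << "\n";
            ok = false;
        }
    }
    std::cout << (ok ? "All results match" : "Verification failed") << std::endl;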