#0: Remove AddConfigBuffer for programs and update use cases to store in program cache instead
tt-aho committed Jan 22, 2025
1 parent 8aca869 commit 244460b
Showing 11 changed files with 51 additions and 43 deletions.
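
This commit drops detail::AddConfigBuffer, which let a Program own config buffers directly; instead each op keeps its config buffers alive by capturing the shared_ptr<Buffer> in its override_runtime_arguments_callback or by storing it in the cached program's shared variables, tying the buffer's lifetime to the program cache entry. A minimal standalone sketch of that ownership pattern, using stand-in Buffer and CachedProgram types rather than the real tt-metal classes:

#include <functional>
#include <iostream>
#include <memory>

struct Buffer {  // stand-in for a device config buffer
    ~Buffer() { std::cout << "buffer freed\n"; }
};

struct CachedProgram {  // stand-in for a program cache entry
    std::function<void()> override_runtime_arguments_callback;
};

CachedProgram make_program() {
    auto config_buffer = std::make_shared<Buffer>();
    // Capturing the shared_ptr ties the buffer's lifetime to the
    // callback, i.e. to the cached program that stores it.
    return {[config_buffer] { /* override runtime args here */ }};
}

int main() {
    CachedProgram cached = make_program();  // buffer alive while cached
    cached = CachedProgram{};               // evicting the entry frees it
}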
4 changes: 1 addition & 3 deletions tt_metal/api/tt-metalium/program_impl.hpp
@@ -65,8 +65,7 @@ namespace detail{
 void ValidateCircularBufferRegion(const Program &program, const IDevice* device);
 KernelHandle AddKernel (Program &program, const std::shared_ptr<Kernel>& kernel, const HalProgrammableCoreType core_type);
 std::shared_ptr<Kernel> GetKernel(const Program &program, KernelHandle kernel_id);
-std::shared_ptr<CircularBuffer> GetCircularBuffer(const Program &program, CBHandle id);
-void AddConfigBuffer(Program &program, const std::shared_ptr<Buffer>& config_buffer);
+std::shared_ptr<CircularBuffer> GetCircularBuffer(const Program& program, CBHandle id);
 
 class Internal_;
 }
@@ -228,7 +227,6 @@ class Program {
 std::unordered_map<uint64_t, ProgramCommandSequence> &get_cached_program_command_sequences() noexcept;
 bool kernel_binary_always_stored_in_ringbuffer();
 
-friend void detail::AddConfigBuffer(Program &program, const std::shared_ptr<Buffer>& config_buffer);
 friend void program_dispatch::assemble_device_commands(
 ProgramCommandSequence& program_command_sequence, Program& program, IDevice* device, SubDeviceId sub_device_id);
 template<typename T> friend void program_dispatch::finalize_program_offsets(T&, IDevice*);
11 changes: 0 additions & 11 deletions tt_metal/impl/program/program.cpp
@@ -227,8 +227,6 @@ class Program_ {
 std::vector<std::vector<std::shared_ptr<KernelGroup>>> kernel_groups_;
 std::vector<std::vector<uint8_t>> core_to_kernel_group_index_table_;
 
-std::vector<std::shared_ptr<Buffer>> config_buffers_;
-
 std::vector<ProgramConfig> program_configs_;
 // Counts how much space is needed for each core + each launch buffer msg queue.
 std::vector<uint32_t> program_config_sizes_;
@@ -252,9 +250,6 @@ class Program_ {
 
 void add_semaphore(const CoreRangeSet & crs, uint32_t semaphore_id, uint32_t init_value, CoreType core_type);
 
-friend void AddConfigBuffer(Program &program, const std::shared_ptr<Buffer>& config_buffer);
-void add_config_buffer(const std::shared_ptr<Buffer>& config_buffer);
-
 // Ensures that statically allocated circular buffers do not grow into L1 buffer space
 void validate_circular_buffer_region(const IDevice* device);
 
@@ -297,10 +292,6 @@ void ValidateCircularBufferRegion(const Program &program, const IDevice* device)
 program.pimpl_->validate_circular_buffer_region(device);
 }
 
-void AddConfigBuffer(Program &program, const std::shared_ptr<Buffer>& config_buffer) {
-program.pimpl_->add_config_buffer(std::move(config_buffer));
-}
-
 void EnablePersistentKernelCache() { enable_persistent_kernel_cache = true; }
 
 void DisablePersistentKernelCache() { enable_persistent_kernel_cache = false; }
@@ -894,8 +885,6 @@ void Program::add_semaphore(const CoreRangeSet &crs, uint32_t semaphore_id, uint
 pimpl_->add_semaphore(crs, semaphore_id, init_value, core_type);
 }
 
-void detail::Program_::add_config_buffer(const std::shared_ptr<Buffer>& config_buffer) { config_buffers_.emplace_back(config_buffer); }
-
 std::vector<std::vector<CoreCoord>> detail::Program_::logical_cores() const {
 std::vector<std::vector<CoreCoord>> cores_in_program;
 std::vector<std::set<CoreCoord>> unique_cores;
@@ -439,6 +439,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 uint32_t out_subblock_h_ntiles = block_config.out_subblock_h_ntiles;
 uint32_t out_subblock_w_ntiles = block_config.out_subblock_w_ntiles;
 
+auto conv_reader_indices_buffer = conv_reader_indices.value().device_buffer();
+
 // out_subblock_h_ntiles = 8;
 
 tt::DataFormat act_df = tt_metal::datatype_to_dataformat_converter(a.get_dtype());
@@ -1239,7 +1241,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 CircularBufferConfig cb_for_reader_indices_config =
 CircularBufferConfig(out_block_h_datums * 2, {{cb_for_reader_indices, tt::DataFormat::Float16_b}})
 .set_page_size(cb_for_reader_indices, out_block_h_datums * 2);
-cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices.value().buffer());
+cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices_buffer);
 auto cb_for_reader_indices_id =
 tt_metal::CreateCircularBuffer(program, all_cores, cb_for_reader_indices_config);
 
@@ -1714,6 +1716,7 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 
 auto mcast_sender_cores_vec = grid_to_cores(mcast_sender_cores.start_coord, mcast_sender_cores.end_coord, true);
 auto mcast_receiver_cores_vec = corerange_to_cores(mcast_receiver_cores, std::nullopt, true);
+// Capture conv_reader_indices_buffer to cache this with the program
 auto override_runtime_arguments_callback =
 [reader_kernel_id = reader_id,
 mcast_sender_cores = mcast_sender_cores_vec,
@@ -1725,7 +1728,8 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_impl(
 total_active_num_cores = total_active_num_cores,
 num_cores_x = num_cores_x,
 num_cores_y = num_cores_y,
-has_bias = has_bias](
+has_bias = has_bias,
+conv_reader_indices_buffer = conv_reader_indices_buffer](
 const void* operation,
 Program& program,
 const std::vector<Tensor>& input_tensors,
@@ -1836,8 +1840,6 @@ operation::ProgramWithCallbacks multi_core_optimized_conv_sharded_v2_new(
 conv_reader_indices_tensor = ttnn::operations::sliding_window::move_config_tensor_to_device(
 conv_reader_indices_tensor, parallel_config, is_block_sharded, a.device());
 
-// add config tensor to program
-tt::tt_metal::detail::AddConfigBuffer(program, conv_reader_indices_tensor.device_buffer());
 if (parallel_config.shard_scheme == TensorMemoryLayout::WIDTH_SHARDED) {
 return multi_core_optimized_conv_width_sharded_v2_impl(
 program,
@@ -739,10 +739,12 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh
 act_block_num_tiles_split,
 act_tile_size);
 
+auto conv_reader_indices_buffer = conv_reader_indices.value().device_buffer();
+
 CircularBufferConfig cb_for_reader_indices_config =
 CircularBufferConfig(out_block_h_datums * 2, {{cb_for_reader_indices, tt::DataFormat::Float16_b}})
 .set_page_size(cb_for_reader_indices, out_block_h_datums * 2);
-cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices.value().buffer());
+cb_for_reader_indices_config.set_globally_allocated_address(*conv_reader_indices_buffer);
 auto cb_for_reader_indices_id = tt_metal::CreateCircularBuffer(program, all_cores, cb_for_reader_indices_config);
 
 if (has_bias) {
@@ -874,11 +876,13 @@ tt::tt_metal::operation::ProgramWithCallbacks multi_core_optimized_conv_width_sh
 (uint32_t)(core_index < output_num_cores)});
 }
 
-auto empty_callback = [](const void* operation,
-Program& program,
-const std::vector<Tensor>& input_tensors,
-const std::vector<std::optional<const Tensor>>& optional_input_tensors,
-const std::vector<Tensor>& output_tensors) {};
+// Capture conv_reader_indices_buffer to cache this with the program
+auto empty_callback = [conv_reader_indices_buffer](
+const void* operation,
+Program& program,
+const std::vector<Tensor>& input_tensors,
+const std::vector<std::optional<const Tensor>>& optional_input_tensors,
+const std::vector<Tensor>& output_tensors) {};
 return {.program = std::move(program), .override_runtime_arguments_callback = empty_callback};
 }
 
@@ -99,7 +99,8 @@ operation::ProgramWithCallbacks UntilizeWithHaloV2::create_program(
 remote_config,
 remote_read_,
 transpose_mcast_,
-output_tensor)};
+output_tensor,
+/*capture_buffers=*/false)};
 }
 
 } // namespace ttnn::operations::data_movement
@@ -30,7 +30,8 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
 const Tensor& remote_config,
 const bool remote_read,
 const bool transpose_mcast,
-Tensor& output_tensor) {
+Tensor& output_tensor,
+const bool capture_buffers) {
 IDevice* device = input_tensor.device();
 Buffer* src_buffer = input_tensor.buffer();
 Buffer* dst_buffer = output_tensor.buffer();
@@ -143,22 +144,22 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
 TT_ASSERT(local_config.get_dtype() == DataType::UINT16);
 TT_ASSERT(remote_config.get_dtype() == DataType::UINT16);
 
-Buffer* padding_config_buffer = padding_config.buffer();
+auto padding_config_buffer = padding_config.device_buffer();
 const uint32_t num_cores = all_cores.num_cores();
 auto padding_config_cb_config =
 CircularBufferConfig(padding_config_buffer->size() / num_cores, {{padding_config_cb_id, kernel_config_df}})
 .set_page_size(padding_config_cb_id, padding_config_buffer->page_size())
 .set_globally_allocated_address(*padding_config_buffer);
 CBHandle padding_config_cb = CreateCircularBuffer(program, all_cores, padding_config_cb_config);
 
-Buffer* local_config_buffer = local_config.buffer();
+auto local_config_buffer = local_config.device_buffer();
 auto local_config_cb_config =
 CircularBufferConfig(local_config_buffer->size() / num_cores, {{local_config_cb_id, kernel_config_df}})
 .set_page_size(local_config_cb_id, local_config_buffer->page_size())
 .set_globally_allocated_address(*local_config_buffer);
 CBHandle local_config_cb = CreateCircularBuffer(program, all_cores, local_config_cb_config);
 
-Buffer* remote_config_buffer = remote_config.buffer();
+auto remote_config_buffer = remote_config.device_buffer();
 auto remote_config_cb_config =
 CircularBufferConfig(remote_config_buffer->size() / num_cores, {{remote_config_cb_id, kernel_config_df}})
 .set_page_size(remote_config_cb_id, remote_config_buffer->page_size())
@@ -212,7 +213,20 @@ operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
 DataMovementConfig{
 .processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default, .compile_args = reader_ct_args});
 
-auto override_runtime_arguments_callback = [src_cb, out_cb, padding_config_cb, local_config_cb, remote_config_cb](
+if (!capture_buffers) {
+padding_config_buffer = nullptr;
+local_config_buffer = nullptr;
+remote_config_buffer = nullptr;
+}
+// Capture padding_config_buffer, local_config_buffer, remote_config_buffer to cache this with the program
+auto override_runtime_arguments_callback = [src_cb,
+out_cb,
+padding_config_cb,
+local_config_cb,
+remote_config_cb,
+padding_config_buffer,
+local_config_buffer,
+remote_config_buffer](
 const void* operation,
 Program& program,
 const std::vector<Tensor>& input_tensors,
@@ -19,6 +19,8 @@ tt::tt_metal::operation::ProgramWithCallbacks untilize_with_halo_multi_core_v2(
 const Tensor& remote_config,
 const bool remote_read,
 const bool transpose_mcast,
-Tensor& output_tensor);
+Tensor& output_tensor,
+const bool capture_buffers); // Used by halo op to cache internally created config buffers with the program
+// Untilize with Halo V2 op takes them as inputs from the user, so doesn't capture
 
 } // namespace ttnn::operations::data_movement::detail
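
The capture_buffers flag gates whether the callback takes ownership: the halo op creates the config tensors internally and must cache them, while Untilize with Halo V2 receives them as user inputs and must not. A sketch of that conditional-capture idiom under simplified, hypothetical types (make_callback and its signature are illustrative, not the real API):

#include <functional>
#include <memory>

struct Buffer {};

// When capture_buffers is false, the shared_ptrs are reset before the
// lambda is built, so the callback holds no ownership and the caller
// keeps full control of the buffers' lifetimes.
std::function<void()> make_callback(std::shared_ptr<Buffer> padding_config_buffer,
                                    std::shared_ptr<Buffer> local_config_buffer,
                                    bool capture_buffers) {
    if (!capture_buffers) {
        padding_config_buffer = nullptr;
        local_config_buffer = nullptr;
    }
    return [padding_config_buffer, local_config_buffer] { /* override runtime args */ };
}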
@@ -40,7 +40,7 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_
 // This should allocate a DRAM buffer on the device
 IDevice* device = input.device();
 tt::tt_metal::Buffer* src_dram_buffer = input.buffer();
-tt::tt_metal::Buffer* reader_indices_buffer = reader_indices.buffer();
+auto reader_indices_buffer = reader_indices.device_buffer();
 tt::tt_metal::Buffer* dst_dram_buffer = output.buffer();
 
 const tt::tt_metal::LegacyShape input_shape = input.get_legacy_shape();
@@ -376,14 +376,16 @@ Pool2D::MultiCore::cached_program_t pool2d_multi_core_sharded_with_halo_v2_impl_
 
 auto compute_kernel = CreateKernel(program, compute_kernel_fname, core_range, compute_config);
 
+// Capture reader_indices_buffer to cache this with the program
 return {
 std::move(program),
 {.reader0_kernel = reader0_kernel,
 .reader1_kernel = reader1_kernel,
 .raw_in_cb = raw_in_cb,
 .cb_out = cb_out,
 .ncores = ncores,
-.ncores_w = ncores_w}};
+.ncores_w = ncores_w,
+.reader_indices_buffer = reader_indices_buffer}};
 }
 
 Pool2D::MultiCore::cached_program_t Pool2D::MultiCore::create(
@@ -418,8 +420,6 @@ Pool2D::MultiCore::cached_program_t Pool2D::MultiCore::create(
 auto reader_indices_on_device =
 sliding_window::move_config_tensor_to_device(reader_indices, parallel_config, is_block_sharded, input.device());
 
-tt::tt_metal::detail::AddConfigBuffer(program, reader_indices_on_device.device_buffer());
-
 auto in_n = sliding_window_config.batch_size;
 auto in_h = sliding_window_config.input_hw.first;
 auto in_w = sliding_window_config.input_hw.second;
1 change: 1 addition & 0 deletions ttnn/cpp/ttnn/operations/pool/generic/device/pool_op.hpp
@@ -46,6 +46,7 @@ struct Pool2D {
 CBHandle cb_out;
 uint32_t ncores;
 uint32_t ncores_w;
+std::shared_ptr<Buffer> reader_indices_buffer;
 };
 
 using cached_program_t = ttnn::device_operation::CachedProgram<shared_variables_t>;
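
Storing the shared_ptr<Buffer> in shared_variables_t ties the reader-indices buffer's lifetime to the cached program entry, so cache hits reuse a still-valid device allocation. A small sketch of the idea with stand-in types (the real CachedProgram template lives in ttnn::device_operation and differs in detail):

#include <memory>

struct Buffer {};
struct Program {};

// Stand-in for ttnn::device_operation::CachedProgram<shared_variables_t>:
// the cache entry owns both the program and its shared variables, so a
// shared_ptr<Buffer> member survives exactly as long as the entry does.
template <typename SharedVariables>
struct CachedProgram {
    Program program;
    SharedVariables shared_variables;
};

struct SharedVariables {
    std::shared_ptr<Buffer> reader_indices_buffer;
};

int main() {
    CachedProgram<SharedVariables> entry{
        Program{}, {std::make_shared<Buffer>()}};  // buffer cached with program
    (void)entry;
}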
@@ -242,10 +242,9 @@ operation::ProgramWithCallbacks upsample_multi_core(
 ShardSpec config_shard_spec(input.shard_spec().value().grid, shard_shape, config_tensor_shard_orientation);
 MemoryConfig memory_config{input.memory_config().memory_layout, BufferType::L1_SMALL, config_shard_spec};
 auto config_tensor_device = config_tensor.to(device, memory_config);
-tt::tt_metal::detail::AddConfigBuffer(program, config_tensor_device.device_buffer());
 
 tt::DataFormat config_df = tt::DataFormat::RawUInt16;
-Buffer* config_buffer = config_tensor_device.buffer();
+auto config_buffer = config_tensor_device.device_buffer();
 auto config_buffer_page_size = config_buffer->page_size();
 uint32_t config_cb_id = CBIndex::c_6;
 auto config_cb_config = CircularBufferConfig(config_buffer_page_size, {{config_cb_id, config_df}})
@@ -311,7 +310,8 @@ operation::ProgramWithCallbacks upsample_multi_core(
 TT_THROW("Unsupported memory layout");
 }
 
-auto override_runtime_args_callback = [writer_kernel, cb_src0, out_cb, config_cb](
+// Capture config_buffer to cache this with the program
+auto override_runtime_args_callback = [writer_kernel, cb_src0, out_cb, config_cb, config_buffer](
 const void* operation,
 Program& program,
 const std::vector<Tensor>& input_tensors,
@@ -121,10 +121,6 @@ operation::ProgramWithCallbacks HaloDeviceOperation::create_program(
 
 Program program = CreateProgram();
 
-tt::tt_metal::detail::AddConfigBuffer(program, pad_config_device_tensor.device_buffer());
-tt::tt_metal::detail::AddConfigBuffer(program, local_config_device_tensor.device_buffer());
-tt::tt_metal::detail::AddConfigBuffer(program, remote_config_device_tensor.device_buffer());
-
 return {data_movement::detail::untilize_with_halo_multi_core_v2(
 program,
 input_tensor,
@@ -136,7 +132,8 @@ operation::ProgramWithCallbacks HaloDeviceOperation::create_program(
 remote_config_device_tensor,
 remote_read_,
 transpose_mcast_,
-output_tensor)};
+output_tensor,
+/*capture_buffers=*/true)};
 }
 
 Tensor halo_op(
