diff --git a/tests/tt_metal/distributed/CMakeLists.txt b/tests/tt_metal/distributed/CMakeLists.txt index df725bed51c2..d926ba9d7a31 100644 --- a/tests/tt_metal/distributed/CMakeLists.txt +++ b/tests/tt_metal/distributed/CMakeLists.txt @@ -2,6 +2,7 @@ set(UNIT_TESTS_DISTRIBUTED_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_distributed.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_workload.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/test_mesh_allocator.cpp ) add_executable(distributed_unit_tests ${UNIT_TESTS_DISTRIBUTED_SRC}) diff --git a/tests/tt_metal/distributed/test_mesh_allocator.cpp b/tests/tt_metal/distributed/test_mesh_allocator.cpp new file mode 100644 index 000000000000..903b3d6444c4 --- /dev/null +++ b/tests/tt_metal/distributed/test_mesh_allocator.cpp @@ -0,0 +1,33 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include +#include "tests/tt_metal/tt_metal/common/multi_device_fixture.hpp" + +namespace tt::tt_metal::distributed::test { + +using MeshAllocatorTest = T3000MultiDeviceFixture; + +TEST_F(MeshAllocatorTest, BasicAllocationSanityCheck) { + const size_t allocation_size = 1024 * 8; // 1KB + const tt::tt_metal::BufferType buffer_type = tt::tt_metal::BufferType::L1; + + auto config = InterleavedBufferConfig{ + .device = mesh_device_.get(), + .size = allocation_size, + .page_size = 1024, + .buffer_type = buffer_type, + .buffer_layout = tt::tt_metal::TensorMemoryLayout::INTERLEAVED}; + + auto buffer = CreateBuffer(config); + + EXPECT_TRUE(buffer->is_allocated()); + EXPECT_EQ(buffer->size(), allocation_size); + EXPECT_EQ(buffer->buffer_type(), buffer_type); +} + +} // namespace tt::tt_metal::distributed::test diff --git a/tt_metal/api/tt-metalium/device_impl.hpp b/tt_metal/api/tt-metalium/device_impl.hpp index 4baaa11f4712..ea5a41c14821 100644 --- a/tt_metal/api/tt-metalium/device_impl.hpp +++ b/tt_metal/api/tt-metalium/device_impl.hpp @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include "device.hpp" @@ -254,7 +253,8 @@ class Device : public IDevice { static constexpr uint32_t DEFAULT_NUM_SUB_DEVICES = 1; void initialize_cluster(); - std::unique_ptr initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}); + std::unique_ptr initialize_allocator( + size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap = {}); void initialize_build(); void initialize_device_kernel_defines(); void initialize_device_bank_to_noc_tables(const HalProgrammableCoreType &core_type, CoreCoord virtual_core); diff --git a/tt_metal/api/tt-metalium/mesh_device.hpp b/tt_metal/api/tt-metalium/mesh_device.hpp index ec3ac7417182..ee682f3b5f93 100644 --- a/tt_metal/api/tt-metalium/mesh_device.hpp +++ b/tt_metal/api/tt-metalium/mesh_device.hpp @@ -15,8 +15,13 @@ #include "mesh_device_view.hpp" #include "sub_device_types.hpp" #include "span.hpp" +#include "work_executor.hpp" -namespace tt::tt_metal::distributed { +namespace tt::tt_metal { + +class SubDeviceManagerTracker; + +namespace distributed { class MeshCommandQueue; class MeshDeviceView; @@ -56,8 +61,8 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this parent_mesh_; // Submesh created with reference to parent mesh std::unique_ptr mesh_command_queue_; - - void initialize(); + std::unique_ptr sub_device_manager_tracker_; + std::unique_ptr work_executor_; // This is a reference device used to query properties that are the same for all devices in the mesh. IDevice* reference_device() const; @@ -292,7 +297,8 @@ class MeshDevice : public IDevice, public std::enable_shared_from_this l1_bank_remap = {}); }; std::ostream& operator<<(std::ostream& os, const MeshDevice& mesh_device); @@ -305,4 +311,6 @@ struct MeshSubDeviceManagerId { std::vector sub_device_manager_ids; }; -} // namespace tt::tt_metal::distributed +} // namespace distributed + +} // namespace tt::tt_metal diff --git a/tt_metal/api/tt-metalium/sub_device_manager.hpp b/tt_metal/api/tt-metalium/sub_device_manager.hpp index b2140ffab6ae..8b14b416d436 100644 --- a/tt_metal/api/tt-metalium/sub_device_manager.hpp +++ b/tt_metal/api/tt-metalium/sub_device_manager.hpp @@ -30,7 +30,8 @@ class SubDeviceManager { MAX_NUM_SUB_DEVICES <= std::numeric_limits::max(), "MAX_NUM_SUB_DEVICES must be less than or equal to the max value of SubDeviceId::Id"); // Constructor used for the default/global device - SubDeviceManager(IDevice* device, std::unique_ptr&& global_allocator); + SubDeviceManager( + IDevice* device, std::unique_ptr&& global_allocator, tt::stl::Span sub_devices); // Constructor used for regular sub-devices SubDeviceManager(tt::stl::Span sub_devices, DeviceAddr local_l1_size, IDevice* device); diff --git a/tt_metal/api/tt-metalium/sub_device_manager_tracker.hpp b/tt_metal/api/tt-metalium/sub_device_manager_tracker.hpp index f7131a42ba82..e1ad68e4a6bc 100644 --- a/tt_metal/api/tt-metalium/sub_device_manager_tracker.hpp +++ b/tt_metal/api/tt-metalium/sub_device_manager_tracker.hpp @@ -26,7 +26,8 @@ class SubDeviceManagerTracker { public: // TODO: Potentially move the global allocator creation into here instead of from the device // This creates the SubDeviceManagerTracker with a default SubDeviceManager that has the entire grid as a sub-device - SubDeviceManagerTracker(IDevice* device, std::unique_ptr&& global_allocator); + SubDeviceManagerTracker( + IDevice* device, std::unique_ptr&& global_allocator, tt::stl::Span sub_devices); SubDeviceManagerTracker(const SubDeviceManagerTracker& other) = delete; SubDeviceManagerTracker& operator=(const SubDeviceManagerTracker& other) = delete; @@ -58,6 +59,9 @@ class SubDeviceManagerTracker { // default case to not affect performance SubDeviceManagerId get_default_sub_device_manager_id() const; + std::optional lowest_occupied_compute_l1_address( + tt::stl::Span sub_device_ids = {}) const; + private: void reset_sub_device_state(const std::unique_ptr& sub_device_manager); diff --git a/tt_metal/api/tt-metalium/sub_device_types.hpp b/tt_metal/api/tt-metalium/sub_device_types.hpp index 574c6f699d78..01912af60bf2 100644 --- a/tt_metal/api/tt-metalium/sub_device_types.hpp +++ b/tt_metal/api/tt-metalium/sub_device_types.hpp @@ -33,6 +33,7 @@ struct SubDeviceId { return *this; } + bool operator<(size_t other) const { return id < other; } bool operator==(const SubDeviceId& other) const { return id == other.id; } bool operator!=(const SubDeviceId& other) const { return id != other.id; } diff --git a/tt_metal/distributed/mesh_buffer.cpp b/tt_metal/distributed/mesh_buffer.cpp index 153e40ed3fd2..a258df9d22ae 100644 --- a/tt_metal/distributed/mesh_buffer.cpp +++ b/tt_metal/distributed/mesh_buffer.cpp @@ -59,10 +59,9 @@ std::shared_ptr MeshBuffer::create( }}, mesh_buffer_config); - // Rely on the single device allocator to provide the address for the entire mesh buffer. - // TODO: use mesh allocator, when available. + // Rely on the MeshDevice allocator to provide the address for the entire mesh buffer. std::shared_ptr backing_buffer = Buffer::create( - mesh_device->get_device(0, 0), + mesh_device, /*address=*/address.value_or(0), device_local_size, device_local_config.page_size, @@ -104,7 +103,7 @@ void MeshBuffer::allocate() { auto allocate_device_buffer_at_address = [this](const Coordinate& coord) { std::shared_ptr buffer = Buffer::create( - mesh_device_->get_device(coord.row, coord.col), + mesh_device_, address_, device_local_size_, device_local_config_.page_size, diff --git a/tt_metal/distributed/mesh_device.cpp b/tt_metal/distributed/mesh_device.cpp index b3e0d1039c31..cb1c606396d7 100644 --- a/tt_metal/distributed/mesh_device.cpp +++ b/tt_metal/distributed/mesh_device.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -15,6 +16,11 @@ #include #include #include "tt_metal/distributed/mesh_command_queue.hpp" +#include +#include +#include +#include +#include #include @@ -25,8 +31,29 @@ MeshDeviceID generate_unique_mesh_id() { static std::atomic next_id{0}; return next_id++; } + +// Helper function to verify all devices in the MeshDevice have the same value +template +void validate_devices_return_same_value( + const std::vector& devices, F&& func, const std::source_location& loc = std::source_location::current()) { + if (devices.empty()) { + TT_THROW("{} [{}:{}] failed: MeshDevice has no devices", loc.function_name(), loc.file_name(), loc.line()); + } + + auto&& reference_value = std::forward(func)(devices.front()); + if (!std::all_of(devices.begin(), devices.end(), [&](const auto& device) { + return std::forward(func)(device) == reference_value; + })) { + TT_THROW( + "{} [{}:{}] failed: Devices contained in MeshDevice returned different values", + loc.function_name(), + loc.file_name(), + loc.line()); + } } +} // namespace + MeshDevice::ScopedDevices::ScopedDevices( size_t l1_small_size, size_t trace_region_size, @@ -56,39 +83,62 @@ const std::vector& MeshDevice::ScopedDevices::get_devices() const { re uint32_t MeshDevice::build_key() const { TT_FATAL(tt::tt_metal::hal.is_coordinate_virtualization_enabled(), "MeshDevice::build_key() expects coordinate virtualization to be enabled"); + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->build_key(); }); return reference_device()->build_key(); } -uint8_t MeshDevice::num_hw_cqs() const { return reference_device()->num_hw_cqs(); } +uint8_t MeshDevice::num_hw_cqs() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->num_hw_cqs(); }); + return reference_device()->num_hw_cqs(); +} bool MeshDevice::is_initialized() const { - const auto& devices = this->get_devices(); - return std::all_of(devices.begin(), devices.end(), [](const auto& device) { return device->is_initialized(); }); + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->is_initialized(); }); + return reference_device()->is_initialized(); +} +uint32_t MeshDevice::l1_size_per_core() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->l1_size_per_core(); }); + return reference_device()->l1_size_per_core(); } -uint32_t MeshDevice::l1_size_per_core() const { return reference_device()->l1_size_per_core(); } -uint32_t MeshDevice::dram_size_per_channel() const { return reference_device()->dram_size_per_channel(); } +uint32_t MeshDevice::dram_size_per_channel() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->dram_size_per_channel(); }); + return reference_device()->dram_size_per_channel(); +} IDevice* MeshDevice::reference_device() const { return this->get_devices().at(0); } -MeshDevice::MeshDevice(std::shared_ptr mesh_handle, const MeshShape& mesh_shape, MeshType type, std::weak_ptr parent_mesh) : +MeshDevice::MeshDevice( + std::shared_ptr mesh_handle, + const MeshShape& mesh_shape, + MeshType type, + std::weak_ptr parent_mesh) : scoped_devices_(std::move(mesh_handle)), mesh_shape_(mesh_shape), type_(type), mesh_id_(generate_unique_mesh_id()), - parent_mesh_(std::move(parent_mesh)) {} + parent_mesh_(std::move(parent_mesh)) { + work_executor_ = std::make_unique(0 /* worker_core */, mesh_id_); +} std::shared_ptr MeshDevice::create( const MeshDeviceConfig& config, size_t l1_small_size, size_t trace_region_size, size_t num_command_queues, - const DispatchCoreConfig& dispatch_core_config) { + const DispatchCoreConfig& dispatch_core_config, + tt::stl::Span l1_bank_remap) { auto mesh_device = std::make_shared( std::make_shared(l1_small_size, trace_region_size, num_command_queues, dispatch_core_config, config), config.mesh_shape, config.mesh_type); - mesh_device->initialize(); + + mesh_device->initialize(num_command_queues, l1_small_size, trace_region_size, l1_bank_remap); return mesh_device; } @@ -149,14 +199,6 @@ std::vector> MeshDevice::create_submeshes(const Mesh return submeshes; } -void MeshDevice::initialize() { - view_ = std::make_unique(scoped_devices_->get_devices(), mesh_shape_); - SystemMesh::instance().register_mesh_device(shared_from_this(), this->get_devices()); - if (this->using_fast_dispatch()) { - mesh_command_queue_ = std::make_unique(this, 0); - } -} - MeshDevice::~MeshDevice() {} IDevice* MeshDevice::get_device_index(size_t device_index) const { @@ -199,11 +241,16 @@ const DeviceIds MeshDevice::get_device_ids() const { size_t MeshDevice::num_devices() const { return view_->num_devices(); } CoreCoord MeshDevice::compute_with_storage_grid_size() const { - return this->reference_device()->compute_with_storage_grid_size(); + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->compute_with_storage_grid_size(); }); + return reference_device()->compute_with_storage_grid_size(); } - -tt::ARCH MeshDevice::arch() const { return this->reference_device()->arch(); } +tt::ARCH MeshDevice::arch() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->arch(); }); + return reference_device()->arch(); +} size_t MeshDevice::num_rows() const { return mesh_shape_.num_rows; } @@ -260,6 +307,8 @@ bool MeshDevice::close() { } parent_mesh_.reset(); view_.reset(); + sub_device_manager_tracker_.reset(); + work_executor_.reset(); return true; } @@ -306,56 +355,114 @@ size_t MeshDevice::num_program_cache_entries() { return total_entries; } +SubDeviceManagerId MeshDevice::create_sub_device_manager( + tt::stl::Span sub_devices, DeviceAddr local_l1_size) { + return sub_device_manager_tracker_->create_sub_device_manager(sub_devices, local_l1_size); +} +void MeshDevice::remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id) { + sub_device_manager_tracker_->remove_sub_device_manager(sub_device_manager_id); +} +void MeshDevice::load_sub_device_manager(SubDeviceManagerId sub_device_manager_id) { + sub_device_manager_tracker_->load_sub_device_manager(sub_device_manager_id); +} +void MeshDevice::clear_loaded_sub_device_manager() { sub_device_manager_tracker_->clear_loaded_sub_device_manager(); } -SubDeviceManagerId MeshDevice::create_sub_device_manager(tt::stl::Span sub_devices, DeviceAddr local_l1_size) { return reference_device()->create_sub_device_manager(sub_devices, local_l1_size); } -void MeshDevice::remove_sub_device_manager(SubDeviceManagerId sub_device_manager_id) { return reference_device()->remove_sub_device_manager(sub_device_manager_id); } -void MeshDevice::load_sub_device_manager(SubDeviceManagerId sub_device_manager_id) { return reference_device()->load_sub_device_manager(sub_device_manager_id); } -void MeshDevice::clear_loaded_sub_device_manager() { return reference_device()->clear_loaded_sub_device_manager(); } - -std::tuple MeshDevice::create_sub_device_manager_with_fabric(tt::stl::Span sub_devices, DeviceAddr local_l1_size) { return reference_device()->create_sub_device_manager_with_fabric(sub_devices, local_l1_size); } -CoreCoord MeshDevice::dram_grid_size() const { return reference_device()->dram_grid_size(); } +std::tuple MeshDevice::create_sub_device_manager_with_fabric( + tt::stl::Span sub_devices, DeviceAddr local_l1_size) { + return sub_device_manager_tracker_->create_sub_device_manager_with_fabric(sub_devices, local_l1_size); +} +CoreCoord MeshDevice::dram_grid_size() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->dram_grid_size(); }); + return reference_device()->dram_grid_size(); +} bool MeshDevice::using_slow_dispatch() const { - const auto& devices = this->get_devices(); - TT_FATAL(devices.size() > 0, "Expected at least one device in a Mesh to use slow dispatch."); - bool first_device_dispatch = devices[0]->using_slow_dispatch(); - TT_FATAL(std::all_of( - devices.begin(), - devices.end(), - [first_device_dispatch](const IDevice* device) { - return device->using_slow_dispatch() == first_device_dispatch; - }), "Expected all devices in a Mesh to use identical dispatch modes."); - return first_device_dispatch; + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->using_slow_dispatch(); }); + return reference_device()->using_slow_dispatch(); } bool MeshDevice::using_fast_dispatch() const { - const auto& devices = this->get_devices(); - TT_FATAL(devices.size() > 0, "Expected at least one device in a Mesh to use fast dispatch."); - bool first_device_dispatch = devices[0]->using_fast_dispatch(); - TT_FATAL(std::all_of( - devices.begin(), - devices.end(), - [first_device_dispatch](const IDevice* device) { - return device->using_fast_dispatch() == first_device_dispatch; - }), "Expected all devices in a Mesh to use identical dispatch modes."); - return first_device_dispatch; + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->using_fast_dispatch(); }); + return reference_device()->using_fast_dispatch(); } // Device property methods that can be delegated to reference device -CoreCoord MeshDevice::grid_size() const { return reference_device()->grid_size(); } -CoreCoord MeshDevice::logical_grid_size() const { return reference_device()->logical_grid_size(); } -CoreType MeshDevice::core_type_from_virtual_core(const CoreCoord& virtual_coord) const { return reference_device()->core_type_from_virtual_core(virtual_coord); } -CoreCoord MeshDevice::virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const { return reference_device()->virtual_noc_coordinate(noc_index, coord); } -CoreCoord MeshDevice::virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const { return reference_device()->virtual_noc0_coordinate(noc_index, coord); } -std::vector MeshDevice::worker_cores_from_logical_cores(const std::vector& logical_cores) const { return reference_device()->worker_cores_from_logical_cores(logical_cores); } -std::vector MeshDevice::ethernet_cores_from_logical_cores(const std::vector& logical_cores) const { return reference_device()->ethernet_cores_from_logical_cores(logical_cores); } -std::vector MeshDevice::get_optimal_dram_bank_to_logical_worker_assignment() { return reference_device()->get_optimal_dram_bank_to_logical_worker_assignment(); } -CoreCoord MeshDevice::virtual_core_from_logical_core(const CoreCoord& logical_coord, const CoreType& core_type) const { return reference_device()->virtual_core_from_logical_core(logical_coord, core_type); } -CoreCoord MeshDevice::worker_core_from_logical_core(const CoreCoord& logical_core) const { return reference_device()->worker_core_from_logical_core(logical_core); } -CoreCoord MeshDevice::ethernet_core_from_logical_core(const CoreCoord& logical_core) const { return reference_device()->ethernet_core_from_logical_core(logical_core); } -CoreCoord MeshDevice::logical_core_from_ethernet_core(const CoreCoord& ethernet_core) const { return reference_device()->logical_core_from_ethernet_core(ethernet_core); } +CoreCoord MeshDevice::grid_size() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->grid_size(); }); + return reference_device()->grid_size(); +} +CoreCoord MeshDevice::logical_grid_size() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->logical_grid_size(); }); + return reference_device()->logical_grid_size(); +} +CoreType MeshDevice::core_type_from_virtual_core(const CoreCoord& virtual_coord) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [virtual_coord](const auto& device) { + return device->core_type_from_virtual_core(virtual_coord); + }); + return reference_device()->core_type_from_virtual_core(virtual_coord); +} +CoreCoord MeshDevice::virtual_noc_coordinate(uint8_t noc_index, CoreCoord coord) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [noc_index, coord](const auto& device) { + return device->virtual_noc_coordinate(noc_index, coord); + }); + return reference_device()->virtual_noc_coordinate(noc_index, coord); +} +CoreCoord MeshDevice::virtual_noc0_coordinate(uint8_t noc_index, CoreCoord coord) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [noc_index, coord](const auto& device) { + return device->virtual_noc0_coordinate(noc_index, coord); + }); + return reference_device()->virtual_noc0_coordinate(noc_index, coord); +} +std::vector MeshDevice::worker_cores_from_logical_cores(const std::vector& logical_cores) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [logical_cores](const auto& device) { + return device->worker_cores_from_logical_cores(logical_cores); + }); + return reference_device()->worker_cores_from_logical_cores(logical_cores); +} +std::vector MeshDevice::get_optimal_dram_bank_to_logical_worker_assignment() { + validate_devices_return_same_value(scoped_devices_->get_devices(), [](const auto& device) { + return device->get_optimal_dram_bank_to_logical_worker_assignment(); + }); + return reference_device()->get_optimal_dram_bank_to_logical_worker_assignment(); +} +CoreCoord MeshDevice::virtual_core_from_logical_core(const CoreCoord& logical_coord, const CoreType& core_type) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [logical_coord, core_type](const auto& device) { + return device->virtual_core_from_logical_core(logical_coord, core_type); + }); + return reference_device()->virtual_core_from_logical_core(logical_coord, core_type); +} +CoreCoord MeshDevice::worker_core_from_logical_core(const CoreCoord& logical_core) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [logical_core](const auto& device) { + return device->worker_core_from_logical_core(logical_core); + }); + return reference_device()->worker_core_from_logical_core(logical_core); +} +CoreCoord MeshDevice::logical_core_from_ethernet_core(const CoreCoord& ethernet_core) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [ethernet_core](const auto& device) { + return device->logical_core_from_ethernet_core(ethernet_core); + }); + return reference_device()->logical_core_from_ethernet_core(ethernet_core); +} // These methods require some change / or assert out for now +std::vector MeshDevice::ethernet_cores_from_logical_cores( + const std::vector& logical_cores) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [logical_cores](const auto& device) { + return device->ethernet_cores_from_logical_cores(logical_cores); + }); + return reference_device()->ethernet_cores_from_logical_cores(logical_cores); +} +CoreCoord MeshDevice::ethernet_core_from_logical_core(const CoreCoord& logical_core) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [logical_core](const auto& device) { + return device->ethernet_core_from_logical_core(logical_core); + }); + return reference_device()->ethernet_core_from_logical_core(logical_core); +} std::unordered_set MeshDevice::get_active_ethernet_cores(bool skip_reserved_tunnel_cores) const { TT_THROW("get_active_ethernet_cores() is not supported on MeshDevice - use individual devices instead"); } @@ -381,34 +488,111 @@ std::vector MeshDevice::get_ethernet_sockets(chip_id_t connected_chip } // Core and worker management methods (These are OK) -CoreRangeSet MeshDevice::worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const { return reference_device()->worker_cores(core_type, sub_device_id); } -uint32_t MeshDevice::num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const { return reference_device()->num_worker_cores(core_type, sub_device_id); } - +CoreRangeSet MeshDevice::worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const { + return sub_device_manager_tracker_->get_active_sub_device_manager()->sub_device(sub_device_id).cores(core_type); +} +uint32_t MeshDevice::num_worker_cores(HalProgrammableCoreType core_type, SubDeviceId sub_device_id) const { + return sub_device_manager_tracker_->get_active_sub_device_manager()->sub_device(sub_device_id).num_cores(core_type); +} // Bank and memory management methods int MeshDevice::num_dram_channels() const { return reference_device()->num_dram_channels() * this->num_devices(); } -uint32_t MeshDevice::num_banks(const BufferType& buffer_type) const { return reference_device()->num_banks(buffer_type); } -uint32_t MeshDevice::num_banks(const BufferType& buffer_type, SubDeviceId sub_device_id) const { return reference_device()->num_banks(buffer_type, sub_device_id); } -uint32_t MeshDevice::bank_size(const BufferType& buffer_type) const { return reference_device()->bank_size(buffer_type); } -uint32_t MeshDevice::bank_size(const BufferType& buffer_type, SubDeviceId sub_device_id) const { return reference_device()->bank_size(buffer_type, sub_device_id); } -uint32_t MeshDevice::dram_channel_from_bank_id(uint32_t bank_id) const { return reference_device()->dram_channel_from_bank_id(bank_id); } -uint32_t MeshDevice::dram_channel_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const { return reference_device()->dram_channel_from_bank_id(bank_id, sub_device_id); } -CoreCoord MeshDevice::logical_core_from_dram_channel(uint32_t dram_channel) const { return reference_device()->logical_core_from_dram_channel(dram_channel); } -uint32_t MeshDevice::dram_channel_from_logical_core(const CoreCoord& logical_core) const { return reference_device()->dram_channel_from_logical_core(logical_core); } -int32_t MeshDevice::bank_offset(BufferType buffer_type, uint32_t bank_id) const { return reference_device()->bank_offset(buffer_type, bank_id); } -int32_t MeshDevice::bank_offset(BufferType buffer_type, uint32_t bank_id, SubDeviceId sub_device_id) const { return reference_device()->bank_offset(buffer_type, bank_id, sub_device_id); } -CoreCoord MeshDevice::logical_core_from_bank_id(uint32_t bank_id) const { return reference_device()->logical_core_from_bank_id(bank_id); } -CoreCoord MeshDevice::logical_core_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const { return reference_device()->logical_core_from_bank_id(bank_id, sub_device_id); } -const std::vector& MeshDevice::bank_ids_from_dram_channel(uint32_t dram_channel) const { return reference_device()->bank_ids_from_dram_channel(dram_channel); } -const std::vector& MeshDevice::bank_ids_from_dram_channel(uint32_t dram_channel, SubDeviceId sub_device_id) const { return reference_device()->bank_ids_from_dram_channel(dram_channel, sub_device_id); } -const std::vector& MeshDevice::bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord& logical_core) const { return reference_device()->bank_ids_from_logical_core(buffer_type, logical_core); } -const std::vector& MeshDevice::bank_ids_from_logical_core(BufferType buffer_type, const CoreCoord& logical_core, SubDeviceId sub_device_id) const { return reference_device()->bank_ids_from_logical_core(buffer_type, logical_core, sub_device_id); } +uint32_t MeshDevice::num_banks(const BufferType& buffer_type) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::num_banks(*allocator, buffer_type); +} +uint32_t MeshDevice::num_banks(const BufferType& buffer_type, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::num_banks(*allocator, buffer_type); +} +uint32_t MeshDevice::bank_size(const BufferType& buffer_type) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::bank_size(*allocator, buffer_type); +} +uint32_t MeshDevice::bank_size(const BufferType& buffer_type, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::bank_size(*allocator, buffer_type); +} +uint32_t MeshDevice::dram_channel_from_bank_id(uint32_t bank_id) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::dram_channel_from_bank_id(*allocator, bank_id); +} +uint32_t MeshDevice::dram_channel_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::dram_channel_from_bank_id(*allocator, bank_id); +} +CoreCoord MeshDevice::logical_core_from_dram_channel(uint32_t dram_channel) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [dram_channel](const auto& device) { + return device->logical_core_from_dram_channel(dram_channel); + }); + return reference_device()->logical_core_from_dram_channel(dram_channel); +} +uint32_t MeshDevice::dram_channel_from_logical_core(const CoreCoord& logical_core) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [logical_core](const auto& device) { + return device->dram_channel_from_logical_core(logical_core); + }); + return reference_device()->dram_channel_from_logical_core(logical_core); +} +int32_t MeshDevice::bank_offset(BufferType buffer_type, uint32_t bank_id) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::bank_offset(*allocator, buffer_type, bank_id); +} +int32_t MeshDevice::bank_offset(BufferType buffer_type, uint32_t bank_id, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::bank_offset(*allocator, buffer_type, bank_id); +} +CoreCoord MeshDevice::logical_core_from_bank_id(uint32_t bank_id) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::logical_core_from_bank_id(*allocator, bank_id); +} + +CoreCoord MeshDevice::logical_core_from_bank_id(uint32_t bank_id, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::logical_core_from_bank_id(*allocator, bank_id); +} +const std::vector& MeshDevice::bank_ids_from_dram_channel(uint32_t dram_channel) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::bank_ids_from_dram_channel(*allocator, dram_channel); +} +const std::vector& MeshDevice::bank_ids_from_dram_channel( + uint32_t dram_channel, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::bank_ids_from_dram_channel(*allocator, dram_channel); +} +const std::vector& MeshDevice::bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::bank_ids_from_logical_core(*allocator, buffer_type, logical_core); +} +const std::vector& MeshDevice::bank_ids_from_logical_core( + BufferType buffer_type, const CoreCoord& logical_core, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::bank_ids_from_logical_core(*allocator, buffer_type, logical_core); +} // Core management and network operations -const std::set& MeshDevice::ethernet_cores() const { return reference_device()->ethernet_cores(); } -const std::set& MeshDevice::storage_only_cores() const { return reference_device()->storage_only_cores(); } -uint32_t MeshDevice::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const { return reference_device()->get_noc_unicast_encoding(noc_index, core); } -uint32_t MeshDevice::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const { return reference_device()->get_noc_multicast_encoding(noc_index, cores); } +const std::set& MeshDevice::ethernet_cores() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->ethernet_cores(); }); + return reference_device()->ethernet_cores(); +} +const std::set& MeshDevice::storage_only_cores() const { + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->storage_only_cores(); }); + return reference_device()->storage_only_cores(); +} +uint32_t MeshDevice::get_noc_unicast_encoding(uint8_t noc_index, const CoreCoord& core) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [noc_index, core](const auto& device) { + return device->get_noc_unicast_encoding(noc_index, core); + }); + return reference_device()->get_noc_unicast_encoding(noc_index, core); +} +uint32_t MeshDevice::get_noc_multicast_encoding(uint8_t noc_index, const CoreRange& cores) const { + validate_devices_return_same_value(scoped_devices_->get_devices(), [noc_index, cores](const auto& device) { + return device->get_noc_multicast_encoding(noc_index, cores); + }); + return reference_device()->get_noc_multicast_encoding(noc_index, cores); +} // Floating point and build environment const JitBuildEnv& MeshDevice::build_env() const { @@ -484,9 +668,26 @@ void MeshDevice::set_trace_buffers_size(uint32_t size) { // Dispatch and initialization bool MeshDevice::initialize(const uint8_t num_hw_cqs, size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap, bool minimal) { - TT_THROW("initialize() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->initialize(num_hw_cqs, l1_small_size, trace_region_size, l1_bank_remap, minimal); + work_executor_->initialize(); + work_executor_->set_worker_mode(WorkExecutorMode::SYNCHRONOUS); + view_ = std::make_unique(scoped_devices_->get_devices(), mesh_shape_); + SystemMesh::instance().register_mesh_device(shared_from_this(), this->get_devices()); + + // For MeshDevice, we support uniform sub-devices across all devices and we do not support ethernet subdevices. + const auto& compute_grid_size = this->compute_with_storage_grid_size(); + auto sub_devices = { + SubDevice(std::array{CoreRangeSet(CoreRange({0, 0}, {compute_grid_size.x - 1, compute_grid_size.y - 1}))})}; + + const auto& allocator = reference_device()->get_initialized_allocator(); + sub_device_manager_tracker_ = std::make_unique( + this, std::make_unique(allocator->config), sub_devices); + + if (this->using_fast_dispatch()) { + mesh_command_queue_ = std::make_unique(this, 0); + } + return true; } + void MeshDevice::build_firmware() { TT_THROW("build_firmware() is not supported on MeshDevice - use individual devices instead"); reference_device()->build_firmware(); @@ -516,32 +717,20 @@ void MeshDevice::update_dispatch_cores_for_multi_cq_eth_dispatch() { reference_device()->update_dispatch_cores_for_multi_cq_eth_dispatch(); } void MeshDevice::synchronize() { - TT_THROW("synchronize() is not supported on MeshDevice - use individual devices instead"); - reference_device()->synchronize(); -} -WorkExecutorMode MeshDevice::get_worker_mode() { - TT_THROW("get_worker_mode() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_worker_mode(); + TT_FATAL( + this->get_worker_mode() == WorkExecutorMode::SYNCHRONOUS, + "MeshDevice must be in synchronous mode to synchronize"); + this->work_executor_->synchronize(); } +WorkExecutorMode MeshDevice::get_worker_mode() { return this->work_executor_->get_worker_mode(); } void MeshDevice::set_worker_queue_mode(const WorkerQueueMode& mode) { - TT_THROW("set_worker_queue_mode() is not supported on MeshDevice - use individual devices instead"); - reference_device()->set_worker_queue_mode(mode); -} -WorkerQueueMode MeshDevice::get_worker_queue_mode() { - TT_THROW("get_worker_queue_mode() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_worker_queue_mode(); -} -bool MeshDevice::is_worker_queue_empty() const { - TT_THROW("is_worker_queue_empty() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->is_worker_queue_empty(); -} -bool MeshDevice::can_use_passthrough_scheduling() const { - TT_THROW("can_use_passthrough_scheduling() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->can_use_passthrough_scheduling(); + this->work_executor_->set_worker_queue_mode(mode); } +WorkerQueueMode MeshDevice::get_worker_queue_mode() { return this->work_executor_->get_worker_queue_mode(); } +bool MeshDevice::is_worker_queue_empty() const { return this->work_executor_->worker_queue.empty(); } +bool MeshDevice::can_use_passthrough_scheduling() const { return this->work_executor_->use_passthrough(); } void MeshDevice::push_work(std::function work, bool blocking) { - TT_THROW("push_work() is not supported on MeshDevice - use individual devices instead"); - reference_device()->push_work(std::move(work), blocking); + this->work_executor_->push_work(std::move(work), blocking); } program_cache::detail::ProgramCache& MeshDevice::get_program_cache() { return reference_device()->get_program_cache(); } HalProgrammableCoreType MeshDevice::get_programmable_core_type(CoreCoord virtual_core) const { return reference_device()->get_programmable_core_type(virtual_core); } @@ -550,67 +739,71 @@ std::vector> MeshDevice::extract_dst_no return reference_device()->extract_dst_noc_multicast_info(ranges, core_type); } bool MeshDevice::dispatch_s_enabled() const { - TT_THROW("dispatch_s_enabled() is not supported on MeshDevice - use individual devices instead"); + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->dispatch_s_enabled(); }); return reference_device()->dispatch_s_enabled(); } bool MeshDevice::distributed_dispatcher() const { - TT_THROW("distributed_dispatcher() is not supported on MeshDevice - use individual devices instead"); + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->distributed_dispatcher(); }); return reference_device()->distributed_dispatcher(); } NOC MeshDevice::dispatch_go_signal_noc() const { - TT_THROW("dispatch_go_signal_noc() is not supported on MeshDevice - use individual devices instead"); + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->dispatch_go_signal_noc(); }); return reference_device()->dispatch_go_signal_noc(); } size_t MeshDevice::get_device_kernel_defines_hash() { TT_THROW("get_device_kernel_defines_hash() is not supported on MeshDevice - use individual devices instead"); + validate_devices_return_same_value( + scoped_devices_->get_devices(), [](const auto& device) { return device->get_device_kernel_defines_hash(); }); return reference_device()->get_device_kernel_defines_hash(); } // Methods for SubDevice Management uint8_t MeshDevice::num_noc_mcast_txns(SubDeviceId sub_device_id) const { - TT_THROW("num_noc_mcast_txns() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->num_noc_mcast_txns(sub_device_id); + return sub_device_manager_tracker_->get_active_sub_device_manager()->num_noc_mcast_txns(sub_device_id); } uint8_t MeshDevice::num_noc_unicast_txns(SubDeviceId sub_device_id) const { - TT_THROW("num_noc_unicast_txns() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->num_noc_unicast_txns(sub_device_id); + return sub_device_manager_tracker_->get_active_sub_device_manager()->num_noc_unicast_txns(sub_device_id); } uint8_t MeshDevice::noc_data_start_index(SubDeviceId sub_device_id, bool mcast_data, bool unicast_data) const { - TT_THROW("noc_data_start_index() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->noc_data_start_index(sub_device_id, mcast_data, unicast_data); + if (mcast_data) { + return sub_device_manager_tracker_->get_active_sub_device_manager()->noc_mcast_data_start_index(sub_device_id); + } else if (unicast_data) { + return sub_device_manager_tracker_->get_active_sub_device_manager()->noc_unicast_data_start_index( + sub_device_id); + } else { + return 0; + } } SubDeviceManagerId MeshDevice::get_active_sub_device_manager_id() const { - TT_THROW("get_active_sub_device_manager_id() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_active_sub_device_manager_id(); + return sub_device_manager_tracker_->get_active_sub_device_manager()->id(); } SubDeviceManagerId MeshDevice::get_default_sub_device_manager_id() const { - TT_THROW("get_default_sub_device_manager_id() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_default_sub_device_manager_id(); + return sub_device_manager_tracker_->get_default_sub_device_manager()->id(); } CoreCoord MeshDevice::virtual_program_dispatch_core(uint8_t cq_id) const { - TT_THROW("virtual_program_dispatch_core() is not supported on MeshDevice - use individual devices instead"); + validate_devices_return_same_value(scoped_devices_->get_devices(), [cq_id](const auto& device) { + return device->virtual_program_dispatch_core(cq_id); + }); return reference_device()->virtual_program_dispatch_core(cq_id); } const std::vector& MeshDevice::get_sub_device_ids() const { - TT_THROW("get_sub_device_ids() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_sub_device_ids(); + return sub_device_manager_tracker_->get_active_sub_device_manager()->get_sub_device_ids(); } const std::vector& MeshDevice::get_sub_device_stall_group() const { - TT_THROW("get_sub_device_stall_group() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_sub_device_stall_group(); + return sub_device_manager_tracker_->get_active_sub_device_manager()->get_sub_device_stall_group(); } void MeshDevice::set_sub_device_stall_group(tt::stl::Span sub_device_ids) { - TT_THROW("set_sub_device_stall_group() is not supported on MeshDevice - use individual devices instead"); - this->reference_device()->set_sub_device_stall_group(sub_device_ids); + sub_device_manager_tracker_->get_active_sub_device_manager()->set_sub_device_stall_group(sub_device_ids); } void MeshDevice::reset_sub_device_stall_group() { - TT_THROW("reset_sub_device_stall_group() is not supported on MeshDevice - use individual devices instead"); - this->reference_device()->reset_sub_device_stall_group(); + sub_device_manager_tracker_->get_active_sub_device_manager()->reset_sub_device_stall_group(); } uint32_t MeshDevice::num_sub_devices() const { - TT_THROW("num_sub_devices() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->num_sub_devices(); + return sub_device_manager_tracker_->get_active_sub_device_manager()->num_sub_devices(); } uint32_t MeshDevice::get_completion_queue_reader_core() const { TT_THROW("get_completion_queue_reader_core() is not supported on MeshDevice - use individual devices instead"); @@ -627,38 +820,85 @@ std::vector> MeshDevice::get_tunnels_from_mmio() const { // Allocator methods // Memory statistics and buffer management -uint32_t MeshDevice::get_allocator_alignment() const { return reference_device()->get_allocator_alignment(); } -uint32_t MeshDevice::get_allocator_alignment(SubDeviceId sub_device_id) const { return reference_device()->get_allocator_alignment(sub_device_id); } -std::optional MeshDevice::lowest_occupied_compute_l1_address() const { return reference_device()->lowest_occupied_compute_l1_address(); } -std::optional MeshDevice::lowest_occupied_compute_l1_address(tt::stl::Span sub_device_ids) const { return reference_device()->lowest_occupied_compute_l1_address(sub_device_ids); } -size_t MeshDevice::get_l1_small_size() const { return reference_device()->get_l1_small_size(); } -size_t MeshDevice::get_l1_small_size(SubDeviceId sub_device_id) const { return reference_device()->get_l1_small_size(sub_device_id); } -const std::unordered_set& MeshDevice::get_allocated_buffers() const { return reference_device()->get_allocated_buffers(); } -const std::unordered_set& MeshDevice::get_allocated_buffers(SubDeviceId sub_device_id) const { return reference_device()->get_allocated_buffers(sub_device_id); } +uint32_t MeshDevice::get_allocator_alignment() const { + const auto& allocator = this->get_initialized_allocator(); + return allocator->config.alignment; +} +uint32_t MeshDevice::get_allocator_alignment(SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator->config.alignment; +} + +std::optional MeshDevice::lowest_occupied_compute_l1_address() const { + return sub_device_manager_tracker_->lowest_occupied_compute_l1_address(); +} + +std::optional MeshDevice::lowest_occupied_compute_l1_address( + tt::stl::Span sub_device_ids) const { + return sub_device_manager_tracker_->lowest_occupied_compute_l1_address(sub_device_ids); +} + +size_t MeshDevice::get_l1_small_size() const { + const auto& allocator = this->get_initialized_allocator(); + return allocator->config.l1_small_size; +} +size_t MeshDevice::get_l1_small_size(SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator->config.l1_small_size; +} +const std::unordered_set& MeshDevice::get_allocated_buffers() const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::get_allocated_buffers(*allocator); +} +const std::unordered_set& MeshDevice::get_allocated_buffers(SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::get_allocated_buffers(*allocator); +} allocator::Statistics MeshDevice::get_memory_allocation_statistics(const BufferType& buffer_type) const { - return this->reference_device()->get_memory_allocation_statistics(buffer_type); + const auto& allocator = this->get_initialized_allocator(); + return allocator::get_statistics(*allocator, buffer_type); } allocator::Statistics MeshDevice::get_memory_allocation_statistics( const BufferType& buffer_type, SubDeviceId sub_device_id) const { - // With current implementation, we assume that all devices have the same memory allocation statistics. - // This will be made more explicit in the future to have lock-step allocation across devices. - // Right now, we just return the statistics of the first device. - return this->reference_device()->get_memory_allocation_statistics(buffer_type, sub_device_id); -} -const std::unique_ptr& MeshDevice::get_initialized_allocator() const { return reference_device()->get_initialized_allocator(); } -const std::unique_ptr& MeshDevice::get_initialized_allocator(SubDeviceId sub_device_id) const { return reference_device()->get_initialized_allocator(sub_device_id); } -DeviceAddr MeshDevice::get_base_allocator_addr(const HalMemType& mem_type) const { return reference_device()->get_base_allocator_addr(mem_type); } -DeviceAddr MeshDevice::get_base_allocator_addr(const HalMemType& mem_type, SubDeviceId sub_device_id) const { return reference_device()->get_base_allocator_addr(mem_type, sub_device_id); } + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::get_statistics(*allocator, buffer_type); +} +const std::unique_ptr& MeshDevice::get_initialized_allocator() const { + return sub_device_manager_tracker_->get_default_sub_device_manager()->get_initialized_allocator(SubDeviceId{0}); +} +const std::unique_ptr& MeshDevice::get_initialized_allocator(SubDeviceId sub_device_id) const { + return sub_device_manager_tracker_->get_active_sub_device_manager()->get_initialized_allocator(sub_device_id); +} +DeviceAddr MeshDevice::get_base_allocator_addr(const HalMemType& mem_type) const { + const auto& allocator = this->get_initialized_allocator(); + return allocator::get_unreserved_base_address(*allocator, mem_type); +} +DeviceAddr MeshDevice::get_base_allocator_addr(const HalMemType& mem_type, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + return allocator::get_unreserved_base_address(*allocator, mem_type); +} // Buffer and memory management operations -void MeshDevice::deallocate_buffers() { reference_device()->deallocate_buffers(); } -void MeshDevice::deallocate_buffers(SubDeviceId sub_device_id) { reference_device()->deallocate_buffers(sub_device_id); } -void MeshDevice::dump_memory_blocks(const BufferType& buffer_type, std::ofstream& out) const { reference_device()->dump_memory_blocks(buffer_type, out); } -void MeshDevice::dump_memory_blocks(const BufferType& buffer_type, std::ofstream& out, SubDeviceId sub_device_id) const { reference_device()->dump_memory_blocks(buffer_type, out, sub_device_id); } - +void MeshDevice::deallocate_buffers() { + const auto& allocator = this->get_initialized_allocator(); + allocator::deallocate_buffers(*allocator); +} +void MeshDevice::deallocate_buffers(SubDeviceId sub_device_id) { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + allocator::deallocate_buffers(*allocator); +} +void MeshDevice::dump_memory_blocks(const BufferType& buffer_type, std::ofstream& out) const { + const auto& allocator = this->get_initialized_allocator(); + allocator::dump_memory_blocks(*allocator, buffer_type, out); +} +void MeshDevice::dump_memory_blocks( + const BufferType& buffer_type, std::ofstream& out, SubDeviceId sub_device_id) const { + const auto& allocator = this->get_initialized_allocator(sub_device_id); + allocator::dump_memory_blocks(*allocator, buffer_type, out); +} MemoryBlockTable MeshDevice::get_memory_block_table(const BufferType& buffer_type) const { - TT_THROW("get_memory_block_table() is not supported on MeshDevice - use individual devices instead"); - return reference_device()->get_memory_block_table(buffer_type); + const auto& allocator = this->get_initialized_allocator(); + return allocator::get_memory_block_table(*allocator, buffer_type); } MeshSubDeviceManagerId MeshDevice::mesh_create_sub_device_manager( diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index beaa2a5842d1..3de32389ce51 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -228,8 +228,20 @@ void Device::initialize_cluster() { void Device::initialize_default_sub_device_state(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap) { // Create the default sub-device manager representing the entire chip + const auto& compute_grid_size = this->compute_with_storage_grid_size(); + const auto& active_eth_cores = this->get_active_ethernet_cores(true); + std::vector active_eth_core_ranges; + active_eth_core_ranges.reserve(active_eth_cores.size()); + for (const auto& core : active_eth_cores) { + active_eth_core_ranges.emplace_back(core, core); + } + + auto sub_devices = {SubDevice(std::array{ + CoreRangeSet(CoreRange({0, 0}, {compute_grid_size.x - 1, compute_grid_size.y - 1})), + CoreRangeSet(std::move(active_eth_core_ranges))})}; + sub_device_manager_tracker_ = std::make_unique( - this, this->initialize_allocator(l1_small_size, trace_region_size, l1_bank_remap)); + this, this->initialize_allocator(l1_small_size, trace_region_size, l1_bank_remap), sub_devices); } std::unique_ptr Device::initialize_allocator(size_t l1_small_size, size_t trace_region_size, tt::stl::Span l1_bank_remap) { @@ -1444,39 +1456,11 @@ void Device::deallocate_buffers(SubDeviceId sub_device_id) { } std::optional Device::lowest_occupied_compute_l1_address() const { - // Global bank id needs to look up a bank from the compute grid (not the storage grid) - // Since banks are lockstep in an allocator it doesn't matter if the actual core matches or not - auto global_bank_id = - this->bank_ids_from_logical_core(BufferType::L1, *this->compute_cores_.begin())[0]; - const auto& allocator = this->get_initialized_allocator(); - return allocator::lowest_occupied_l1_address(*allocator, global_bank_id); + return sub_device_manager_tracker_->lowest_occupied_compute_l1_address(); } std::optional Device::lowest_occupied_compute_l1_address(tt::stl::Span sub_device_ids) const { - // Sub-device banks are currently all compute banks - // Since banks are lockstep in an allocator it doesn't matter which core is used - uint32_t sub_device_bank_id = 0; - DeviceAddr lowest_addr = std::numeric_limits::max(); - for (const auto& sub_device_id : sub_device_ids) { - const auto& allocator = - sub_device_manager_tracker_->get_active_sub_device_manager()->sub_device_allocator(sub_device_id); - if (allocator) { - auto found_addr = allocator::lowest_occupied_l1_address(*allocator, sub_device_bank_id); - if (found_addr.has_value()) { - lowest_addr = std::min(lowest_addr, *found_addr); - } - } - } - // sub-device allocators sit below global allocator. If an address is found for a sub-device, no need to check the global allocator - if (lowest_addr != std::numeric_limits::max()) { - return lowest_addr; - } else { - const auto &allocator = this->get_initialized_allocator(); - // Global bank id needs to look up a bank from the compute grid (not the storage grid) - auto global_bank_id = - this->bank_ids_from_logical_core(BufferType::L1, *this->compute_cores_.begin())[0]; - return allocator::lowest_occupied_l1_address(*allocator, global_bank_id); - } + return sub_device_manager_tracker_->lowest_occupied_compute_l1_address(sub_device_ids); } std::pair Device::build_processor_type_to_index(uint32_t programmable_core, uint32_t processor_class) const { diff --git a/tt_metal/impl/sub_device/sub_device_manager.cpp b/tt_metal/impl/sub_device/sub_device_manager.cpp index a9c101618a88..6f33a60a7afd 100644 --- a/tt_metal/impl/sub_device/sub_device_manager.cpp +++ b/tt_metal/impl/sub_device/sub_device_manager.cpp @@ -42,21 +42,14 @@ SubDeviceManager::SubDeviceManager( this->populate_noc_data(); } -SubDeviceManager::SubDeviceManager(IDevice* device, std::unique_ptr&& global_allocator) : - id_(next_sub_device_manager_id_++), device_(device) { +SubDeviceManager::SubDeviceManager( + IDevice* device, std::unique_ptr&& global_allocator, tt::stl::Span sub_devices) : + id_(next_sub_device_manager_id_++), + device_(device), + sub_devices_(sub_devices.begin(), sub_devices.end()), + local_l1_size_(0) { TT_ASSERT(device != nullptr, "Device must not be null"); - local_l1_size_ = 0; - const auto& compute_grid_size = device_->compute_with_storage_grid_size(); - const auto& active_eth_cores = device_->get_active_ethernet_cores(true); - std::vector active_eth_core_ranges; - active_eth_core_ranges.reserve(active_eth_cores.size()); - for (const auto& core : active_eth_cores) { - active_eth_core_ranges.emplace_back(core, core); - } - sub_devices_ = {SubDevice(std::array{ - CoreRangeSet(CoreRange({0, 0}, {compute_grid_size.x - 1, compute_grid_size.y - 1})), - CoreRangeSet(std::move(active_eth_core_ranges))})}; this->populate_sub_device_ids(); // No need to validate sub-devices since this constructs a sub-device of the entire grid this->populate_num_cores(); @@ -183,25 +176,30 @@ void SubDeviceManager::validate_sub_devices() const { // Validate sub device cores fit inside the device grid const auto& compute_grid_size = device_->compute_with_storage_grid_size(); CoreRange device_worker_cores = CoreRange({0, 0}, {compute_grid_size.x - 1, compute_grid_size.y - 1}); - const auto& device_eth_cores = device_->get_active_ethernet_cores(true); - for (const auto& sub_device : sub_devices_) { + + for (auto sub_device_id = SubDeviceId{0}; sub_device_id < this->num_sub_devices(); ++sub_device_id) { + const auto& sub_device = this->sub_device(sub_device_id); const auto& worker_cores = sub_device.cores(HalProgrammableCoreType::TENSIX); TT_FATAL( device_worker_cores.contains(worker_cores), "Tensix cores {} specified in sub device must be within device grid {}", worker_cores, device_worker_cores); - const auto& eth_cores = sub_device.cores(HalProgrammableCoreType::ACTIVE_ETH); - uint32_t num_eth_cores = 0; - for (const auto& dev_eth_core : device_eth_cores) { - if (eth_cores.contains(dev_eth_core)) { - num_eth_cores++; + + if (sub_device.has_core_type(HalProgrammableCoreType::ACTIVE_ETH)) { + const auto& eth_cores = sub_device.cores(HalProgrammableCoreType::ACTIVE_ETH); + uint32_t num_eth_cores = 0; + const auto& device_eth_cores = tt::Cluster::instance().get_active_ethernet_cores(device_->id()); + for (const auto& dev_eth_core : device_eth_cores) { + if (eth_cores.contains(dev_eth_core)) { + num_eth_cores++; + } } + TT_FATAL( + num_eth_cores == eth_cores.num_cores(), + "Ethernet cores {} specified in sub device must be within device grid", + eth_cores); } - TT_FATAL( - num_eth_cores == eth_cores.num_cores(), - "Ethernet cores {} specified in sub device must be within device grid", - eth_cores); } if (sub_devices_.size() < 2) { return; diff --git a/tt_metal/impl/sub_device/sub_device_manager_tracker.cpp b/tt_metal/impl/sub_device/sub_device_manager_tracker.cpp index db94b5be25a7..81344be97494 100644 --- a/tt_metal/impl/sub_device/sub_device_manager_tracker.cpp +++ b/tt_metal/impl/sub_device/sub_device_manager_tracker.cpp @@ -22,9 +22,10 @@ namespace tt::tt_metal { -SubDeviceManagerTracker::SubDeviceManagerTracker(IDevice* device, std::unique_ptr&& global_allocator) : +SubDeviceManagerTracker::SubDeviceManagerTracker( + IDevice* device, std::unique_ptr&& global_allocator, tt::stl::Span sub_devices) : device_(device) { - auto sub_device_manager = std::make_unique(device, std::move(global_allocator)); + auto sub_device_manager = std::make_unique(device, std::move(global_allocator), sub_devices); default_sub_device_manager_ = sub_device_manager.get(); active_sub_device_manager_ = default_sub_device_manager_; sub_device_managers_.insert_or_assign(sub_device_manager->id(), std::move(sub_device_manager)); @@ -123,4 +124,27 @@ SubDeviceManagerId SubDeviceManagerTracker::get_default_sub_device_manager_id() return default_sub_device_manager_->id(); } +std::optional SubDeviceManagerTracker::lowest_occupied_compute_l1_address( + tt::stl::Span sub_device_ids) const { + constexpr uint32_t global_bank_id = 0; + if (sub_device_ids.empty()) { + // Global bank id needs to look up a bank from the compute grid (not the storage grid) + // Since banks are lockstep in an allocator it doesn't matter if the actual core matches or not + const auto& default_allocator = default_sub_device_manager_->get_initialized_allocator(SubDeviceId{0}); + return allocator::lowest_occupied_l1_address(*default_allocator, global_bank_id); + } else { + DeviceAddr lowest_addr = std::numeric_limits::max(); + for (const auto& sub_device_id : sub_device_ids) { + const auto& allocator = this->get_active_sub_device_manager()->sub_device_allocator(sub_device_id); + if (allocator) { + auto found_addr = allocator::lowest_occupied_l1_address(*allocator, global_bank_id); + if (found_addr.has_value()) { + lowest_addr = std::min(lowest_addr, *found_addr); + } + } + } + return lowest_addr; + } +} + } // namespace tt::tt_metal