From e4233210efb8c6524912c4510045f3c74244c042 Mon Sep 17 00:00:00 2001 From: Almeet Bhullar Date: Mon, 20 Jan 2025 18:25:37 +0000 Subject: [PATCH] Adding active erisc FW for BH + support for compiling this + updates to BH eth_l1_address_map --- tt_metal/api/tt-metalium/build.hpp | 4 + tt_metal/hw/CMakeLists.txt | 1 + tt_metal/hw/firmware/src/active_erisc.cc | 156 ++++++++++++++++++ tt_metal/hw/firmware/src/active_erisck.cc | 38 +++++ tt_metal/hw/firmware/src/idle_erisc.cc | 5 +- tt_metal/hw/inc/blackhole/dev_mem_map.h | 14 +- .../hw/inc/blackhole/eth_l1_address_map.h | 122 ++++++-------- .../hw/inc/grayskull/eth_l1_address_map.h | 1 - tt_metal/hw/inc/wormhole/eth_l1_address_map.h | 1 - tt_metal/hw/toolchain/memory.ld | 10 ++ tt_metal/hw/toolchain/sections.ld | 2 +- tt_metal/impl/debug/dprint_server.cpp | 5 +- tt_metal/impl/device/device.cpp | 9 +- tt_metal/jit_build/build.cpp | 47 +++++- tt_metal/llrt/blackhole/bh_hal_active_eth.cpp | 11 +- 15 files changed, 336 insertions(+), 90 deletions(-) create mode 100644 tt_metal/hw/firmware/src/active_erisc.cc create mode 100644 tt_metal/hw/firmware/src/active_erisck.cc diff --git a/tt_metal/api/tt-metalium/build.hpp b/tt_metal/api/tt-metalium/build.hpp index 6b9e7f6a71f..426d7d763d3 100644 --- a/tt_metal/api/tt-metalium/build.hpp +++ b/tt_metal/api/tt-metalium/build.hpp @@ -30,6 +30,10 @@ struct JitBuiltStateConfig { int processor_id = 0; bool is_fw = false; uint32_t dispatch_message_addr = 0; + // Set `is_cooperative` when Metal FW/Kernel code is loaded on risc with some base FW running. + // In this case Metal FW will need to facilitate context switching to base FW (e.g. 
code running on WH active + // eriscs) + bool is_cooperative = false; }; // The build environment diff --git a/tt_metal/hw/CMakeLists.txt b/tt_metal/hw/CMakeLists.txt index 5b6ed87f5c2..9ba5bdbea1d 100644 --- a/tt_metal/hw/CMakeLists.txt +++ b/tt_metal/hw/CMakeLists.txt @@ -12,6 +12,7 @@ set(PROCS trisc2 ierisc slave_ierisc + aerisc ) set(TYPES firmware diff --git a/tt_metal/hw/firmware/src/active_erisc.cc b/tt_metal/hw/firmware/src/active_erisc.cc new file mode 100644 index 00000000000..2c50889f7f9 --- /dev/null +++ b/tt_metal/hw/firmware/src/active_erisc.cc @@ -0,0 +1,156 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "risc_common.h" +#include "noc.h" +#include "noc_overlay_parameters.h" +#include "ckernel_structs.h" +#include "stream_io_map.h" +#include "c_tensix_core.h" +#include "tdma_xmov.h" +#include "noc_nonblocking_api.h" +#include "firmware_common.h" +#include "tools/profiler/kernel_profiler.hpp" +#include "dev_msgs.h" +#include "risc_attribs.h" +#include "circular_buffer.h" +#include "dataflow_api.h" +#include "ethernet/dataflow_api.h" +#include "ethernet/tunneling.h" + +#include "debug/watcher_common.h" +#include "debug/waypoint.h" +#include "debug/stack_usage.h" +#include "debug/dprint.h" + +uint8_t noc_index; + +uint32_t noc_reads_num_issued[NUM_NOCS] __attribute__((used)); +uint32_t noc_nonposted_writes_num_issued[NUM_NOCS] __attribute__((used)); +uint32_t noc_nonposted_writes_acked[NUM_NOCS] __attribute__((used)); +uint32_t noc_nonposted_atomics_acked[NUM_NOCS] __attribute__((used)); +uint32_t noc_posted_writes_num_issued[NUM_NOCS] __attribute__((used)); + +uint32_t tt_l1_ptr* rta_l1_base __attribute__((used)); +uint32_t tt_l1_ptr* crta_l1_base __attribute__((used)); +uint32_t tt_l1_ptr* sem_l1_base[ProgrammableCoreType::COUNT] __attribute__((used)); + +uint8_t my_x[NUM_NOCS] __attribute__((used)); +uint8_t my_y[NUM_NOCS] __attribute__((used)); + +// These arrays 
are stored in local memory of FW, but primarily used by the kernel which shares +// FW symbols. Hence mark these as 'used' so that FW compiler doesn't optimize it out. +uint16_t dram_bank_to_noc_xy[NUM_NOCS][NUM_DRAM_BANKS] __attribute__((used)); +uint16_t l1_bank_to_noc_xy[NUM_NOCS][NUM_L1_BANKS] __attribute__((used)); +int32_t bank_to_dram_offset[NUM_DRAM_BANKS] __attribute__((used)); +int32_t bank_to_l1_offset[NUM_L1_BANKS] __attribute__((used)); + +CBInterface cb_interface[NUM_CIRCULAR_BUFFERS] __attribute__((used)); + +#if defined(PROFILE_KERNEL) +namespace kernel_profiler { +uint32_t wIndex __attribute__((used)); +uint32_t stackSize __attribute__((used)); +uint32_t sums[SUM_COUNT] __attribute__((used)); +uint32_t sumIDs[SUM_COUNT] __attribute__((used)); +} // namespace kernel_profiler +#endif + +int main() { + configure_l1_data_cache(); + DIRTY_STACK_MEMORY(); + WAYPOINT("I"); + do_crt1((uint32_t*)eth_l1_mem::address_map::MEM_ERISC_INIT_LOCAL_L1_BASE_SCRATCH); + + // put this into scratch space similar to idle erisc + noc_bank_table_init(eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH); + + risc_init(); + + mailboxes->slave_sync.all = RUN_SYNC_MSG_ALL_SLAVES_DONE; + + noc_init(MEM_NOC_ATOMIC_RET_VAL_ADDR); + for (uint32_t n = 0; n < NUM_NOCS; n++) { + noc_local_state_init(n); + } + + mailboxes->go_message.signal = RUN_MSG_DONE; + mailboxes->launch_msg_rd_ptr = 0; // Initialize the rdptr to 0 + + while (1) { + // Wait... + go_msg_t* go_msg_address = &(mailboxes->go_message); + WAYPOINT("GW"); + + uint8_t go_message_signal = RUN_MSG_DONE; + while ((go_message_signal = mailboxes->go_message.signal) != RUN_MSG_GO) { + invalidate_l1_cache(); + // While the go signal for kernel execution is not sent, check if the worker was signalled + // to reset its launch message read pointer. 
+ if (go_message_signal == RUN_MSG_RESET_READ_PTR) { + // Set the rd_ptr on workers to specified value + mailboxes->launch_msg_rd_ptr = 0; + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + mailboxes->go_message.signal = RUN_MSG_DONE; + // Notify dispatcher that this has been done + internal_::notify_dispatch_core_done(dispatch_addr); + } + } + WAYPOINT("GD"); + + { + // Only include this iteration in the device profile if the launch message is valid. This is because all + // workers get a go signal regardless of whether they're running a kernel or not. We don't want to profile + // "invalid" iterations. + DeviceZoneScopedMainN("ACTIVE-ERISC-FW"); + uint32_t launch_msg_rd_ptr = mailboxes->launch_msg_rd_ptr; + launch_msg_t* launch_msg_address = &(mailboxes->launch[launch_msg_rd_ptr]); + + DeviceZoneSetCounter(launch_msg_address->kernel_config.host_assigned_id); + + noc_index = launch_msg_address->kernel_config.brisc_noc_id; + + flush_erisc_icache(); + + enum dispatch_core_processor_masks enables = + (enum dispatch_core_processor_masks)launch_msg_address->kernel_config.enables; + + // Run the ERISC kernel, no kernel config buffer on active eth + if (enables & DISPATCH_CLASS_MASK_ETH_DM0) { + WAYPOINT("R"); + // TODO: This currently runs on second risc on active eth cores but with newer drop of syseng FW + // this will run on risc0 + int index = static_cast::type>(EthProcessorTypes::DM0); + void (*kernel_address)(uint32_t) = (void (*)(uint32_t))( + mailboxes->launch[mailboxes->launch_msg_rd_ptr].kernel_config.kernel_text_offset[index]); + (*kernel_address)((uint32_t)kernel_address); + + RECORD_STACK_USAGE(); + WAYPOINT("D"); + } + + mailboxes->go_message.signal = RUN_MSG_DONE; + + // Notify dispatcher core that it has completed + if (launch_msg_address->kernel_config.mode == DISPATCH_MODE_DEV) { + 
launch_msg_address->kernel_config.enables = 0; + uint64_t dispatch_addr = NOC_XY_ADDR( + NOC_X(mailboxes->go_message.master_x), + NOC_Y(mailboxes->go_message.master_y), + DISPATCH_MESSAGE_ADDR + mailboxes->go_message.dispatch_message_offset); + CLEAR_PREVIOUS_LAUNCH_MESSAGE_ENTRY_FOR_WATCHER(); + internal_::notify_dispatch_core_done(dispatch_addr); + mailboxes->launch_msg_rd_ptr = (launch_msg_rd_ptr + 1) & (launch_msg_buffer_num_entries - 1); + } + } + } + + return 0; +} diff --git a/tt_metal/hw/firmware/src/active_erisck.cc b/tt_metal/hw/firmware/src/active_erisck.cc new file mode 100644 index 00000000000..0e2c75d5008 --- /dev/null +++ b/tt_metal/hw/firmware/src/active_erisck.cc @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: © 2025 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include "eth_l1_address_map.h" +#include "noc_parameters.h" +#include "ethernet/dataflow_api.h" +#include "noc.h" +#include "noc_overlay_parameters.h" +#include "risc_attribs.h" +#include "tensix.h" +#include "tensix_types.h" +#include "tt_eth_api.h" +#include "c_tensix_core.h" +#include "noc_nonblocking_api.h" +#include "firmware_common.h" +#include "stream_io_map.h" +#include "tdma_xmov.h" +#include "debug/dprint.h" +#include "tools/profiler/kernel_profiler.hpp" +#include +#include + +extern uint32_t __kernel_init_local_l1_base[]; +extern uint32_t __fw_export_end_text[]; + +void kernel_launch(uint32_t kernel_base_addr) { + DeviceZoneScopedMainChildN("ACTIVE-ERISC-KERNEL"); + + extern uint32_t __kernel_init_local_l1_base[]; + extern uint32_t __fw_export_end_text[]; + do_crt1((uint32_t tt_l1_ptr*)(kernel_base_addr + (uint32_t)__kernel_init_local_l1_base - + (uint32_t)__fw_export_end_text)); + + noc_local_state_init(NOC_INDEX); + + kernel_main(); +} diff --git a/tt_metal/hw/firmware/src/idle_erisc.cc b/tt_metal/hw/firmware/src/idle_erisc.cc index a776c088933..fe762348a5a 100644 --- a/tt_metal/hw/firmware/src/idle_erisc.cc +++ b/tt_metal/hw/firmware/src/idle_erisc.cc @@ -151,9 
+151,8 @@ int main() { enum dispatch_core_processor_masks enables = (enum dispatch_core_processor_masks)launch_msg_address->kernel_config.enables; run_slave_eriscs(enables); - uint32_t kernel_config_base = firmware_config_init(mailboxes, ProgrammableCoreType::IDLE_ETH, DISPATCH_CLASS_ETH_DM0); - uint32_t tt_l1_ptr* cb_l1_base = - (uint32_t tt_l1_ptr*)(kernel_config_base + launch_msg_address->kernel_config.local_cb_offset); + uint32_t kernel_config_base = + firmware_config_init(mailboxes, ProgrammableCoreType::IDLE_ETH, DISPATCH_CLASS_ETH_DM0); // Run the ERISC kernel if (enables & DISPATCH_CLASS_MASK_ETH_DM0) { diff --git a/tt_metal/hw/inc/blackhole/dev_mem_map.h b/tt_metal/hw/inc/blackhole/dev_mem_map.h index 82de170217d..075edd005ca 100644 --- a/tt_metal/hw/inc/blackhole/dev_mem_map.h +++ b/tt_metal/hw/inc/blackhole/dev_mem_map.h @@ -115,7 +115,7 @@ #define MEM_TRISC2_STACK_BASE (MEM_LOCAL_BASE + MEM_TRISC_LOCAL_SIZE - MEM_TRISC2_STACK_SIZE) ///////////// -// IERISC memory map +// Idle ERISC memory map #define MEM_IERISC_LOCAL_SIZE (8 * 1024) #define MEM_SLAVE_IERISC_LOCAL_SIZE (8 * 1024) #define MEM_IERISC_FIRMWARE_SIZE (24 * 1024) @@ -144,6 +144,18 @@ #define IERISC_RESET_PC (MEM_LOCAL_BASE | 0x14000) #define SLAVE_IERISC_RESET_PC (MEM_LOCAL_BASE | 0x14008) +///////////// +// Active ERISC memory map +// TODO: These are added here to enable aerisc compilation but are replicated in eth_l1_address_map +// eth_l1_address_map should be removed in favour of this file +#define MEM_AERISC_MAILBOX_BASE (MEM_IERISC_RESERVED1 + MEM_IERISC_RESERVED1_SIZE) +#define MEM_AERISC_MAILBOX_END (MEM_AERISC_MAILBOX_BASE + MEM_IERISC_MAILBOX_SIZE) +#define MEM_AERISC_FIRMWARE_BASE (MEM_AERISC_MAILBOX_END) +#define MEM_AERISC_MAP_END (MEM_AERISC_FIRMWARE_BASE + MEM_IERISC_FIRMWARE_SIZE) +#define MEM_AERISC_INIT_LOCAL_L1_BASE_SCRATCH MEM_AERISC_MAP_END +#define MEM_AERISC_STACK_SIZE 1024 +#define MEM_AERISC_STACK_BASE (MEM_LOCAL_BASE + MEM_IERISC_LOCAL_SIZE - MEM_AERISC_STACK_SIZE) + 
///////////// // Padding/alignment restriction needed in linker scripts for erisc #define MEM_IERISC_KERNEL_PAD 32 diff --git a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h index 6af030f3b5e..37dd8ea87c8 100644 --- a/tt_metal/hw/inc/blackhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/blackhole/eth_l1_address_map.h @@ -6,78 +6,71 @@ #include -#include "noc/noc_parameters.h" // L1_ALIGNMENT - namespace eth_l1_mem { struct address_map { - // UMD doesn't distinguish between active/idle eth cores - // UMD needs space for l1_barrier - // active/idle eth cores have very different mem maps - // Reserve some space at the end of l1 for l1_barrier - static constexpr std::int32_t ERISC_BARRIER_SIZE = 32; - static constexpr std::int32_t MAX_SIZE = 512 * 1024 - ERISC_BARRIER_SIZE; - static constexpr std::int32_t MAX_L1_LOADING_SIZE = 1 * 512 * 1024 - ERISC_BARRIER_SIZE; - - // Sizes - static constexpr std::int32_t FIRMWARE_SIZE = 32 * 1024; - static constexpr std::int32_t COMMAND_Q_SIZE = 4 * 1024; - static constexpr std::int32_t DATA_BUFFER_SIZE_HOST = 4 * 1024; - static constexpr std::int32_t DATA_BUFFER_SIZE_ETH = 4 * 1024; - static constexpr std::int32_t DATA_BUFFER_SIZE_NOC = 16 * 1024; - static constexpr std::int32_t DATA_BUFFER_SIZE = 24 * 1024; - // Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + - // NUM_L1_BANKS) - static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_XY_SIZE = 1024; - // Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + - // NUM_L1_BANKS) - static constexpr std::int32_t ERISC_MEM_BANK_OFFSET_SIZE = 1024; - - // Kernel config buffer is WIP - // Size is presently based on the old sizes of the RTAs + CB config + Sems - static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_SIZE = 96 * 4 + 8 * 16; - - // Base addresses - static constexpr std::int32_t FIRMWARE_BASE = 0x9040; - static constexpr 
std::int32_t L1_EPOCH_Q_BASE = 0x9000; // Epoch Q start in L1. - static constexpr std::int32_t COMMAND_Q_BASE = L1_EPOCH_Q_BASE + FIRMWARE_SIZE; - static constexpr std::int32_t DATA_BUFFER_BASE = COMMAND_Q_BASE + COMMAND_Q_SIZE; - static constexpr std::int32_t TILE_HEADER_BUFFER_BASE = DATA_BUFFER_BASE + DATA_BUFFER_SIZE; - - // TT Metal Specific - static constexpr std::int32_t ERISC_FIRMWARE_SIZE = 2 * 1024; - // Total 160 * 1024 L1 starting from TILE_HEADER_BUFFER_BASE - // - 1 * 1024 misc args - // - 53 * 1024 eth app reserved buffer space - // - 106 * 1024 L1 unreserved buffer space - static constexpr std::int32_t MAX_NUM_CONCURRENT_TRANSACTIONS = 8; + // From top of L1: + // - Syseng reserves [Max L1 Eth - SYSENG_RESERVED_SIZE, Max L1 Eth) + // - L1 barrier [Max L1 Eth - SYSENG_RESERVED_SIZE - ERISC_BARRIER_SIZE, Max L1 Eth - SYSENG_RESERVED_SIZE) + // - Tunneling [Max L1 Eth - SYSENG_RESERVED_SIZE - ERISC_BARRIER_SIZE - ERISC_APP_SYNC_INFO_SIZE, Max L1 Eth - + // SYSENG_RESERVED_SIZE - ERISC_BARRIER_SIZE) + static constexpr std::int32_t SYSENG_RESERVED_SIZE = 64 * 1024; + static constexpr std::int32_t ERISC_BARRIER_SIZE = 64; static constexpr std::int32_t ERISC_APP_ROUTING_INFO_SIZE = 48; + static constexpr std::int32_t MAX_NUM_CONCURRENT_TRANSACTIONS = 8; static constexpr std::int32_t ERISC_APP_SYNC_INFO_SIZE = 160 + 16 * MAX_NUM_CONCURRENT_TRANSACTIONS; + static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2064; // aligning this to L1_ALIGNMENT + + static constexpr std::int32_t MAX_SIZE = 512 * 1024 - SYSENG_RESERVED_SIZE - ERISC_BARRIER_SIZE - + ERISC_APP_ROUTING_INFO_SIZE - ERISC_APP_SYNC_INFO_SIZE - + FABRIC_ROUTER_CONFIG_SIZE; + static constexpr std::int32_t MAX_L1_LOADING_SIZE = MAX_SIZE; + + static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = MAX_SIZE; + static constexpr std::int32_t ERISC_APP_SYNC_INFO_BASE = FABRIC_ROUTER_CONFIG_BASE + FABRIC_ROUTER_CONFIG_SIZE; + static constexpr std::int32_t ERISC_APP_ROUTING_INFO_BASE = 
ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; + static constexpr std::uint32_t ERISC_BARRIER_BASE = ERISC_APP_ROUTING_INFO_BASE + ERISC_APP_ROUTING_INFO_SIZE; - static constexpr std::int32_t ERISC_BARRIER_BASE = MAX_SIZE; - static constexpr std::int32_t ERISC_APP_ROUTING_INFO_BASE = TILE_HEADER_BUFFER_BASE; - static constexpr std::int32_t ERISC_APP_SYNC_INFO_BASE = ERISC_APP_ROUTING_INFO_BASE + ERISC_APP_ROUTING_INFO_SIZE; + static constexpr std::int32_t ERISC_FIRMWARE_SIZE = 24 * 1024; + static constexpr std::uint32_t MEM_ERISC_LOCAL_SIZE = (8 * 1024); + static constexpr std::int32_t RISC_LOCAL_MEM_BASE = + 0xFFB00000; // Actual local memory address as seen from risc firmware + // As part of the init risc firmware will copy local memory data from + // l1 locations listed above into internal local memory that starts + // at RISC_LOCAL_MEM_BASE address - static constexpr std::uint32_t ERISC_MEM_MAILBOX_BASE = ERISC_APP_SYNC_INFO_BASE + ERISC_APP_SYNC_INFO_SIZE; + static constexpr uint32_t MEM_ERISC_RESERVED1 = 0; + static constexpr uint32_t MEM_ERISC_RESERVED1_SIZE = 1024; + static constexpr std::int32_t ERISC_MEM_MAILBOX_BASE = MEM_ERISC_RESERVED1 + MEM_ERISC_RESERVED1_SIZE; static constexpr std::uint32_t ERISC_MEM_MAILBOX_SIZE = 3344; static constexpr std::uint32_t ERISC_MEM_MAILBOX_END = ERISC_MEM_MAILBOX_BASE + ERISC_MEM_MAILBOX_SIZE; - static constexpr std::int32_t ERISC_L1_KERNEL_CONFIG_BASE = ERISC_MEM_MAILBOX_END; - static constexpr std::int32_t FABRIC_ROUTER_CONFIG_BASE = - (ERISC_L1_KERNEL_CONFIG_BASE + ERISC_L1_KERNEL_CONFIG_SIZE + 31) & ~31; - static constexpr std::int32_t FABRIC_ROUTER_CONFIG_SIZE = 2056; - static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = - (FABRIC_ROUTER_CONFIG_BASE + FABRIC_ROUTER_CONFIG_SIZE + 31) & ~31; - static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_L1_LOADING_SIZE - ERISC_L1_UNRESERVED_BASE; - - static_assert((ERISC_L1_UNRESERVED_BASE % 32) == 0); - - // This scratch address is same as 
ERISC_L1_UNRESERVED_BASE, as the scratch space is used to copy data during - // runtime build, and is unused once FW copies the data to local memory during FW initialization. - static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = ERISC_L1_UNRESERVED_BASE; + + static constexpr std::int32_t FIRMWARE_BASE = ERISC_MEM_MAILBOX_END; + static constexpr std::int32_t MEM_ERISC_MAP_END = FIRMWARE_BASE + ERISC_FIRMWARE_SIZE; + + static constexpr std::uint32_t MEM_ERISC_KERNEL_SIZE = (24 * 1024); + static constexpr std::int32_t MEM_ERISC_INIT_LOCAL_L1_BASE_SCRATCH = MEM_ERISC_MAP_END; + static constexpr std::int32_t MEM_ERISC_STACK_SIZE = 1024; + static constexpr std::int32_t MEM_SLAVE_ERISC_STACK_SIZE = 1024; + static constexpr std::int32_t MEM_ERISC_STACK_BASE = + RISC_LOCAL_MEM_BASE + MEM_ERISC_LOCAL_SIZE - MEM_ERISC_STACK_SIZE; + + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SCRATCH = + MEM_ERISC_INIT_LOCAL_L1_BASE_SCRATCH + MEM_ERISC_LOCAL_SIZE; + // Memory for (dram/l1)_bank_to_noc_xy arrays, size needs to be atleast 2 * NUM_NOCS * (NUM_DRAM_BANKS + + // NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_XY_SIZE = 1024; + // Memory for bank_to_dram_offset and bank_to_l1_offset arrays, size needs to be atleast 4 * (NUM_DRAM_BANKS + + // NUM_L1_BANKS) + static constexpr std::int32_t ERISC_MEM_BANK_OFFSET_SIZE = 1024; static constexpr std::int32_t ERISC_MEM_BANK_TO_NOC_SIZE = ERISC_MEM_BANK_TO_NOC_XY_SIZE + ERISC_MEM_BANK_OFFSET_SIZE; - static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = L1_EPOCH_Q_BASE + 4; + static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = 0; // don't need this - just to get things to compile + static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = (MEM_ERISC_MAP_END + (69 * 1024) + 63) & ~63; + static constexpr std::int32_t ERISC_L1_UNRESERVED_SIZE = MAX_SIZE - ERISC_L1_UNRESERVED_BASE; + + static_assert((ERISC_L1_UNRESERVED_BASE % 64) == 0); template struct TAssertEquality { @@ -85,14 +78,7 @@ struct 
address_map { static constexpr bool _cResult = (A == B); }; - static constexpr std::int32_t RISC_LOCAL_MEM_BASE = - 0xffb00000; // Actaul local memory address as seen from risc firmware - // As part of the init risc firmware will copy local memory data from - // l1 locations listed above into internal local memory that starts - // at RISC_LOCAL_MEM_BASE address - - static constexpr std::uint32_t FW_VERSION_ADDR = 0x210; - static constexpr std::uint32_t RETRAIN_COUNT_ADDR = 0x1EDC; // Not implemented for BH yet! + static constexpr std::uint32_t RETRAIN_COUNT_ADDR = 0x1EDC; // UPDATE ADDR FOR BH! static constexpr std::uint32_t RETRAIN_FORCE_ADDR = 0x1EFC; }; } // namespace eth_l1_mem diff --git a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h index 754d5fe9da2..3ce95f1beba 100644 --- a/tt_metal/hw/inc/grayskull/eth_l1_address_map.h +++ b/tt_metal/hw/inc/grayskull/eth_l1_address_map.h @@ -28,7 +28,6 @@ struct address_map { static constexpr std::int32_t ERISC_FIRMWARE_SIZE = 16; static constexpr std::int32_t ERISC_L1_UNRESERVED_BASE = 0; static constexpr std::int32_t LAUNCH_ERISC_APP_FLAG = 0; - static constexpr std::uint32_t FW_VERSION_ADDR = 0; static constexpr std::int32_t ERISC_BARRIER_BASE = 0; static constexpr std::int32_t MAX_L1_LOADING_SIZE = 1; diff --git a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h index 78654de0eac..faa1814d985 100644 --- a/tt_metal/hw/inc/wormhole/eth_l1_address_map.h +++ b/tt_metal/hw/inc/wormhole/eth_l1_address_map.h @@ -89,7 +89,6 @@ struct address_map { // l1 locations listed above into internal local memory that starts // at RISC_LOCAL_MEM_BASE address - static constexpr std::uint32_t FW_VERSION_ADDR = 0x210; static constexpr std::uint32_t RETRAIN_COUNT_ADDR = 0x1EDC; static constexpr std::uint32_t RETRAIN_FORCE_ADDR = 0x1EFC; }; diff --git a/tt_metal/hw/toolchain/memory.ld b/tt_metal/hw/toolchain/memory.ld index af4c5fae69c..d528bd0f729 
100644 --- a/tt_metal/hw/toolchain/memory.ld +++ b/tt_metal/hw/toolchain/memory.ld @@ -40,6 +40,16 @@ MEMORY NCRISC_FIRMWARE_CODE : ORIGIN = MEM_NCRISC_FIRMWARE_BASE, LENGTH = MEM_NCRISC_KERNEL_SIZE #endif +#ifdef COMPILE_FOR_AERISC + AERISC_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_IERISC_LOCAL_SIZE - MEM_AERISC_STACK_SIZE + AERISC_STACK_MEM : ORIGIN = MEM_AERISC_STACK_BASE, LENGTH = MEM_AERISC_STACK_SIZE +#if defined(TYPE_FIRMWARE) + AERISC_FIRMWARE_CODE : ORIGIN = MEM_AERISC_FIRMWARE_BASE, LENGTH = MEM_IERISC_FIRMWARE_SIZE +#else + AERISC_FIRMWARE_CODE : ORIGIN = MEM_AERISC_FIRMWARE_BASE, LENGTH = MEM_IERISC_KERNEL_SIZE +#endif +#endif + #ifdef COMPILE_FOR_IERISC IERISC_LOCAL_DATA_MEM : ORIGIN = MEM_LOCAL_BASE, LENGTH = MEM_IERISC_LOCAL_SIZE - MEM_IERISC_STACK_SIZE IERISC_STACK_MEM : ORIGIN = MEM_IERISC_STACK_BASE, LENGTH = MEM_IERISC_STACK_SIZE diff --git a/tt_metal/hw/toolchain/sections.ld b/tt_metal/hw/toolchain/sections.ld index 2f0e0544e5f..13b40b4ecd1 100644 --- a/tt_metal/hw/toolchain/sections.ld +++ b/tt_metal/hw/toolchain/sections.ld @@ -23,7 +23,7 @@ #if defined(TYPE_FIRMWARE) /* Need separation beteen end of firmware and start of kernel, so that they don't share unflushable i$ cache lines. */ -#if defined(TARGET_IERISC) || defined(TARGET_SLAVE_IERISC) +#if defined(TARGET_IERISC) || defined(TARGET_SLAVE_IERISC) || defined(TARGET_AERISC) #define MEM_PAD MEM_IERISC_KERNEL_PAD #else #define MEM_PAD 0 diff --git a/tt_metal/impl/debug/dprint_server.cpp b/tt_metal/impl/debug/dprint_server.cpp index 0042e09efa6..13d83020c53 100644 --- a/tt_metal/impl/debug/dprint_server.cpp +++ b/tt_metal/impl/debug/dprint_server.cpp @@ -60,8 +60,9 @@ static inline float bfloat16_to_float(uint16_t bfloat_val) { static string GetRiscName(CoreType core_type, int hart_id, bool abbreviated = false) { if (core_type == CoreType::ETH) { switch (hart_id) { - case DPRINT_RISCV_INDEX_ER: - return abbreviated ? 
"ER" : "ERISC"; + case DPRINT_RISCV_INDEX_ER: return abbreviated ? "ER" : "ERISC"; + case DPRINT_RISCV_INDEX_ER1: + return abbreviated ? "ER1" : "ERISC1"; // Default case falls through and handled at end. } } else { diff --git a/tt_metal/impl/device/device.cpp b/tt_metal/impl/device/device.cpp index 706df4cf4bf..ed914e9d410 100644 --- a/tt_metal/impl/device/device.cpp +++ b/tt_metal/impl/device/device.cpp @@ -388,8 +388,15 @@ void Device::initialize_build() { break; } case HalProgrammableCoreType::ACTIVE_ETH: { + // Cooperative means active erisc FW needs to context switch to base FW + bool is_cooperative = this->arch() == ARCH::WORMHOLE_B0; build_states[index] = std::make_shared( - this->build_env_, JitBuiltStateConfig{.processor_id = processor_class, .is_fw=is_fw, .dispatch_message_addr=dispatch_message_addr}); + this->build_env_, + JitBuiltStateConfig{ + .processor_id = processor_class, + .is_fw = is_fw, + .dispatch_message_addr = dispatch_message_addr, + .is_cooperative = is_cooperative}); break; } case HalProgrammableCoreType::IDLE_ETH: { diff --git a/tt_metal/jit_build/build.cpp b/tt_metal/jit_build/build.cpp index 73bbe4e3d83..dbd24bf5478 100644 --- a/tt_metal/jit_build/build.cpp +++ b/tt_metal/jit_build/build.cpp @@ -422,7 +422,7 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit this->includes_ = env_.includes_ + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + "/metal/common " + "-I " + env_.root_ + "tt_metal/hw/ckernels/" + env.arch_name_ + - "/metal/llk_io "; + "/metal/llk_io " + "-I " + env_.root_ + "tt_metal/hw/inc/ethernet "; this->defines_ = env_.defines_; uint32_t l1_cache_disable_mask = tt::llrt::RunTimeOptions::get_instance().get_feature_riscv_mask( @@ -431,19 +431,49 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit this->defines_ += "-DDISABLE_L1_DATA_CACHE "; } - switch (this->core_id_) { + // 0: core_id = 0 and not cooperative + // 1: core_id = 0 and cooperative 
+ uint32_t build_class = (this->core_id_ << 1) | uint32_t(build_config.is_cooperative); + + switch (build_class) { case 0: { - this->target_name_ = "erisc"; - this->cflags_ = env_.cflags_ + "-Os -fno-delete-null-pointer-checks "; + this->target_name_ = "active_erisc"; + this->cflags_ = + env_.cflags_ + "-Os " + "-fno-tree-loop-distribute-patterns "; // don't use memcpy for cpy loops this->defines_ += "-DCOMPILE_FOR_ERISC " "-DERISC " "-DRISC_B0_HW "; + + this->includes_ += "-I " + env_.root_ + "tt_metal/hw/firmware/src "; + + if (this->is_fw_) { + this->srcs_.push_back("tt_metal/hw/firmware/src/active_erisc.cc"); + } else { + this->srcs_.push_back("tt_metal/hw/firmware/src/active_erisck.cc"); + } + this->lflags_ = env_.lflags_ + "-Os "; + if (this->is_fw_) { - this->defines_ += "-DLOADING_NOC=0 "; + this->lflags_ += + "-T" + env_.root_ + "runtime/hw/toolchain/" + get_alias(env_.arch_) + "/firmware_aerisc.ld "; + } else { + this->lflags_ += + "-T" + env_.root_ + "runtime/hw/toolchain/" + get_alias(env_.arch_) + "/kernel_aerisc.ld "; } + break; + } + case 1: { + this->target_name_ = "erisc"; + this->cflags_ = env_.cflags_ + "-Os -fno-delete-null-pointer-checks "; + + this->defines_ += + "-DCOMPILE_FOR_ERISC " + "-DERISC " + "-DRISC_B0_HW "; + this->includes_ += "-I " + env_.root_ + "tt_metal/hw/inc/ethernet "; if (this->is_fw_) { @@ -466,9 +496,14 @@ JitBuildActiveEthernet::JitBuildActiveEthernet(const JitBuildEnv& env, const Jit "/tt_metal/hw/toolchain " "-T" + env_.root_ + linker_str; + break; } - default: TT_THROW("Invalid processor ID {} for Active Ethernet core.", this->core_id_); + default: + TT_THROW( + "Invalid processor ID {} and cooperative scheme {} for Active Ethernet core.", + this->core_id_, + build_config.is_cooperative); } this->process_defines_at_compile = true; diff --git a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp index 97f97e9c5fe..46bd70b7632 100644 --- 
a/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp +++ b/tt_metal/llrt/blackhole/bh_hal_active_eth.cpp @@ -17,6 +17,8 @@ #include "hal_asserts.hpp" #include "blackhole/bh_hal.hpp" +#include "hostdevcommon/common_runtime_address_map.h" // L1_KERNEL_CONFIG_SIZE + #include "umd/device/tt_soc_descriptor.h" // CoreType #define GET_ETH_MAILBOX_ADDRESS_HOST(x) \ @@ -37,7 +39,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_bases[static_cast(HalL1MemAddrType::DPRINT)] = GET_ETH_MAILBOX_ADDRESS_HOST(dprint_buf); mem_map_bases[static_cast(HalL1MemAddrType::PROFILER)] = GET_ETH_MAILBOX_ADDRESS_HOST(profiler); mem_map_bases[static_cast(HalL1MemAddrType::KERNEL_CONFIG)] = - eth_l1_mem::address_map::ERISC_L1_KERNEL_CONFIG_BASE; + eth_l1_mem::address_map::MEM_ERISC_MAP_END; mem_map_bases[static_cast(HalL1MemAddrType::UNRESERVED)] = eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; mem_map_bases[static_cast(HalL1MemAddrType::CORE_INFO)] = GET_ETH_MAILBOX_ADDRESS_HOST(core_info); @@ -48,8 +50,6 @@ HalCoreInfoType create_active_eth_mem_map() { eth_l1_mem::address_map::ERISC_MEM_BANK_TO_NOC_SCRATCH; mem_map_bases[static_cast(HalL1MemAddrType::APP_SYNC_INFO)] = eth_l1_mem::address_map::ERISC_APP_SYNC_INFO_BASE; - mem_map_bases[static_cast(HalL1MemAddrType::TILE_HEADER_BUFFER)] = - eth_l1_mem::address_map::TILE_HEADER_BUFFER_BASE; mem_map_bases[static_cast(HalL1MemAddrType::APP_ROUTING_INFO)] = eth_l1_mem::address_map::ERISC_APP_ROUTING_INFO_BASE; mem_map_bases[static_cast(HalL1MemAddrType::RETRAIN_COUNT)] = @@ -69,7 +69,7 @@ HalCoreInfoType create_active_eth_mem_map() { mem_map_sizes[static_cast(HalL1MemAddrType::DPRINT)] = sizeof(dprint_buf_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::PROFILER)] = sizeof(profiler_msg_t); mem_map_sizes[static_cast(HalL1MemAddrType::KERNEL_CONFIG)] = - eth_l1_mem::address_map::ERISC_L1_KERNEL_CONFIG_SIZE; + L1_KERNEL_CONFIG_SIZE; // TODO: this is wrong, need eth specific value. 
For now use same value as idle eth mem_map_sizes[static_cast(HalL1MemAddrType::UNRESERVED)] = eth_l1_mem::address_map::MAX_SIZE - eth_l1_mem::address_map::ERISC_L1_UNRESERVED_BASE; mem_map_sizes[static_cast(HalL1MemAddrType::GO_MSG)] = sizeof(go_msg_t); @@ -91,8 +91,7 @@ HalCoreInfoType create_active_eth_mem_map() { // BH active ethernet runs idle erisc FW on the second ethernet processor_types[0] = HalJitBuildConfig{ .fw_base_addr = eth_l1_mem::address_map::FIRMWARE_BASE, - .local_init_addr = eth_l1_mem::address_map::FIRMWARE_BASE, // this will be uplifted in subsequent commits - // enabling active erisc + .local_init_addr = eth_l1_mem::address_map::MEM_ERISC_INIT_LOCAL_L1_BASE_SCRATCH, .fw_launch_addr = SLAVE_IERISC_RESET_PC, .fw_launch_addr_value = (uint32_t)eth_l1_mem::address_map::FIRMWARE_BASE, };