From 92407daf9e3f5a5c7c56595ec57e225fda77cc06 Mon Sep 17 00:00:00 2001 From: Kyle Mabee Date: Mon, 6 Jan 2025 00:24:42 +0000 Subject: [PATCH] LightMetal - Add test_lightmetal_sanity.cpp for dedicated light-metal-binary trace + replay testing - Add ThreeRISCDataMovementComputeSanity new test - Add 2x Metal Trace tests to test_lightmetal_sanity.cpp (single op, two op). Just creates metal traces, does not explicitly run them during capture, only via lightmetal replay from binary. --- tests/tt_metal/tt_metal/CMakeLists.txt | 2 + .../tt_metal/lightmetal/CMakeLists.txt | 25 ++ .../lightmetal/lightmetal_fixture.hpp | 80 ++++ .../lightmetal/test_lightmetal_sanity.cpp | 395 ++++++++++++++++++ 4 files changed, 502 insertions(+) create mode 100644 tests/tt_metal/tt_metal/lightmetal/CMakeLists.txt create mode 100644 tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp create mode 100644 tests/tt_metal/tt_metal/lightmetal/test_lightmetal_sanity.cpp diff --git a/tests/tt_metal/tt_metal/CMakeLists.txt b/tests/tt_metal/tt_metal/CMakeLists.txt index 0d515c709595..0b2a3f68cb14 100644 --- a/tests/tt_metal/tt_metal/CMakeLists.txt +++ b/tests/tt_metal/tt_metal/CMakeLists.txt @@ -68,6 +68,7 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/llk) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/perf_microbenchmark) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/stl) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/noc) +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/lightmetal) add_custom_target( metal_tests @@ -83,4 +84,5 @@ add_custom_target( unit_tests_llk unit_tests_stl unit_tests_noc + unit_tests_lightmetal ) diff --git a/tests/tt_metal/tt_metal/lightmetal/CMakeLists.txt b/tests/tt_metal/tt_metal/lightmetal/CMakeLists.txt new file mode 100644 index 000000000000..c130747c0d93 --- /dev/null +++ b/tests/tt_metal/tt_metal/lightmetal/CMakeLists.txt @@ -0,0 +1,25 @@ +set(UNIT_TESTS_LIGHTMETAL_SRC ${CMAKE_CURRENT_SOURCE_DIR}/test_lightmetal_sanity.cpp) + +add_executable(unit_tests_lightmetal ${UNIT_TESTS_LIGHTMETAL_SRC}) +TT_ENABLE_UNITY_BUILD(unit_tests_lightmetal) + +target_link_libraries(unit_tests_lightmetal PUBLIC test_metal_common_libs) + +target_include_directories( + unit_tests_lightmetal + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/tt_metal + ${PROJECT_SOURCE_DIR}/tt_metal/common + ${PROJECT_SOURCE_DIR}/tests + ${PROJECT_SOURCE_DIR}/tests/tt_metal/tt_metal/common + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/common +) + +set_target_properties( + unit_tests_lightmetal + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY + ${PROJECT_BINARY_DIR}/test/tt_metal +) diff --git a/tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp b/tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp new file mode 100644 index 000000000000..975195e748c7 --- /dev/null +++ b/tests/tt_metal/tt_metal/lightmetal/lightmetal_fixture.hpp @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +// #include "command_queue_fixture.hpp" +#include "dispatch_fixture.hpp" +#include "impl/device/device.hpp" +#include "llrt/hal.hpp" +#include "tt_metal/host_api.hpp" +#include "tt_metal/detail/tt_metal.hpp" +#include "tt_metal/hw/inc/circular_buffer_constants.h" +#include "tt_metal/impl/kernels/kernel.hpp" +#include "tt_metal/common/tt_backend_api_types.hpp" +#include "tt_metal/impl/lightmetal/lightmetal_replay.hpp" +#include "command_queue_fixture.hpp" + +class SingleDeviceLightMetalFixture : public CommandQueueFixture { +protected: + bool replay_binary_; + std::string trace_bin_path_; + bool write_bin_to_disk_; + + void SetUp() override { + this->validate_dispatch_mode(); + this->arch_ = tt::get_arch_from_string(tt::test_utils::get_umd_arch_name()); + } + + void CreateDevice( + const size_t trace_region_size, const bool replay_binary = false, const std::string trace_bin_path = "") { + // Skip writing to disk by default, unless user sets env var for local testing + write_bin_to_disk_ = tt::parse_env("LIGHTMETAL_SAVE_BINARY", false); + + // If user didn't provide a specific trace bin path, set a default here based on test name + if (trace_bin_path == "") { + const auto test_info = ::testing::UnitTest::GetInstance()->current_test_info(); + auto trace_filename = test_info ? std::string(test_info->name()) + ".bin" : "lightmetal_trace.bin"; + this->trace_bin_path_ = "/tmp/" + trace_filename; + } + + this->create_device(trace_region_size); + this->replay_binary_ = replay_binary || tt::parse_env("LIGHTMETAL_RUN", false); + LightMetalBeginCapture(this->device_); + } + + // End light metal tracing, write to optional filename and optionally run from binary blob + void TearDown() override { + auto blob = LightMetalEndCapture(this->device_); + if (write_bin_to_disk_ && !this->trace_bin_path_.empty()) { + WriteBlobToFile(this->trace_bin_path_, blob); + } + + if (!this->IsSlowDispatch()) { + tt::tt_metal::CloseDevice(this->device_); + } + + if (replay_binary_) { + RunLightMetalBinary(blob); + } + } + + // Just write, limited error checking. + bool WriteBlobToFile(const std::string& filename, const std::vector& blob) { + log_info(tt::LogTest, "Writing light metal binary blob of {} bytes to file: {}", blob.size(), filename); + std::ofstream outFile(filename, std::ios::binary); + outFile.write(reinterpret_cast(blob.data()), blob.size()); + return outFile.good(); + } + + // Mimic the light-metal standalone run replay tool by executing the binary. + void RunLightMetalBinary(std::vector& blob) { + tt::tt_metal::LightMetalReplay lm_replay(std::move(blob)); + if (!lm_replay.ExecuteLightMetalBinary()) { + tt::log_fatal("Light Metal Binary failed to execute or encountered errors."); + } else { + log_info(tt::LogMetalTrace, "Light Metal Binary executed successfully!"); + } + } +}; diff --git a/tests/tt_metal/tt_metal/lightmetal/test_lightmetal_sanity.cpp b/tests/tt_metal/tt_metal/lightmetal/test_lightmetal_sanity.cpp new file mode 100644 index 000000000000..81a361a7b45a --- /dev/null +++ b/tests/tt_metal/tt_metal/lightmetal/test_lightmetal_sanity.cpp @@ -0,0 +1,395 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +#include "lightmetal_fixture.hpp" +// #include "dispatch_test_utils.hpp" +#include "detail/tt_metal.hpp" +#include "tt_metal/common/env_lib.hpp" +#include "gtest/gtest.h" +#include "tt_metal/impl/allocator/allocator.hpp" +#include "tt_metal/impl/program/program.hpp" +#include "tt_metal/impl/device/device.hpp" +#include "tt_metal/impl/dispatch/command_queue.hpp" +#include "tt_metal/common/logger.hpp" +#include "tt_metal/common/scoped_timer.hpp" +#include "tt_metal/host_api.hpp" + +using std::vector; +using namespace tt; +using namespace tt::tt_metal; + +namespace lightmetal_test_helpers { + +// Single RISC, no CB's here. Very simple. +Program create_simple_datamovement_program(Buffer& input, Buffer& output, Buffer& l1_buffer) { + Program program = CreateProgram(); + IDevice* device = input.device(); + constexpr CoreCoord core = {0, 0}; + + KernelHandle dram_copy_kernel_id = CreateKernel( + program, + "tt_metal/programming_examples/loopback/kernels/loopback_dram_copy.cpp", + core, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + // Since all interleaved buffers have size == page_size, they are entirely contained in the first DRAM bank + const uint32_t input_bank_id = 0; + const uint32_t output_bank_id = 0; + + // Handle Runtime Args + const std::vector runtime_args = { + l1_buffer.address(), input.address(), input_bank_id, output.address(), output_bank_id, l1_buffer.size()}; + + // Note - this interface doesn't take Buffer, just data. + SetRuntimeArgs(program, dram_copy_kernel_id, core, runtime_args); + + return program; +} + +// Copied from test_EnqueueTrace.cpp +Program create_simple_unary_program(Buffer& input, Buffer& output) { + Program program = CreateProgram(); + IDevice* device = input.device(); + CoreCoord worker = {0, 0}; + auto reader_kernel = CreateKernel( + program, + "tt_metal/kernels/dataflow/reader_unary.cpp", + worker, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_1, .noc = NOC::RISCV_1_default}); + + auto writer_kernel = CreateKernel( + program, + "tt_metal/kernels/dataflow/writer_unary.cpp", + worker, + DataMovementConfig{.processor = DataMovementProcessor::RISCV_0, .noc = NOC::RISCV_0_default}); + + auto sfpu_kernel = CreateKernel( + program, + "tt_metal/kernels/compute/eltwise_sfpu.cpp", + worker, + ComputeConfig{ + .math_approx_mode = true, + .compile_args = {1, 1}, + .defines = {{"SFPU_OP_EXP_INCLUDE", "1"}, {"SFPU_OP_CHAIN_0", "exp_tile_init(); exp_tile(0);"}}}); + + CircularBufferConfig input_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_0, tt::DataFormat::Float16_b}}) + .set_page_size(tt::CBIndex::c_0, 2048); + + CoreRange core_range({0, 0}); + CreateCircularBuffer(program, core_range, input_cb_config); + std::shared_ptr writer_runtime_args = std::make_shared(); + std::shared_ptr reader_runtime_args = std::make_shared(); + + *writer_runtime_args = {&output, (uint32_t)0, output.num_pages()}; + + *reader_runtime_args = {&input, (uint32_t)0, input.num_pages()}; + + SetRuntimeArgs(device, detail::GetKernel(program, writer_kernel), worker, writer_runtime_args); + SetRuntimeArgs(device, detail::GetKernel(program, reader_kernel), worker, reader_runtime_args); + + CircularBufferConfig output_cb_config = CircularBufferConfig(2048, {{tt::CBIndex::c_16, tt::DataFormat::Float16_b}}) + .set_page_size(tt::CBIndex::c_16, 2048); + + CreateCircularBuffer(program, core_range, output_cb_config); + return program; +} + +} // namespace lightmetal_test_helpers + +namespace lightmetal_basic_tests { + +constexpr bool kBlocking = true; +constexpr bool kNonBlocking = false; +vector blocking_flags = {kBlocking, kNonBlocking}; + +// Test that create buffer, write, readback, and verify works when traced + replayed. +TEST_F(SingleDeviceLightMetalFixture, CreateBufferEnqueueWriteRead_Sanity) { + CreateDevice(2048); + + CommandQueue& command_queue = this->device_->command_queue(); + uint32_t num_loops = parse_env("NUM_LOOPS", 1); + bool keep_buffers_alive = std::getenv("KEEP_BUFFERS_ALIVE"); // For testing, keep buffers alive for longer. + std::vector> buffers_vec; + + for (uint32_t loop_idx = 0; loop_idx < num_loops; loop_idx++) { + log_debug(tt::LogTest, "Running loop: {}", loop_idx); + + // Switch to use top level CreateBuffer API that has trace support. + uint32_t size_bytes = 64; // 16 elements. + auto buffer = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + log_debug( + tt::LogTest, + "created buffer loop: {} with size: {} bytes addr: 0x{:x}", + loop_idx, + buffer->size(), + buffer->address()); + + if (keep_buffers_alive) { + buffers_vec.push_back(buffer); + } + + // We don't want to capture inputs in binary, but do it to start for testing. + uint32_t start_val = loop_idx * 100; + vector input_data(buffer->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = start_val + i; + } + log_debug(tt::LogTest, "initialize input_data with {} elements start_val: {}", input_data.size(), start_val); + + vector readback_data; + readback_data.resize(input_data.size()); // This is required. + + // Write data to buffer, then readback and verify. + EnqueueWriteBuffer(command_queue, *buffer, input_data.data(), true); + EnqueueReadBuffer(command_queue, *buffer, readback_data.data(), true); + EXPECT_TRUE(input_data == readback_data); + + // For dev/debug go ahead and print the results. Had a replay bug, was seeing wrong data. + for (size_t i = 0; i < readback_data.size(); i++) { + log_debug(tt::LogMetalTrace, "loop: {} rd_data i: {:3d} => data: {}", loop_idx, i, readback_data[i]); + } + } + + Finish(command_queue); +} + +// Test simple case of single datamovement program on single RISC works for trace + replay. +TEST_F(SingleDeviceLightMetalFixture, SingleRISCDataMovementSanity) { + CreateDevice(2048); + + uint32_t size_bytes = 64; // 16 elements. + auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto l1_buffer = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::L1}); + log_debug( + tt::LogTest, + "Created 3 Buffers. input: 0x{:x} output: 0x{:x} l1_buffer: 0x{:x}", + input->address(), + output->address(), + l1_buffer->address()); + + CommandQueue& command_queue = this->device_->command_queue(); + + Program simple_program = lightmetal_test_helpers::create_simple_datamovement_program(*input, *output, *l1_buffer); + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + vector eager_output_data; + eager_output_data.resize(input_data.size()); + + // Write data to buffer, enqueue program, then readback and verify. + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); + EnqueueProgram(command_queue, simple_program, true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); + EXPECT_TRUE(eager_output_data == input_data); + + // For dev/debug go ahead and print the results + for (size_t i = 0; i < eager_output_data.size(); i++) { + log_debug(tt::LogMetalTrace, "i: {:3d} input: {} output: {}", i, input_data[i], eager_output_data[i]); + } + + Finish(command_queue); +} + +// Test simple case of 3 riscs used for datamovement and compute works for trace + replay. +TEST_F(SingleDeviceLightMetalFixture, ThreeRISCDataMovementComputeSanity) { + CreateDevice(2048); + + uint32_t size_bytes = 64; // 16 elements. + auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + + CommandQueue& command_queue = this->device_->command_queue(); + + // TODO (kmabee) - There is issue with using make_shared, revisit this. + // auto simple_program = std::make_shared(lightmetal_test_helpers::create_simple_unary_program(*input, + // *output)); + auto simple_program = lightmetal_test_helpers::create_simple_unary_program(*input, *output); + + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + vector eager_output_data; + eager_output_data.resize(input_data.size()); + + // Write data to buffer, enqueue program, then readback and verify. + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); + EnqueueProgram(command_queue, simple_program, true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); + + // For dev/debug go ahead and print the results + for (size_t i = 0; i < eager_output_data.size(); i++) { + log_debug(tt::LogMetalTrace, "i: {:3d} input: {} output: {}", i, input_data[i], eager_output_data[i]); + } + + Finish(command_queue); +} + +// Test simple compute test with metal trace, but no explicit trace replay (added automatically by light metal trace). +TEST_F(SingleDeviceLightMetalFixture, SingleProgramTraceCapture) { + CreateDevice(2048); + + // Must use CreateBuffer not Buffer::create() + uint32_t size_bytes = 64; // 16 elements. Was 2048 in original test. + auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + + CommandQueue& command_queue = this->device_->command_queue(); + + Program simple_program = lightmetal_test_helpers::create_simple_unary_program(*input, *output); + + // Setup input data for program with some simple values. + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + vector eager_output_data, trace_output_data; + eager_output_data.resize(input_data.size()); + trace_output_data.resize(input_data.size()); + + // Initial run w/o trace. Preloads binary cache, and captures golden output. + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); + EnqueueProgram(command_queue, simple_program, true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); + + // Write junk to output buffer to help make sure trace run from standalone binary works. + bool debug_write_junk_to_output_buffer = true; + if (debug_write_junk_to_output_buffer) { + vector dummy_data(input->size() / sizeof(uint32_t), 0xDEADBEEF); + EnqueueWriteBuffer(command_queue, *output, dummy_data.data(), true); + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); + for (size_t i = 0; i < trace_output_data.size(); i++) { + log_debug( + tt::LogMetalTrace, + "i: {:3d} input: {:x} output: {:x} after writing dummy data", + i, + input_data[i], + trace_output_data[i]); + } + } + + // Now enable Metal Trace and run program again for capture. + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, simple_program, false); + EndTraceCapture(this->device_, command_queue.id(), tid); + + // Note: Purposely avoiding EnqueueTrace/ReplayTrace here. The trace will be + // automatically be replayed from LightMetal Binary w/ LoadTrace+ReplayTrace. + const bool debug_run_trace_during_capture = parse_env("RUN_TRACE_DURING_CAPTURE", false); + if (debug_run_trace_during_capture) { + EnqueueTrace(command_queue, tid, true); + } + + // Read the output buffer. Without actually enqueue/replaying the trace in this test during + // capture, cannot compare to eager/golde output, only visual check during standalone playback. + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); + if (debug_run_trace_during_capture) { + log_debug(tt::LogTest, "Comparing eager_output_data == trace_output_data"); + EXPECT_TRUE(eager_output_data == trace_output_data); + } + + // For dev/debug go ahead and print the results + for (size_t i = 0; i < trace_output_data.size(); i++) { + log_debug(tt::LogMetalTrace, "i: {:3d} input: {:x} trace_output: {:x}", i, input_data[i], trace_output_data[i]); + } + + // Done + Finish(command_queue); + ReleaseTrace(this->device_, tid); +} + +// Test simple compute test with metal trace, but no explicit trace replay (added automatically by light metal trace). +TEST_F(SingleDeviceLightMetalFixture, TwoProgramTraceCapture) { + CreateDevice(2048); + + // Must use CreateBuffer not Buffer::create() + uint32_t size_bytes = 64; // 16 elements. Was 2048 in original test. + auto input = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto interm = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + auto output = CreateBuffer(InterleavedBufferConfig{this->device_, size_bytes, size_bytes, BufferType::DRAM}); + + CommandQueue& command_queue = this->device_->command_queue(); + + Program op0 = lightmetal_test_helpers::create_simple_unary_program(*input, *interm); + Program op1 = lightmetal_test_helpers::create_simple_unary_program(*interm, *output); + + // Setup input data for program with some simple values. + vector input_data(input->size() / sizeof(uint32_t), 0); + for (uint32_t i = 0; i < input_data.size(); i++) { + input_data[i] = i; + } + + vector eager_output_data, trace_output_data; + eager_output_data.resize(input_data.size()); + trace_output_data.resize(input_data.size()); + + // Initial run w/o trace. Preloads binary cache, and captures golden output. + EnqueueWriteBuffer(command_queue, *input, input_data.data(), true); + EnqueueProgram(command_queue, op0, true); + EnqueueProgram(command_queue, op1, true); + EnqueueReadBuffer(command_queue, *output, eager_output_data.data(), true); + Finish(command_queue); + + // Write junk to output buffer to help make sure trace run from standalone binary works. + bool debug_write_junk_to_output_buffer = true; + if (debug_write_junk_to_output_buffer) { + vector dummy_data(input->size() / sizeof(uint32_t), 0xDEADBEEF); + EnqueueWriteBuffer(command_queue, *output, dummy_data.data(), true); + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); + for (size_t i = 0; i < trace_output_data.size(); i++) { + log_debug( + tt::LogMetalTrace, + "i: {:3d} input: {:x} output: {:x} after writing dummy data", + i, + input_data[i], + trace_output_data[i]); + } + } + + // Now enable Metal Trace and run program again for capture. + uint32_t tid = BeginTraceCapture(this->device_, command_queue.id()); + EnqueueProgram(command_queue, op0, false); + EnqueueProgram(command_queue, op1, false); + EndTraceCapture(this->device_, command_queue.id(), tid); + + // Note: Purposely avoiding EnqueueTrace/ReplayTrace here. The trace will be + // automatically be replayed from LightMetal Binary w/ LoadTrace+ReplayTrace. + const bool debug_run_trace_during_capture = parse_env("RUN_TRACE_DURING_CAPTURE", false); + if (debug_run_trace_during_capture) { + EnqueueTrace(command_queue, tid, true); + } + + // Read the output buffer. Without actually enqueue/replaying the trace in this test during + // capture, cannot compare to eager/golde output, only visual check during standalone playback. + EnqueueReadBuffer(command_queue, *output, trace_output_data.data(), true); + if (debug_run_trace_during_capture) { + EXPECT_TRUE(eager_output_data == trace_output_data); + } + + // For dev/debug go ahead and print the results + for (size_t i = 0; i < trace_output_data.size(); i++) { + log_debug( + tt::LogMetalTrace, + "i: {:3d} input: {:x} eager_output_data: {:x} trace_output: {:x}", + i, + input_data[i], + eager_output_data[i], + trace_output_data[i]); + } + + // Done + Finish(command_queue); + ReleaseTrace(this->device_, tid); +} + +} // namespace lightmetal_basic_tests