Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replacing get_shape() #16786

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this file changed?

Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0
#include "debug/dprint.h"
#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/moreh_common.hpp"

namespace NAMESPACE {
// Compute kernel for moreh_abs_pow: per tile, computes |x| (optionally masking
// the ragged last column tile), then delegates |x|^p to power_tile_to_cb.
void MAIN {
    int i{0};
    const auto num_rows_per_core = get_arg_val<uint32_t>(i++);  // rows of tiles assigned to this core
    const auto Wt = get_arg_val<uint32_t>(i++);                 // tiles per row
    const auto origin_w = get_arg_val<uint32_t>(i++);           // un-padded width in elements
    const auto p = get_arg_val<uint32_t>(i++);                  // |floor(p)| of the exponent
    const bool p_is_negative = get_arg_val<uint32_t>(i++) == 1;

    // Input circular buffers (filled by the reader kernel).
    std::uint8_t input_id{tt::CB::c_in0};
    const auto cb_x = input_id++;        // input
    const auto cb_one = input_id++;      // one
    const auto cb_decimal = input_id++;  // decimal
    const auto cb_mask_w = input_id++;   // mask_w

    // Output circular buffer (drained by the writer kernel).
    std::uint8_t output_id{tt::CB::c_out0};
    const auto cb_y = output_id++;  // output

    // Intermediate circular buffers.
    std::uint8_t intermed_id{tt::CB::c_intermed0};
    const auto cb_tmp0 = intermed_id++;
    const auto cb_tmp1 = intermed_id++;
    const auto cb_tmp2 = intermed_id++;
    const auto cb_tmp3 = intermed_id++;

    const auto cb_xabs = cb_tmp0;      // |x|
    const auto cb_xpow = cb_tmp1;      // |x|^p
    const auto cb_logx = cb_tmp2;      // log(|x|)
    const auto cb_exp_lxmd = cb_tmp3;  // exp(log(|x|) * decimal)

    constexpr uint32_t onetile = 1;
    constexpr uint32_t dst0 = 0;
    constexpr uint32_t dst1 = 1;

    binary_op_init_common(tt::CB::c_in0, tt::CB::c_in0);

    cb_wait_front(cb_one, onetile);      // comes from the reader
    cb_wait_front(cb_decimal, onetile);  // comes from the reader

    // Mask only applies when the logical width is not a tile-width multiple;
    // the mask tile itself is produced by the reader kernel.
    constexpr uint32_t TILE_W = 32;
    const bool do_mask_w = (origin_w % TILE_W) != 0;

    if (do_mask_w) {
        cb_wait_front(cb_mask_w, onetile);  // comes from the reader
    }
    for (uint32_t row_idx = 0; row_idx < num_rows_per_core; ++row_idx) {
        for (uint32_t col_idx = 0; col_idx < Wt; ++col_idx) {
            // |x|
            tile_regs_acquire();
            cb_wait_front(cb_x, onetile);  // comes from the reader
            cb_reserve_back(cb_xabs, onetile);

            copy_tile_init_with_dt(cb_x);
            copy_tile(cb_x, 0, dst0);

            // Zero out padding lanes in the last tile of each row so they do
            // not contribute garbage to the power computation.
            if (do_mask_w && (col_idx == Wt - 1)) {
                copy_tile_init_with_dt(cb_mask_w);
                copy_tile(cb_mask_w, 0, dst1);

                mask_tile_init();
                mask_tile(dst0, dst1);
            }

            abs_tile_init();
            abs_tile(dst0);
            tile_regs_commit();

            tile_regs_wait();
            pack_tile_with_dt(dst0, cb_xabs);
            tile_regs_release();

            cb_pop_front(cb_x, onetile);
            cb_push_back(cb_xabs, onetile);

            // |x|^p -> cb_y, using the decimal part via exp(log(|x|) * decimal).
            power_tile_to_cb(cb_xabs, cb_xpow, cb_logx, cb_decimal, cb_exp_lxmd, cb_y, p, p_is_negative);
        }
    }

    cb_pop_front(cb_one, onetile);
    cb_pop_front(cb_decimal, onetile);
    if (do_mask_w) {
        cb_pop_front(cb_mask_w, onetile);
    }
}  // void MAIN
}  // namespace NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/dataflow/moreh_common.hpp"

// Reader kernel for moreh_abs_pow: fills scalar CBs (one, decimal, optional
// width mask) and streams input tiles from DRAM/L1 into the input CB.
void kernel_main() {
    int i{0};
    const auto input_addr = get_arg_val<uint32_t>(i++);
    const bool input_is_dram = get_arg_val<uint32_t>(i++) == 1;
    const auto decimal = get_arg_val<uint32_t>(i++);  // bit pattern of the fractional exponent
    const auto num_rows_per_core = get_arg_val<uint32_t>(i++);
    const auto Wt = get_arg_val<uint32_t>(i++);
    const auto tile_offset = get_arg_val<uint32_t>(i++);
    const auto origin_w = get_arg_val<uint32_t>(i++);

    uint32_t cb_id{0};
    const auto cb_id_input = cb_id++;
    const auto cb_id_one = cb_id++;
    const auto cb_id_decimal = cb_id++;
    const auto cb_id_mask_w = cb_id++;

    const uint32_t input_tile_bytes = get_tile_size(cb_id_input);
    const auto input_data_format = get_dataformat(cb_id_input);

    const InterleavedAddrGenFast<true> dram_input_addrg = {
        .bank_base_address = input_addr, .page_size = input_tile_bytes, .data_format = input_data_format};

    const InterleavedAddrGenFast<false> l1_input_addrg = {
        .bank_base_address = input_addr, .page_size = input_tile_bytes, .data_format = input_data_format};

    // Scalar constants consumed by the compute kernel.
    Scalar one;
    one.f = 1.0f;
    fill_cb_with_value(cb_id_one, one.u);
    fill_cb_with_value(cb_id_decimal, decimal);

    constexpr uint32_t TILE_W = 32;
    const bool do_mask_w = (origin_w % TILE_W) != 0;
    const auto mask_w = do_mask_w ? (origin_w % TILE_W) : TILE_W;

    if (do_mask_w) {
        generate_mask_w(cb_id_mask_w, mask_w);
    }

    const auto start_tile_idx = tile_offset;

    for (uint32_t row_idx = 0; row_idx < num_rows_per_core; ++row_idx) {
        for (uint32_t col_idx = 0; col_idx < Wt; ++col_idx) {
            const auto tile_idx = start_tile_idx + row_idx * Wt + col_idx;
            cb_reserve_back(cb_id_input, 1);
            // Fetch the write pointer after reserving each page: the CB write
            // pointer advances on every push, so hoisting this out of the loop
            // is only correct for a single-page CB.
            const auto input_l1_write_ptr = get_write_ptr(cb_id_input);
            if (input_is_dram) {
                noc_async_read_tile(tile_idx, dram_input_addrg, input_l1_write_ptr);
            } else {
                noc_async_read_tile(tile_idx, l1_input_addrg, input_l1_write_ptr);
            }
            noc_async_read_barrier();
            cb_push_back(cb_id_input, 1);
        }
    }

}  // void kernel_main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include <stdint.h>

#include "dataflow_api.h"

// Writer kernel for moreh_abs_pow: drains output tiles from the output CB and
// writes them to the interleaved output buffer in DRAM or L1.
void kernel_main() {
    int i{0};
    const auto output_addr = get_arg_val<uint32_t>(i++);
    const bool output_is_dram = get_arg_val<uint32_t>(i++) == 1;
    const auto num_rows_per_core = get_arg_val<uint32_t>(i++);
    const auto Wt = get_arg_val<uint32_t>(i++);
    const auto tile_offset = get_arg_val<uint32_t>(i++);

    uint32_t cb_id{16};  // output CBs start at c_out0 (index 16)
    const auto cb_id_output = cb_id++;

    const uint32_t output_tile_bytes = get_tile_size(cb_id_output);
    const auto output_data_format = get_dataformat(cb_id_output);

    const InterleavedAddrGenFast<true> dram_output_addrg = {
        .bank_base_address = output_addr, .page_size = output_tile_bytes, .data_format = output_data_format};

    const InterleavedAddrGenFast<false> l1_output_addrg = {
        .bank_base_address = output_addr, .page_size = output_tile_bytes, .data_format = output_data_format};

    const auto start_tile_idx = tile_offset;

    for (uint32_t row_idx = 0; row_idx < num_rows_per_core; ++row_idx) {
        for (uint32_t col_idx = 0; col_idx < Wt; ++col_idx) {
            const auto tile_idx = start_tile_idx + row_idx * Wt + col_idx;
            cb_wait_front(cb_id_output, 1);
            // Fetch the read pointer after each wait: the CB read pointer
            // advances on every pop, so hoisting this out of the loop is only
            // correct for a single-page CB.
            const auto output_l1_read_addr = get_read_ptr(cb_id_output);
            if (output_is_dram) {
                noc_async_write_tile(tile_idx, dram_output_addrg, output_l1_read_addr);
            } else {
                noc_async_write_tile(tile_idx, l1_output_addrg, output_l1_read_addr);
            }
            noc_async_write_barrier();
            cb_pop_front(cb_id_output, 1);
        }
    }
}  // void kernel_main()
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#include "moreh_abs_pow_device_operation.hpp"

#include "ttnn/operations/moreh/moreh_helper_functions.hpp"
#include "ttnn/tensor/tensor.hpp"

namespace ttnn::operations::moreh::moreh_abs_pow {

// Decomposes exponent p into (|floor(p)|, fractional remainder, sign flag) so
// device kernels can compute x^p as an integer power combined with
// exp(log(x) * decimal), negated when the floored exponent was below zero.
std::tuple<uint32_t, float, bool> get_floored_p_and_decimal_and_p_is_negative(float p) {
    const float whole = std::floor(p);
    const float fractional = p - whole;  // always in [0, 1)
    const bool negative = whole < 0.0f;
    const float magnitude = negative ? -whole : whole;
    return {static_cast<uint32_t>(magnitude), fractional, negative};
}

// Selects the program factory for this operation. Only one factory exists, so
// it is returned unconditionally regardless of attributes or tensor dtypes.
MorehAbsPowOperation::program_factory_t MorehAbsPowOperation::select_program_factory(
    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
    return MorehAbsPowFactory{};
}

// Validates that both the input and the (optional) preallocated output tensor
// are of a supported dtype (BFLOAT16 or INT32) for moreh_abs_pow.
void validate_tensors(
    const MorehAbsPowOperation::operation_attributes_t& operation_attributes,
    const MorehAbsPowOperation::tensor_args_t& tensor_args) {
    check_tensor(tensor_args.input, "moreh_abs_pow", "input", {DataType::BFLOAT16, DataType::INT32});
    check_tensor(tensor_args.output, "moreh_abs_pow", "output", {DataType::BFLOAT16, DataType::INT32});
}

// Cache-miss validation defers entirely to the shared tensor checks.
void MorehAbsPowOperation::validate_on_program_cache_miss(
    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
    validate_tensors(operation_attributes, tensor_args);
}  // extraneous trailing ';' after the definition removed

// Cache-hit validation is identical to the cache-miss path.
void MorehAbsPowOperation::validate_on_program_cache_hit(
    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
    validate_tensors(operation_attributes, tensor_args);
}  // extraneous trailing ';' after the definition removed
// Computes the output tensor spec: reuses the spec of a preallocated output
// when provided; otherwise mirrors the input's shape, dtype, and layout with
// the memory config from the operation attributes.
MorehAbsPowOperation::spec_return_value_t MorehAbsPowOperation::compute_output_specs(
    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
    const auto& preallocated = tensor_args.output;
    if (preallocated.has_value()) {
        return preallocated->get_tensor_spec();
    }
    const auto& input = tensor_args.input;
    TensorLayout layout(input.get_dtype(), PageConfig(input.get_layout()), operation_attributes.memory_config);
    return TensorSpec(input.get_logical_shape(), layout);
}

// Returns the user-supplied output tensor when present; otherwise allocates a
// new device tensor matching compute_output_specs on the input's device.
MorehAbsPowOperation::tensor_return_value_t MorehAbsPowOperation::create_output_tensors(
    const operation_attributes_t& operation_attributes, const tensor_args_t& tensor_args) {
    if (tensor_args.output.has_value()) {
        log_debug(tt::LogOp, "{}:{} use output tensor", __func__, __LINE__);
        return {tensor_args.output.value()};
    }

    log_debug(tt::LogOp, "{}:{} create output tensor", __func__, __LINE__);
    return create_device_tensor(compute_output_specs(operation_attributes, tensor_args), tensor_args.input.device());
}  // extraneous trailing ';' after the definition removed

// Packs the public-API arguments into the (attributes, tensor_args) pair the
// device-operation framework expects. Defaults: memory config falls back to
// the input's, and the compute kernel config defaults to HiFi4 fidelity.
std::tuple<MorehAbsPowOperation::operation_attributes_t, MorehAbsPowOperation::tensor_args_t>
MorehAbsPowOperation::invoke(
    const Tensor& input,
    const float p,
    const std::optional<Tensor>& output,
    const std::optional<MemoryConfig>& memory_config,
    const std::optional<DeviceComputeKernelConfig>& compute_kernel_config) {
    return {
        operation_attributes_t{
            p,
            memory_config.value_or(input.memory_config()),
            init_device_compute_kernel_config(input.device()->arch(), compute_kernel_config, MathFidelity::HiFi4)},
        tensor_args_t{input, output}};
}
} // namespace ttnn::operations::moreh::moreh_abs_pow
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.
//
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <variant>

#include "ttnn/decorators.hpp"
#include "ttnn/operations/core/compute_kernel/compute_kernel_config.hpp"
#include "ttnn/tensor/types.hpp"

// Declares a program-factory struct for this device operation: cached kernel
// handles and core counts (shared_variables_t), plus the create /
// override_runtime_arguments entry points the device-operation framework
// calls. Comments must stay outside the macro body — a `//` comment inside
// would swallow the backslash line continuations.
#define MOREH_ABS_POW_FACTORY_H(name)                                                     \
    struct name {                                                                         \
        struct shared_variables_t {                                                       \
            KernelHandle reader_kernels_id;                                               \
            KernelHandle writer_kernels_id;                                               \
            std::size_t num_cores_to_be_used;                                             \
            std::size_t num_cores_y;                                                      \
        };                                                                                \
                                                                                          \
        using cached_program_t = ttnn::device_operation::CachedProgram<shared_variables_t>; \
                                                                                          \
        static cached_program_t create(                                                   \
            const operation_attributes_t& operation_attributes,                           \
            const tensor_args_t& tensor_args,                                             \
            tensor_return_value_t& output_tensor);                                        \
                                                                                          \
        static void override_runtime_arguments(                                           \
            cached_program_t& cached_program,                                             \
            const operation_attributes_t& operation_attributes,                           \
            const tensor_args_t& tensor_args,                                             \
            tensor_return_value_t& output_tensor);                                        \
    };

namespace ttnn::operations::moreh::moreh_abs_pow {

std::tuple<uint32_t, float, bool> get_floored_p_and_decimal_and_p_is_negative(float p);

// Device operation computing |input|^p elementwise.
// Conforms to the ttnn device-operation interface: attribute/tensor-arg
// structs, spec/tensor return types, a program factory, and the static
// validate/compute/create/invoke hooks.
struct MorehAbsPowOperation {
    // Cache-key attributes: the exponent plus output placement and kernel config.
    struct operation_attributes_t {
        const float p;

        const MemoryConfig memory_config;
        const DeviceComputeKernelConfig compute_kernel_config;
    };
    // Input tensor plus an optional preallocated output.
    struct tensor_args_t {
        const Tensor& input;
        const std::optional<Tensor>& output;
    };

    using spec_return_value_t = TensorSpec;
    using tensor_return_value_t = Tensor;

    // Expands to the MorehAbsPowFactory program-factory declaration.
    MOREH_ABS_POW_FACTORY_H(MorehAbsPowFactory)

    using program_factory_t = std::variant<MorehAbsPowFactory>;
    static program_factory_t select_program_factory(const operation_attributes_t&, const tensor_args_t&);
    static void validate_on_program_cache_miss(const operation_attributes_t&, const tensor_args_t&);
    static void validate_on_program_cache_hit(const operation_attributes_t&, const tensor_args_t&);
    static spec_return_value_t compute_output_specs(const operation_attributes_t&, const tensor_args_t&);
    static tensor_return_value_t create_output_tensors(const operation_attributes_t&, const tensor_args_t&);
    // Public entry point: packs arguments into (attributes, tensor_args).
    static std::tuple<operation_attributes_t, tensor_args_t> invoke(
        const Tensor& input,
        const float p,
        const std::optional<Tensor>& output,
        const std::optional<MemoryConfig>& memory_config,
        const std::optional<DeviceComputeKernelConfig>& compute_kernel_config);
};

} // namespace ttnn::operations::moreh::moreh_abs_pow

// Registers the primitive so it is callable as ttnn::prim::moreh_abs_pow.
namespace ttnn::prim {
constexpr auto moreh_abs_pow = ttnn::
    register_operation<"ttnn::prim::moreh_abs_pow", ttnn::operations::moreh::moreh_abs_pow::MorehAbsPowOperation>();
}  // namespace ttnn::prim
Loading
Loading