src: cpu: aarch64: lowp_matmul_sq: Make weights constant #2212

Draft · wants to merge 3 commits into main

Changes from all commits
11 changes: 10 additions & 1 deletion src/common/convolution_pd.hpp
@@ -1,5 +1,6 @@
/*******************************************************************************
* Copyright 2016-2024 Intel Corporation
* Copyright 2024 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -234,8 +235,16 @@ struct convolution_pd_t : public primitive_desc_t {
|| invariant_dst_md()->data_type == dst_dt)
&& (acc_dt == data_type::undef
|| desc_.accum_data_type == acc_dt);
if (with_bias() && bia_dt != data_type::undef)
if (with_bias() && bia_dt != data_type::undef) {
#ifdef __aarch64__
// ACL only supports s32 bias for quantization, so internally we convert
// from f32 to s32; the bias data types therefore do not match here.
if (utils::one_of(
dst_dt, data_type_t::dnnl_s8, data_type_t::dnnl_u8))
return ok;
#endif
ok = ok && invariant_bia_md()->data_type == bia_dt;
}
return ok;
}

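To illustrate the case this exemption targets, here is a minimal sketch (hypothetical shapes and names, assuming the oneDNN v3 C++ API): a u8 convolution whose bias is supplied as f32, which the AArch64 backend accepts and requantizes to s32 internally.

```cpp
#include "dnnl.hpp"

// Sketch only: hypothetical shapes; error handling omitted.
// On aarch64 an f32 bias is accepted for s8/u8 destinations even though
// the implementation converts it to s32 before handing it to ACL.
dnnl::convolution_forward::primitive_desc make_quantized_conv_pd(
        const dnnl::engine &eng) {
    using dt = dnnl::memory::data_type;
    using tag = dnnl::memory::format_tag;

    dnnl::memory::desc src_md({1, 32, 14, 14}, dt::u8, tag::nhwc);
    dnnl::memory::desc wei_md({64, 32, 3, 3}, dt::s8, tag::any);
    dnnl::memory::desc bia_md({64}, dt::f32, tag::a); // f32, not s32
    dnnl::memory::desc dst_md({1, 64, 12, 12}, dt::u8, tag::nhwc);

    return dnnl::convolution_forward::primitive_desc(eng,
            dnnl::prop_kind::forward_inference,
            dnnl::algorithm::convolution_direct, src_md, wei_md, bia_md,
            dst_md, /*strides*/ {1, 1}, /*padding_l*/ {0, 0},
            /*padding_r*/ {0, 0});
}
```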
1 change: 1 addition & 0 deletions src/common/memory_tracking.hpp
@@ -179,6 +179,7 @@ enum {
key_conv_amx_wsp_buffer,
key_conv_bia_reduction,
key_conv_bias_bf16_convert_wsp,
key_conv_bias_s32_convert,
key_conv_cudnn,
key_conv_cudnn_algo,
key_conv_cudnn_filter,
29 changes: 23 additions & 6 deletions src/cpu/aarch64/acl_convolution_utils.cpp
@@ -65,8 +65,13 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
everyone_is(data_type::f16, src_d.data_type(),
wei_d.data_type(), dst_d.data_type()),
everyone_is(data_type::bf16, src_d.data_type(),
wei_d.data_type(), dst_d.data_type())),
" src, dst and wei must be fp16, bf16 or fp32");
wei_d.data_type(), dst_d.data_type()),
everyone_is(data_type::s8, src_d.data_type(),
wei_d.data_type(), dst_d.data_type()),
(everyone_is(data_type::u8, src_d.data_type(),
dst_d.data_type())
&& wei_d.data_type() == data_type::s8)),
" src, dst and wei must be s8, u8, bf16, fp16 or fp32");
// batch size
const int mb = src_d.dims()[0];

@@ -165,7 +170,8 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
: arm_compute::DataLayout::NCHW;

// src and dst share a data type; quantized weights and bias are handled below
auto acl_data_type = acl_utils::get_acl_data_t(src_d.data_type());
auto acl_data_type
= acl_utils::get_acl_data_t(src_d.data_type(), acp.is_quantized);

// clang-format off
acp.src_tensor_info = arm_compute::TensorInfo(
Expand All @@ -179,8 +185,9 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
is_nhwc ? arm_compute::TensorShape(ic, kw, kh, oc) :
arm_compute::TensorShape(kw, kh, ic, oc),
1,
acl_data_type,
acl_utils::get_acl_data_t(wei_d.data_type(), acp.is_quantized),
acl_layout);

if(is_depthwise) {
// We need to mark the values as not constant so that
// we can update them in-place in ACL
@@ -198,10 +205,20 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
acp.with_bias ? arm_compute::TensorShape(oc)
: arm_compute::TensorShape(),
1,
acl_data_type,
acp.is_quantized ? acl_utils::get_acl_data_t(data_type::s32) : acl_data_type,
acl_layout);
// clang-format on

if (acp.is_quantized) {
// ACL rejects the operation if quantization information is empty during configuration.
// Since the correct parameters are not available at this stage, we provide placeholder values.
// These values are then updated with the correct ones during the run stage.
arm_compute::QuantizationInfo qi {1.0, 0, true};
acp.src_tensor_info.set_quantization_info(qi);
acp.wei_tensor_info.set_quantization_info(qi);
acp.dst_tensor_info.set_quantization_info(qi);
}

// ACL Winograd is not prepared for fixed format kernels
if (acp.alg_winograd) {
const bool is_1d = ndims == 3;
@@ -216,7 +233,7 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
// Are we allowed to cast down to bf16 or not?
acp.fast_math
= one_of(attr.fpmath_.mode_, fpmath_mode::bf16, fpmath_mode::any);
if (is_depthwise) {
if (is_depthwise || acp.is_quantized) {
// There is no support for fixed format kernels for depthwise or quantized
// convolution in ACL, so we use the weight format we set up earlier
return status::success;
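The configure-then-update flow above can be condensed into a standalone sketch (assuming the ACL version this patch targets, where `QuantizationInfo` takes an `is_dynamic` flag; the concrete scale and zero point below are illustrative):

```cpp
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

// Sketch: configure with placeholder dynamic quantization info so ACL does
// not reject the operation, then swap in the real runtime values before run.
arm_compute::TensorInfo make_placeholder_qinfo_tensor() {
    arm_compute::TensorInfo info(arm_compute::TensorShape(32U, 14U, 14U, 1U),
            1, arm_compute::DataType::QASYMM8_SIGNED);
    // Placeholders; is_dynamic = true tells ACL they may change at run time.
    info.set_quantization_info(arm_compute::QuantizationInfo(1.0f, 0, true));
    // ... later, once the real scale and zero point are known:
    info.set_quantization_info(arm_compute::QuantizationInfo(0.05f, -12, true));
    return info;
}
```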
98 changes: 85 additions & 13 deletions src/cpu/aarch64/acl_convolution_utils.hpp
@@ -20,9 +20,11 @@
#include <map>
#include "acl_post_ops.hpp"
#include "acl_utils.hpp"
#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
#include "cpu/cpu_convolution_pd.hpp"
#include <type_traits>
#include "cpu/cpu_primitive.hpp"

#include "arm_compute/runtime/experimental/operators/CpuGemmConv2d.h"

namespace dnnl {
namespace impl {
namespace cpu {
@@ -44,6 +46,8 @@ struct acl_conv_conf_t {
// algorithm can be set to algorithm::convolution_auto and later on we need to
// skip fixed-format protocol as ACL Winograd does not support it.
bool alg_winograd;
// Currently, only CpuGemmConv2d has the static quantization update interface.
bool is_quantized;
arm_compute::TensorInfo src_tensor_info;
arm_compute::TensorInfo wei_tensor_info;
arm_compute::TensorInfo bia_tensor_info;
@@ -70,11 +74,13 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);

template <typename op_t, typename post_ops_t>
status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
status_t init_scratchpad(const op_t &conv,
memory_tracking::registrar_t &scratchpad,
const std::map<int, conv_key_t> &conv_keys, engine_t *engine,
post_ops_t &post_ops, dnnl::impl::post_ops_t &attr_post_ops,
arm_compute::ActivationLayerInfo &act_info, bool &use_dst_acc_for_sum,
const dnnl::impl::memory_desc_t &dst_md) {
const dnnl::impl::memory_desc_t &dst_md,
const dnnl::impl::memory_desc_t &bias_md, const bool is_quantized) {

// Book temp mem.
const auto aux_mem_req = conv.workspace();
@@ -95,14 +101,20 @@ status_t init_scratchpad(op_t &conv, memory_tracking::registrar_t &scratchpad,
dst_d.data_type_size());
}

if (is_quantized && bias_md.format_kind != format_kind::undef) {
const memory_desc_wrapper bias_d(&bias_md);
scratchpad.book(memory_tracking::names::key_conv_bias_s32_convert,
bias_d.nelems(), bias_d.data_type_size());
}

return status::success;
}
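The scratchpad entry booked above holds an s32 copy of the user's f32 bias. A standalone restatement of the conversion the execute path performs (see execute_forward_conv_acl below): ACL accumulates in s32 at an effective scale of s_src * s_wei, so the bias is requantized with the inverse of that scale.

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Sketch of the f32 -> s32 bias requantization done at execute time:
// b_s32[i] = round(b_f32[i] / (s_src * s_wei)).
std::vector<int32_t> quantize_bias(const std::vector<float> &bias_f32,
        float src_scale, float wei_scale) {
    const float bias_scale = 1.0f / (src_scale * wei_scale);
    std::vector<int32_t> bias_s32(bias_f32.size());
    for (size_t i = 0; i < bias_f32.size(); ++i)
        bias_s32[i]
                = static_cast<int32_t>(std::round(bias_f32[i] * bias_scale));
    return bias_s32;
}
```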

template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
typename bia_data_t = src_data_t>
status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
conv_obj_t *acl_conv_obj, const conv_pd_t *pd,
conv_obj_t *acl_conv_obj, const conv_pd_t *pd_,
const std::map<int, conv_key_t> &conv_keys) {

auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
@@ -115,16 +127,49 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
arm_compute::Tensor bia_tensor = nullptr;
arm_compute::Tensor dst_tensor;

auto const acp = pd->acp_;
auto const acp = pd_->acp_;
src_tensor.allocator()->init(acp.src_tensor_info);
wei_tensor.allocator()->init(acp.wei_tensor_info);
dst_tensor.allocator()->init(acp.dst_tensor_info);

const auto scratchpad = ctx.get_scratchpad_grantor();

if (acp.is_quantized) {
// The DEFINE_(ARG|ZERO)_* macros expect 'pd' to be callable
auto pd = [pd_] { return pd_; };

DEFINE_ARG_SCALES_BUFFER(src_scale, DNNL_ARG_SRC);
DEFINE_ZERO_POINT_VALUE(src_zero_point, DNNL_ARG_SRC);
DEFINE_ARG_SCALES_BUFFER(wei_scale, DNNL_ARG_WEIGHTS);
DEFINE_ZERO_POINT_VALUE(wei_zero_point, DNNL_ARG_WEIGHTS);
DEFINE_ARG_SCALES_BUFFER(dst_scale, DNNL_ARG_DST);
DEFINE_ZERO_POINT_VALUE(dst_zero_point, DNNL_ARG_DST);

// s8s8s8 uses D = Sx*Sw*(X*W + X*zw + W*zx + zx*zw) and
// u8s8u8 uses D = Sx*Sw*(X*W - X*zw - W*zx + zx*zw)
if (dst_tensor.info()->data_type() == arm_compute::DataType::QASYMM8) {
src_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*src_scale, -src_zero_point, true));
wei_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*wei_scale, -wei_zero_point, true));
} else {
src_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*src_scale, src_zero_point, true));
wei_tensor.info()->set_quantization_info(
arm_compute::QuantizationInfo(
*wei_scale, wei_zero_point, true));
}

// For efficiency reasons, oneDNN stores the inverse of the destination scale
dst_tensor.info()->set_quantization_info(arm_compute::QuantizationInfo(
1.0 / (*dst_scale), dst_zero_point, true));
}

src_tensor.allocator()->import_memory(const_cast<src_data_t *>(src_base));
wei_tensor.allocator()->import_memory(const_cast<wei_data_t *>(wei_base));

// If we have an unfused sum post op, put the result in a scratchpad tensor.
// Result will be summed to the dst during acl_post_ops.execute
auto dst_base = acp.use_dst_acc_for_sum
@@ -133,10 +178,30 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
dst_tensor.allocator()->import_memory(dst_base);

if (acp.with_bias) {
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
bia_tensor.allocator()->init(acp.bia_tensor_info);
bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
if (acp.is_quantized) {
auto bia_s32_base = scratchpad.get<uint32_t>(
memory_tracking::names::key_conv_bias_s32_convert);
auto bia_f32_base = CTX_IN_MEM(const float32_t *, DNNL_ARG_BIAS);
auto src_scale
= src_tensor.info()->quantization_info().uniform().scale;
auto wei_scale
= wei_tensor.info()->quantization_info().uniform().scale;
const float bias_scale = 1 / (src_scale * wei_scale);
const int num_elements
= acp.bia_tensor_info.total_size() / sizeof(float32_t);
parallel_nd(num_elements, [&](dim_t e) {
const auto b
= int32_t(std::round(bia_f32_base[e] * bias_scale));
bia_s32_base[e] = b;
});
bia_tensor.allocator()->init(acp.bia_tensor_info);
bia_tensor.allocator()->import_memory(bia_s32_base);
} else {
auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
bia_tensor.allocator()->init(acp.bia_tensor_info);
bia_tensor.allocator()->import_memory(
const_cast<bia_data_t *>(bia_base));
}
}

// Constness of the weight tensor matters for depthwise conv in ACL.
@@ -167,10 +232,17 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
}
}

if (acp.is_quantized) {
arm_compute::experimental::op::CpuGemmConv2d *conv
= dynamic_cast<arm_compute::experimental::op::CpuGemmConv2d *>(
&acl_conv_obj->conv);
if (conv) conv->update_quantization_parameters(pack);
}

acl_conv_obj->conv.run(pack);

void *dst = dst_tensor.buffer();
pd->post_ops.execute(ctx, dst);
pd_->post_ops.execute(ctx, dst);

return status::success;
}
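For reference, the zero-point signs in the quantization comment above follow from the affine model x = s_x (q_x - z_x). A short derivation (notation mine, not from the patch):

```math
x \cdot w = s_x (q_x - z_x)\, s_w (q_w - z_w)
          = s_x s_w \left( q_x q_w - q_x z_w - q_w z_x + z_x z_w \right)
```

Negating the zero points passed to ACL in the QASYMM8 branch is what flips between the additive and subtractive sign conventions, and storing 1/s_dst for the destination lets the final requantization q_d = d / s_d + z_d be computed with a multiply instead of a divide.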
2 changes: 1 addition & 1 deletion src/cpu/aarch64/acl_depthwise_convolution.cpp
@@ -74,7 +74,7 @@ status_t acl_depthwise_convolution_fwd_t::pd_t::init(engine_t *engine) {
auto scratchpad = scratchpad_registry().registrar();
return init_scratchpad(conv, scratchpad, depthwise_conv_keys, engine,
post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
dst_md_);
dst_md_, bias_md_, false);
}

status_t acl_depthwise_convolution_fwd_t::init(engine_t *engine) {
32 changes: 27 additions & 5 deletions src/cpu/aarch64/acl_gemm_convolution.cpp
@@ -52,17 +52,20 @@ template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
status_t acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::pd_t::init(
engine_t *engine) {
using namespace data_type;
using smask_t = primitive_attr_t::skip_mask_t;

bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
&& expect_data_types(src_t, wei_t, bia_t, dst_t, undef)
&& !has_zero_dim_memory()
&& attr()->has_default_values(
smask_t::post_ops | smask_t::fpmath_mode, dst_t);
&& !has_zero_dim_memory() && output_scales_mask_ok()
&& zero_points_ok();

if (!ok) return status::unimplemented;

if (weights_md_.ndims != 4) return status::unimplemented;

// Currently, only CpuGemmConv2d has the static quantization update interface.
acp_.is_quantized
= utils::one_of(dst_md_.data_type, data_type::s8, data_type::u8);

// General Compute Library checks, memory tags are also set there
CHECK(acl_convolution_utils::acl_init_conf(
acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
@@ -82,7 +85,25 @@ status_t acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t, bia_t>::pd_t::init(
auto scratchpad = scratchpad_registry().registrar();
const auto mem_req = conv.workspace();
return init_scratchpad(conv, scratchpad, gemm_conv_keys, engine, post_ops,
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_);
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_,
bias_md_, acp_.is_quantized);
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
bool acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t,
bia_t>::pd_t::output_scales_mask_ok() const {
int mask_src = attr()->scales_.get(DNNL_ARG_SRC).mask_;
int mask_wei = attr()->scales_.get(DNNL_ARG_WEIGHTS).mask_;
int mask_dst = attr()->scales_.get(DNNL_ARG_DST).mask_;
return mask_src == 0 && mask_wei == 0 && mask_dst == 0;
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
data_type_t bia_t>
bool acl_gemm_convolution_fwd_t<src_t, wei_t, dst_t,
bia_t>::pd_t::zero_points_ok() const {
return attr()->zero_points_.common();
}

template <data_type_t src_t, data_type_t wei_t, data_type_t dst_t,
Expand Down Expand Up @@ -133,6 +154,7 @@ using namespace data_type;
template struct acl_gemm_convolution_fwd_t<f32>;
template struct acl_gemm_convolution_fwd_t<f16>;
template struct acl_gemm_convolution_fwd_t<s8, s8, s8, s32>;
template struct acl_gemm_convolution_fwd_t<u8, s8, u8, s32>;

} // namespace aarch64
} // namespace cpu
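The two new predicates only admit per-tensor quantization: every scales mask must be 0 and zero points must be common. A sketch of primitive attributes that pass these checks (assuming the oneDNN v3 C++ API):

```cpp
#include "dnnl.hpp"

// Per-tensor (mask == 0) scales and common zero points, as required by
// output_scales_mask_ok() and zero_points_ok() above.
dnnl::primitive_attr make_acl_friendly_attr() {
    dnnl::primitive_attr attr;
    attr.set_scales_mask(DNNL_ARG_SRC, 0);
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0); // per-channel would be rejected
    attr.set_scales_mask(DNNL_ARG_DST, 0);
    attr.set_zero_points_mask(DNNL_ARG_SRC, 0);
    attr.set_zero_points_mask(DNNL_ARG_DST, 0);
    return attr;
}
```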
2 changes: 2 additions & 0 deletions src/cpu/aarch64/acl_gemm_convolution.hpp
@@ -44,6 +44,8 @@ struct acl_gemm_convolution_fwd_t : public primitive_t {
"gemm:acl", acl_gemm_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);

status_t init(engine_t *engine);
bool output_scales_mask_ok() const;
bool zero_points_ok() const;

acl_conv_conf_t acp_;
acl_post_ops_t post_ops;
2 changes: 1 addition & 1 deletion src/cpu/aarch64/acl_indirect_gemm_convolution.cpp
@@ -117,7 +117,7 @@ status_t acl_indirect_gemm_convolution_fwd_t::pd_t::init(engine_t *engine) {
auto scratchpad = scratchpad_registry().registrar();
return init_scratchpad(conv, scratchpad, indirect_conv_keys, engine,
post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
dst_md_);
dst_md_, bias_md_, false);
}

} // namespace aarch64
3 changes: 2 additions & 1 deletion src/cpu/aarch64/acl_winograd_convolution.cpp
@@ -75,7 +75,8 @@ status_t acl_wino_convolution_fwd_t::pd_t::init(engine_t *engine) {
auto scratchpad = scratchpad_registry().registrar();
const auto aux_mem = conv.workspace();
return init_scratchpad(conv, scratchpad, wino_conv_keys, engine, post_ops,
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_);
attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_,
bias_md_, acp_.is_quantized);
}

status_t acl_wino_convolution_fwd_t::init(engine_t *engine) {