From: Shubham Gupta/SNAP /SRI-Bangalore/Engineer/삼성전자 <shub98.gupta@samsung.com>
Date: Fri, 30 Nov 2018 07:38:52 +0000 (+0530)
Subject: Add Quant8 support to NotEqual CL (#3754)
X-Git-Tag: 0.3~277
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=15662fa9830ca75ec4df19de0657f38fad4a06fc;p=platform%2Fcore%2Fml%2Fnnfw.git

Add Quant8 support to NotEqual CL (#3754)

This patch will provide quant8 support for NotEqual Op

Signed-off-by: shubham <shub98.gupta@samsung.com>
---

diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 562d16d..f0ca2db 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -217,6 +217,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"normalization_layer_cross_map", "normalization_layer.cl"},
     {"normalization_layer_in_map", "normalization_layer.cl"},
     {"notequal", "notequal.cl"},
+    {"notequal_quantized", "notequal_quantized.cl"},
     {"NV12_to_IYUV_bt709", "color_convert.cl"},
     {"NV12_to_RGB888_bt709", "color_convert.cl"},
     {"NV12_to_RGBA8888_bt709", "color_convert.cl"},
@@ -387,6 +388,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/notequal.clembed"
     },
     {
+        "notequal_quantized.cl",
+#include "./cl_kernels/notequal_quantized.clembed"
+    },
+    {
         "pad.cl",
 #include "./cl_kernels/pad.clembed"
     },
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/notequal_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/notequal_quantized.cl
new file mode 100644
index 0000000..a4a7026
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/notequal_quantized.cl
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+#define SUB(x, y) (x) - (y)
+
+/** Checks if values in both tensors are not equal.
+ * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[in]  input2_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void notequal_quantized(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
+    int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
+
+    in_a = SUB(in_a, (int16)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (int16)((int)OFFSET_IN2));
+
+    const float16 in1f32  = convert_float16(in_a) * (float16)((float)SCALE_IN1);
+    const float16 in2f32  = convert_float16(in_b) * (float16)((float)SCALE_IN2);
+    const float16 qresf32 = convert_float16((in1f32 != in2f32) ? 1 : 0);
+    const uchar16 res = convert_uchar16_sat(convert_int16_rte(qresf32));
+    // Store result
+    vstore16(res, 0, (__global uchar *)out.ptr);
+}
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLNotEqualKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLNotEqualKernel.cpp
index 92a4a21..c662616 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLNotEqualKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLNotEqualKernel.cpp
@@ -32,10 +32,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
   const TensorShape &out_shape =
       TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
 
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
 
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
                                   "Inputs are not broadcast compatible");
@@ -72,6 +72,19 @@ void CLNotEqualKernel::configure(const ICLTensor *input1, const ICLTensor *input
   build_opts.emplace(
       ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
 
+  if (is_data_type_quantized_asymmetric(input1->info()->data_type()))
+  {
+    build_opts.emplace("-DOFFSET_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().offset));
+    build_opts.emplace("-DOFFSET_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().offset));
+    build_opts.emplace("-DSCALE_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().scale));
+    kernel_name += "_quantized";
+  }
+
   _kernel =
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));