From: Prasanna R/SNAP /SRI-Bangalore/Engineer/삼성전자 <prasanna.r@samsung.com>
Date: Tue, 18 Dec 2018 04:47:17 +0000 (+0530)
Subject: Making comparison_op_quantized.cl generic with VEC_SIZE (#4042)
X-Git-Tag: 0.3~51
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5a1045348d9b9d90886bac72a404e3185486439c;p=platform%2Fcore%2Fml%2Fnnfw.git

Making comparison_op_quantized.cl generic with VEC_SIZE (#4042)

This patch makes comparison op quantized.cl more generic based on VEC_SIZE
replacement from 16 -> vec_size
This is done for better scaling of this kernel.

Signed-off-by: prasannar <prasanna.r@samsung.com>
---

diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 6326ee1..40efbcc 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -75,7 +75,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"combine_gradients_L1", "canny.cl"},
     {"combine_gradients_L2", "canny.cl"},
     {"comparison_op", "comparison_op.cl"},
-    {"comparison_op_quantized", "comparison_op_quantized.cl"},
+    {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
     {"concatenate_depth", "concatenate.cl"},
     {"concatenate_width", "concatenate.cl"},
     {"convolution_rectangle", "convolution_rectangle.cl"},
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
index 133fcee..41c90b7 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl
@@ -17,6 +17,12 @@
 #include "helpers.h"
 #define SUB(x, y) (x) - (y)
 
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(VEC_SIZE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+
 /** Returns the truth value of comparison .
  * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
  * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
@@ -48,7 +54,7 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void comparison_op_quantized(
+__kernel void comparison_op_qasymm8(
     TENSOR3D_DECLARATION(in1),
     TENSOR3D_DECLARATION(in2),
     TENSOR3D_DECLARATION(out))
@@ -58,17 +64,18 @@ __kernel void comparison_op_quantized(
     Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
     Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
 
-    int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
-    int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
+    VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT);
+    VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT);
 
-    in_a = SUB(in_a, (int16)((int)OFFSET_IN1));
-    in_b = SUB(in_b, (int16)((int)OFFSET_IN2));
+    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
 
-    const float16 in1f32  = convert_float16(in_a) * (float16)((float)SCALE_IN1);
-    const float16 in2f32  = convert_float16(in_b) * (float16)((float)SCALE_IN2);
+    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
 #if OPCODE == 0 //EQUAL QUANTIZED
-    vstore16(CONVERT(in1f32 == in2f32, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global uchar *)out.ptr);
+    VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
 #elif OPCODE == 1 //NOT EQUAL QUANTIZED
-    vstore16(CONVERT(in1f32 != in2f32, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global uchar *)out.ptr);
+    VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr);
 #endif
 }
+#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(VEC_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
index 9174c87..136ed3a 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp
@@ -103,7 +103,7 @@ void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *i
                        support::cpp11::to_string(input1->info()->quantization_info().scale));
     build_opts.emplace("-DSCALE_IN2=" +
                        support::cpp11::to_string(input2->info()->quantization_info().scale));
-    kernel_name += "_quantized";
+    kernel_name += "_qasymm8";
   }
 
   _kernel =