From: Prasanna R/SNAP /SRI-Bangalore/Engineer/삼성전자 Date: Tue, 18 Dec 2018 04:47:17 +0000 (+0530) Subject: Making comparison_op_quantized.cl generic with VEC_SIZE (#4042) X-Git-Tag: 0.3~51 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5a1045348d9b9d90886bac72a404e3185486439c;p=platform%2Fcore%2Fml%2Fnnfw.git Making comparison_op_quantized.cl generic with VEC_SIZE (#4042) This patch makes comparison op quantized.cl more generic based on VEC_SIZE replacement from 16 -> vec_size This is done for better scaling of this kernel. Signed-off-by: prasannar --- diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp index 6326ee1..40efbcc 100644 --- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp +++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp @@ -75,7 +75,7 @@ const std::map CLKernelLibraryEx::_kernel_program_map {"combine_gradients_L1", "canny.cl"}, {"combine_gradients_L2", "canny.cl"}, {"comparison_op", "comparison_op.cl"}, - {"comparison_op_quantized", "comparison_op_quantized.cl"}, + {"comparison_op_qasymm8", "comparison_op_quantized.cl"}, {"concatenate_depth", "concatenate.cl"}, {"concatenate_width", "concatenate.cl"}, {"convolution_rectangle", "convolution_rectangle.cl"}, diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl index 133fcee..41c90b7 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/comparison_op_quantized.cl @@ -17,6 +17,12 @@ #include "helpers.h" #define SUB(x, y) (x) - (y) +#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(VEC_SIZE) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_OUT VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + /** Returns the truth value of comparison . * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 @@ -48,7 +54,7 @@ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void comparison_op_quantized( +__kernel void comparison_op_qasymm8( TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2), TENSOR3D_DECLARATION(out)) @@ -58,17 +64,18 @@ __kernel void comparison_op_quantized( Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16); - int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16); + VEC_INT in_a = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in1.ptr), VEC_INT); + VEC_INT in_b = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)in2.ptr), VEC_INT); - in_a = SUB(in_a, (int16)((int)OFFSET_IN1)); - in_b = SUB(in_b, (int16)((int)OFFSET_IN2)); + in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); + in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); - const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1); - const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2); + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); #if OPCODE == 0 //EQUAL QUANTIZED - vstore16(CONVERT(in1f32 == in2f32, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global uchar *)out.ptr); + VSTORE(VEC_SIZE)(CONVERT(in1f32 == in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); #elif OPCODE == 1 //NOT EQUAL QUANTIZED - vstore16(CONVERT(in1f32 != in2f32, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global uchar *)out.ptr); + VSTORE(VEC_SIZE)(CONVERT(in1f32 != in2f32, VEC_OUT), 0, (__global DATA_TYPE_OUT *)out.ptr); #endif } +#endif // defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(VEC_SIZE) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp index 9174c87..136ed3a 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLComparisonOpKernel.cpp @@ -103,7 +103,7 @@ void CLComparisonOpKernel::configure(const ICLTensor *input1, const ICLTensor *i support::cpp11::to_string(input1->info()->quantization_info().scale)); build_opts.emplace("-DSCALE_IN2=" + support::cpp11::to_string(input2->info()->quantization_info().scale)); - kernel_name += "_quantized"; + kernel_name += "_qasymm8"; } _kernel =