Add Quant8 support to NotEqual CL (#3754)
authorShubham Gupta/SNAP /SRI-Bangalore/Engineer/삼성전자 <shub98.gupta@samsung.com>
Fri, 30 Nov 2018 07:38:52 +0000 (13:08 +0530)
committer오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Fri, 30 Nov 2018 07:38:52 +0000 (16:38 +0900)
This patch adds QASYMM8 (8-bit asymmetric quantized) support for the NotEqual op.

Signed-off-by: shubham <shub98.gupta@samsung.com>
libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
libs/ARMComputeEx/src/core/CL/cl_kernels/notequal_quantized.cl [new file with mode: 0644]
libs/ARMComputeEx/src/core/CL/kernels/CLNotEqualKernel.cpp

index 562d16d..f0ca2db 100644 (file)
@@ -217,6 +217,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"normalization_layer_cross_map", "normalization_layer.cl"},
     {"normalization_layer_in_map", "normalization_layer.cl"},
     {"notequal", "notequal.cl"},
+    {"notequal_quantized", "notequal_quantized.cl"},
     {"NV12_to_IYUV_bt709", "color_convert.cl"},
     {"NV12_to_RGB888_bt709", "color_convert.cl"},
     {"NV12_to_RGBA8888_bt709", "color_convert.cl"},
@@ -387,6 +388,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/notequal.clembed"
     },
     {
+        "notequal_quantized.cl",
+#include "./cl_kernels/notequal_quantized.clembed"
+    },
+    {
         "pad.cl",
 #include "./cl_kernels/pad.clembed"
     },
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/notequal_quantized.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/notequal_quantized.cl
new file mode 100644 (file)
index 0000000..a4a7026
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+/* Fully parenthesized so the expansion stays correct inside larger expressions. */
+#define SUB(x, y) ((x) - (y))
+
+/** Checks if values in both tensors are not equal.
+ * @attention Offset and Scale of both input should be given as a preprocessor argument using -DOFFSET_IN1=int, -DOFFSET_IN2=int, -DSCALE_IN1=float and -DSCALE_IN2=float. e.g. -DOFFSET_IN1=1, -DOFFSET_IN2=0, -DSCALE_IN1=0.5, -DSCALE_IN2=0.5
+ * @attention This kernel processes a fixed vector of 16 elements per work-item (vload16/vstore16); the host must configure the window with 16 elements per iteration.
+ *
+ * @param[in]  input1_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input1_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input1_step_x                         input1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input1_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input1_step_y                         input1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input1_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input1_step_z                         input1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input1_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[in]  input2_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input2_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input2_step_x                         input2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input2_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input2_step_y                         input2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input2_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input2_step_z                         input2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input2_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ *
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void notequal_quantized(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load 16 quantized values from each input.
+    int16 in_a = CONVERT(vload16(0, (__global uchar *)in1.ptr), int16);
+    int16 in_b = CONVERT(vload16(0, (__global uchar *)in2.ptr), int16);
+
+    // Dequantize: real_value = (quantized_value - offset) * scale.
+    in_a = SUB(in_a, (int16)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (int16)((int)OFFSET_IN2));
+
+    const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1);
+    const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2);
+
+    // Vector relational operators return -1 (all bits set) per true lane and 0
+    // otherwise; mask to 0/1 directly instead of round-tripping through float.
+    const int16 cmp = in1f32 != in2f32;
+    const uchar16 res = convert_uchar16(cmp & (int16)1);
+
+    // Store result
+    vstore16(res, 0, (__global uchar *)out.ptr);
+}
index 92a4a21..c662616 100644 (file)
@@ -32,10 +32,10 @@ Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
   const TensorShape &out_shape =
       TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
 
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
-  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16,
-                                                       DataType::F16, DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
 
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
                                   "Inputs are not broadcast compatible");
@@ -72,6 +72,19 @@ void CLNotEqualKernel::configure(const ICLTensor *input1, const ICLTensor *input
   build_opts.emplace(
       ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
 
+  if (is_data_type_quantized_asymmetric(input1->info()->data_type()))
+  {
+    build_opts.emplace("-DOFFSET_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().offset));
+    build_opts.emplace("-DOFFSET_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().offset));
+    build_opts.emplace("-DSCALE_IN1=" +
+                       support::cpp11::to_string(input1->info()->quantization_info().scale));
+    build_opts.emplace("-DSCALE_IN2=" +
+                       support::cpp11::to_string(input2->info()->quantization_info().scale));
+    kernel_name += "_quantized";
+  }
+
   _kernel =
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));