Implement CL Kernel for ReduceMin op (#3744)
author: Prasanna R/SNAP /SRI-Bangalore/Engineer/삼성전자 <prasanna.r@samsung.com>
Fri, 30 Nov 2018 02:17:27 +0000 (07:47 +0530)
committer: 오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Fri, 30 Nov 2018 02:17:27 +0000 (11:17 +0900)
This patch implements the CL kernel for the ReduceMin op.

Signed-off-by: prasannar <prasanna.r@samsung.com>
libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
libs/ARMComputeEx/src/core/CL/cl_kernels/reduce_operation.cl
libs/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp

index ab7962a..562d16d 100644 (file)
@@ -246,6 +246,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl"},
     {"quantization_layer", "quantization_layer.cl"},
     {"reduce_max", "reduce_operation.cl"},
+    {"reduce_min", "reduce_operation.cl"},
     {"reduce_mean", "reduce_operation.cl"},
     {"reduce_sum", "reduce_operation.cl"},
     {"remap_nearest_neighbour", "remap.cl"},
index a3ade0b..690ce7d 100644 (file)
@@ -71,6 +71,60 @@ __kernel void reduce_max(TENSOR4D_DECLARATION(input),
     *((__global DATA_TYPE *)out.ptr) = max_value;
 }
 
+/** Perform reduce min: each work-item writes the minimum of the input elements it visits along one axis.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  axis                                 Axis through which reduction occurs; used as an index into a 4-entry coordinate array, so it must be in [0, 3]
+ * @param[in]  dim                                  Dimension across the axis to be reduced, i.e. the exclusive upper bound of the index loop along @p axis
+ */
+__kernel void reduce_min(TENSOR4D_DECLARATION(input),
+                         TENSOR4D_DECLARATION(output),
+                         const int axis,
+                         const int dim)
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0); // NOTE(review): second macro argument is 0 for the input but DEPTH_OUT for the output - confirm this is intended
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    int indices[4] = // 4D coordinates passed to tensor4D_offset; entry [axis] is overwritten inside the loop below
+    {
+        get_global_id(0),
+        get_global_id(1),
+        get_global_id(2) % DEPTH_OUT, // global id 2 is split by DEPTH_OUT into the third ...
+        get_global_id(2) / DEPTH_OUT, // ... and fourth coordinates
+    };
+
+    DATA_TYPE min_value = *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])); // seed the running minimum with the element at this work-item's own coordinates
+    for(int i = 1; i < dim; ++i) // starts at 1: presumably the seed above is index 0 along `axis` - TODO confirm the work-item id along `axis` is always 0
+    {
+      indices[axis] = i; // step along the reduced axis; the other three coordinates stay fixed
+      min_value = min(min_value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3])));
+    }
+
+    *((__global DATA_TYPE *)out.ptr) = min_value; // one result element per work-item, stored at the output struct's current pointer
+}
+
 /** Perform reduce mean
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
index 5f994a6..b2ddbfd 100644 (file)
@@ -102,7 +102,10 @@ void CLReduceOperationKernel::configure(const ICLTensor *input, ICLTensor *outpu
   {
     kernel_name = "reduce_sum";
   }
-
+  else if (op == ReduceOperation::MIN)
+  {
+    kernel_name = "reduce_min";
+  }
   // Set kernel build options
   std::set<std::string> build_opts;
   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(output_info->data_type()));