Support Layout::NHWC for CL Kernel of SpaceToBatchND (#3734)
author장지섭/동작제어Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>
Wed, 28 Nov 2018 01:33:45 +0000 (10:33 +0900)
committer이춘석/동작제어Lab(SR)/Staff Engineer/삼성전자 <chunseok.lee@samsung.com>
Wed, 28 Nov 2018 01:33:45 +0000 (10:33 +0900)
This commit supports Layout::NHWC for CL Kernel of SpaceToBatchND.

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp

index 31e4c86..68534f1 100644 (file)
@@ -41,10 +41,15 @@ public:
   ~CLSpaceToBatchNDKernel() = default;
   /** Initialise the kernel's input and output.
    *
+   * @note       The data layout of input and output must be the same.
+   * @note       The number of dimensions of input and output must be 4, and `spatial` dimensions
+   *             are height and width.
    * @param[in]  input         Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                           Data layout supported: NCHW/NHWC
    * @param[in]  block_size    Block size tensor. Data types supported: S32.
    * @param[in]  padding_size  Padding size tensor. Data types supported: S32.
    * @param[out]  output        Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                            Data layout supported: NCHW/NHWC
    */
   void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
                  ICLTensor *output);
index 4f42391..7e2df89 100644 (file)
@@ -34,7 +34,11 @@ class CLSpaceToBatchND : public ICLSimpleFunction
 public:
   /** Initialise the kernel's input and output.
    *
+   * @note       The data layout of input and output must be the same.
+   * @note       The number of dimensions of input and output must be 4, and `spatial` dimensions
+   *             are height and width.
    * @param[in]  input          Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                            Data layout supported: NCHW/NHWC
    * @param[in]  block_size     Tensor of integer values specifying block sizes for spatial
    * dimension.
    *                            Data types supported: S32
@@ -42,6 +46,7 @@ public:
    * dimension.
    *                            Data types supported: S32
    * @param[out] output         Output tensor. Data types supported: same as @p input.
+   *                            Data layout supported: NCHW/NHWC
    */
   void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
                  ICLTensor *output);
index cbe5eff..844a3f1 100644 (file)
@@ -314,6 +314,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"radixsort_reorder", "topkv2_radixsort.cl"},
     {"topkv2_quicksort", "topkv2_quicksort.cl"},
     {"space_to_batch_4d_nchw", "space_to_batch.cl"},
+    {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
     {"space_to_depth", "space_to_depth.cl"},
 };
 
index 4460677..449c8f5 100644 (file)
@@ -86,3 +86,78 @@ __kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
     }
 }
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_stride_w                              Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
+ * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
+ * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
+ * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+    int block_size_x = *((__global int *)(block_size_ptr + block_size_stride_x));
+    int block_size_y = *((__global int *)block_size_ptr);
+    int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * VEC_SIZE;
+    in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr));
+    in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+    if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+    {
+        VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+    }
+    else
+    {
+        VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
+                                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+                         0, (__global DATA_TYPE *)out.ptr);
+    }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
index 03f7959..282dda3 100644 (file)
@@ -24,6 +24,8 @@ using namespace arm_compute;
 
 namespace
 {
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
                           const ITensorInfo *padding_size, const ITensorInfo *output)
 {
@@ -51,10 +53,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_siz
                                         padding_size->dimension(1) != 2,
                                     "Only 2-dimensional spatial block's size was wrong");
   }
+  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial block's size was wrong");
+  }
   else
   {
-    ARM_COMPUTE_RETURN_ERROR_MSG(
-        "CLSpaceToBatchNDKernel supports only 4-dimensional input and DataLayout::NCHW");
+    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
   }
 
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
@@ -88,14 +97,51 @@ void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *
 
   // Set kernel build options
   // TODO Support other cases
-  std::string kernel_name = "space_to_batch_4d_nchw";
+  std::string kernel_name = "space_to_batch_4d";
   std::set<std::string> build_opts;
+  Window win;
+
+  if (input->info()->data_layout() == DataLayout::NCHW)
+  {
+    kernel_name += "_nchw";
+    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+    win = calculate_max_window(*output->info(), Steps());
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+  }
+  else if (input->info()->data_layout() == DataLayout::NHWC)
+  {
+    kernel_name += "_nhwc";
+    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DVEC_SIZE=" +
+                       support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    input_access.set_valid_region(win, output->info()->valid_region());
+
+    if (window_changed)
+    {
+      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported layout");
+  }
 
   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
   build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
-  build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
-  build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
   if (input->info()->data_type() == DataType::QASYMM8)
   {
     build_opts.emplace("-DZERO_VALUE=" +
@@ -111,12 +157,6 @@ void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 
   // Configure kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
   ICLKernel::configure(win);
 }