Support Layout::NHWC for CL Kernel of SpaceToBatchND (#3734)
author장지섭/동작제어Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>
Wed, 28 Nov 2018 01:33:45 +0000 (10:33 +0900)
committer이춘석/동작제어Lab(SR)/Staff Engineer/삼성전자 <chunseok.lee@samsung.com>
Wed, 28 Nov 2018 01:33:45 +0000 (10:33 +0900)
This commit supports Layout::NHWC for CL Kernel of SpaceToBatchND.

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp

index 31e4c86..68534f1 100644 (file)
@@ -41,10 +41,15 @@ public:
   ~CLSpaceToBatchNDKernel() = default;
   /** Initialise the kernel's input and output.
    *
+   * @note       The data layout of input and output must be the same.
+   * @note       The number of dimensions of input and output must be 4, and `spatial` dimensions
+   *             are height and width.
    * @param[in]  input         Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                           Data layout supported: NCHW/NHWC
    * @param[in]  block_size    Block size tensor. Data types supported: S32.
    * @param[in]  padding_size  Padding size tensor. Data types supported: S32.
    * @param[out]  output        Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                            Data layout supported: NCHW/NHWC
    */
   void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
                  ICLTensor *output);
index 4f42391..7e2df89 100644 (file)
@@ -34,7 +34,11 @@ class CLSpaceToBatchND : public ICLSimpleFunction
 public:
   /** Initialise the kernel's input and output.
    *
+   * @note       The data layout of input and output must be the same.
+   * @note       The number of dimensions of input and output must be 4, and `spatial` dimensions
+   *             are height and width.
    * @param[in]  input          Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   *                            Data layout supported: NCHW/NHWC
    * @param[in]  block_size     Tensor of integer values specifying block sizes for spatial
    * dimension.
    *                            Data types supported: S32
@@ -42,6 +46,7 @@ public:
    * dimension.
    *                            Data types supported: S32
    * @param[out] output         Output tensor. Data types supported: same as @p input.
+   *                            Data layout supported: NCHW/NHWC
    */
   void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
                  ICLTensor *output);
index cbe5eff..844a3f1 100644 (file)
@@ -314,6 +314,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"radixsort_reorder", "topkv2_radixsort.cl"},
     {"topkv2_quicksort", "topkv2_quicksort.cl"},
     {"space_to_batch_4d_nchw", "space_to_batch.cl"},
+    {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
     {"space_to_depth", "space_to_depth.cl"},
 };
 
index 4460677..449c8f5 100644 (file)
@@ -86,3 +86,78 @@ __kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
     }
 }
 #endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+
+#if defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
+/** Perform space to batch with input of 4D and NHWC format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor height should be given as a preprocessor argument using -DHEIGHT_OUT=size. e.g. -DHEIGHT_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_stride_w                              Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
+ * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
+ * @param[in]  block_size_ptr                              Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_size_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  padding_size_ptr                            Pointer to the source tensor. Supported data types: S32
+ * @param[in]  padding_size_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  padding_size_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ */
+__kernel void space_to_batch_4d_nhwc(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, HEIGHT_OUT);
+
+    int block_size_x = *((__global int *)(block_size_ptr + block_size_stride_x));
+    int block_size_y = *((__global int *)block_size_ptr);
+    int shift_x = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / HEIGHT_OUT / BATCH_IN) / block_size_x;
+
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * VEC_SIZE;
+    in_index[1] = get_global_id(1) * block_size_x + shift_x - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[2] = get_global_id(2) % HEIGHT_OUT * block_size_y + shift_y - *((__global int *)(padding_size_ptr));
+    in_index[3] = (get_global_id(2) / HEIGHT_OUT) % BATCH_IN;
+
+    if (in_index[1] < 0 || in_index[1] >= WIDTH_IN || in_index[2] < 0 || in_index[2] >= HEIGHT_IN)
+    {
+        VSTORE(VEC_SIZE)((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))ZERO_VALUE, 0, (__global DATA_TYPE *)out.ptr);
+    }
+    else
+    {
+        VSTORE(VEC_SIZE)(CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3])),
+                                 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)),
+                         0, (__global DATA_TYPE *)out.ptr);
+    }
+}
+
+#endif // defined(DATA_TYPE) && defined(HEIGHT_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE) && defined(VEC_SIZE)
index 03f7959..282dda3 100644 (file)
@@ -24,6 +24,8 @@ using namespace arm_compute;
 
 namespace
 {
+constexpr unsigned int num_elems_processed_per_iteration = 16;
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
                           const ITensorInfo *padding_size, const ITensorInfo *output)
 {
@@ -51,10 +53,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_siz
                                         padding_size->dimension(1) != 2,
                                     "Only 2-dimensional spatial block's size was wrong");
   }
+  else if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NHWC)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(0) != output->dimension(0),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial block's size was wrong");
+  }
   else
   {
-    ARM_COMPUTE_RETURN_ERROR_MSG(
-        "CLSpaceToBatchNDKernel supports only 4-dimensional input and DataLayout::NCHW");
+    ARM_COMPUTE_RETURN_ERROR_MSG("CLSpaceToBatchNDKernel supports only 4-dimensional input");
   }
 
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
@@ -88,14 +97,51 @@ void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *
 
   // Set kernel build options
   // TODO Support other cases
-  std::string kernel_name = "space_to_batch_4d_nchw";
+  std::string kernel_name = "space_to_batch_4d";
   std::set<std::string> build_opts;
+  Window win;
+
+  if (input->info()->data_layout() == DataLayout::NCHW)
+  {
+    kernel_name += "_nchw";
+    build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+
+    win = calculate_max_window(*output->info(), Steps());
+
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+  }
+  else if (input->info()->data_layout() == DataLayout::NHWC)
+  {
+    kernel_name += "_nhwc";
+    build_opts.emplace("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+    build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.emplace("-DVEC_SIZE=" +
+                       support::cpp11::to_string(num_elems_processed_per_iteration));
+
+    win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+    input_access.set_valid_region(win, output->info()->valid_region());
+
+    if (window_changed)
+    {
+      ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
+    }
+  }
+  else
+  {
+    ARM_COMPUTE_ERROR("Unsupported layout");
+  }
 
   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
   build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
-  build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
-  build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
   if (input->info()->data_type() == DataType::QASYMM8)
   {
     build_opts.emplace("-DZERO_VALUE=" +
@@ -111,12 +157,6 @@ void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *
       static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
 
   // Configure kernel window
-  Window win = calculate_max_window(*output->info(), Steps());
-
-  Coordinates coord;
-  coord.set_num_dimensions(output->info()->num_dimensions());
-  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
   ICLKernel::configure(win);
 }