From 1ab35335135723717ddb064d897cc57b28457dc6 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EC=9E=A5=EC=A7=80=EC=84=AD/On-Device=20Lab=28SR=29/Enginee?=
 =?utf8?q?r/=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= <jiseob.jang@samsung.com>
Date: Mon, 11 Feb 2019 11:18:50 +0900
Subject: [PATCH] Support NHWC to SpaceToDepthKernel (#4369)

This commit supports NHWC to SpaceToDepthKernel

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
---
 libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp  |  3 +-
 .../src/core/CL/cl_kernels/space_to_depth.cl       | 66 ++++++++++++++++++++--
 .../src/core/CL/kernels/CLSpaceToDepthKernel.cpp   | 31 ++++++----
 3 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index c5ab86b..1bc08a2 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -76,7 +76,8 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"topkv2_quicksort", "topkv2_quicksort.cl"},
     {"space_to_batch_4d_nchw", "space_to_batch.cl"},
     {"space_to_batch_4d_nhwc", "space_to_batch.cl"},
-    {"space_to_depth", "space_to_depth.cl"},
+    {"space_to_depth_nchw", "space_to_depth.cl"},
+    {"space_to_depth_nhwc", "space_to_depth.cl"},
 };
 
 const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
index f697704..20eeb38 100644
--- a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -16,11 +16,12 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
 /** Perform space to depth rearrangement of tensor
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
  * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16
  * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
@@ -44,11 +45,11 @@ bytes)
  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void space_to_depth(
+__kernel void space_to_depth_nchw(
      TENSOR4D_DECLARATION(input),
      TENSOR4D_DECLARATION(output))
  {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 
     int out_index[4]={0};
@@ -56,8 +57,8 @@ __kernel void space_to_depth(
 
     in_index[0] = get_global_id(0);//W
     in_index[1] = get_global_id(1);//H
-    in_index[2] = get_global_id(2) % DEPTH_IN;//C
-    in_index[3] = get_global_id(2) / DEPTH_IN;//B
+    in_index[2] = get_global_id(2) % Z_IN;//C
+    in_index[3] = get_global_id(2) / Z_IN;//B
 
     out_index[0] = in_index[0]/BLOCK_SIZE;
     out_index[1] = in_index[1]/BLOCK_SIZE;
@@ -66,4 +67,57 @@ __kernel void space_to_depth(
 
     *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) =                                                                                                 *((__global DATA_TYPE *)in.ptr);
  }
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
+#endif // defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+
+#if defined(DATA_TYPE) && defined(Z_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
+/** Perform space to depth rearrangement of tensor
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention The value of the z-axis of input tensor depth should be given as a preprocessor argument using -DZ_IN=size. e.g. -DZ_IN=16
+ * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p inpu
+t_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in
+bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_depth_nhwc(
+     TENSOR4D_DECLARATION(input),
+     TENSOR4D_DECLARATION(output))
+ {
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    int out_index[4]={0};
+    int in_index[4]={0};
+
+    in_index[0] = get_global_id(0);//C
+    in_index[1] = get_global_id(1);//W
+    in_index[2] = get_global_id(2) % Z_IN;//H
+    in_index[3] = get_global_id(2) / Z_IN;//B
+
+    out_index[0] = in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
+    out_index[1] = in_index[1]/BLOCK_SIZE;
+    out_index[2] = in_index[2]/BLOCK_SIZE;
+    out_index[3] = in_index[3];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0],out_index[1],out_index[2],out_index[3])) =                                                                                                 *((__global DATA_TYPE *)in.ptr);
+ }
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
index b803366..b085192 100644
--- a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -39,18 +39,25 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
                                   "Input batch should be equal to Output batch");
 
+  auto layout_out = input->data_layout();
+  ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
+
+  auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+  auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
+  auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
   ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-      input->dimension(2) * block_size * block_size != output->dimension(2),
+      input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
       "Output depth should be equal to (input depth * block size *block size)");
 
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) ||
-                                      (input->dimension(1) % block_size),
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
+                                      (input->dimension(index_height) % block_size),
                                   "Input height and width should be divisible by block size");
 
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG((output->dimension(0) != (input->dimension(0) / block_size)) ||
-                                      (output->dimension(1) != (input->dimension(1) / block_size)),
-                                  "Output height and width should be equal to "
-                                  "input_height/blocksize and input_width/blocksize respectively");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
+          (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
+      "Output height and width should be equal to "
+      "input_height/blocksize and input_width/blocksize respectively");
 
   return Status{};
 }
@@ -70,14 +77,18 @@ void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
   _output = output;
 
   // Set kernel build options
+  auto layout_out = input->info()->data_layout();
   std::set<std::string> build_opts;
   build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
   build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
-  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+  auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
+  auto depth = input->info()->dimension(index_depth);
+  build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
+  build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
 
   // Create kernel
-  _kernel =
-      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", build_opts));
+  _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
+      "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
 
   // Configure  kernel window
   Window win = calculate_max_window(*input->info(), Steps());
-- 
2.7.4