From 991af617a89e32878ac1c69ade813a6c841e5fee Mon Sep 17 00:00:00 2001
From: =?utf8?q?Shubham=20Gupta/System=20SW=20/SRI-Bangalore/Engineer/?=
 =?utf8?q?=EC=82=BC=EC=84=B1=EC=A0=84=EC=9E=90?= <shub98.gupta@samsung.com>
Date: Fri, 19 Oct 2018 06:38:32 +0530
Subject: [PATCH] Add CL kernel to provide GPU support for SpaceToDepth op
 (#3134)

This patch will provide kernel to execute SpaceToDepth op on GPU

Signed-off-by: shubham <shub98.gupta@samsung.com>
---
 .../core/CL/kernels/CLSpaceToDepthKernel.h         |  58 ++++++++++
 .../runtime/CL/functions/CLSpaceToDepth.h          |  45 ++++++++
 libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp  |   5 +
 .../src/core/CL/cl_kernels/space_to_depth.cl       |  69 ++++++++++++
 .../src/core/CL/kernels/CLSpaceToDepthKernel.cpp   | 120 +++++++++++++++++++++
 .../src/runtime/CL/functions/CLSpaceToDepth.cpp    |  29 +++++
 6 files changed, 326 insertions(+)
 create mode 100644 libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
 create mode 100644 libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
 create mode 100644 libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
 create mode 100644 libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
 create mode 100644 libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp

diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
new file mode 100644
index 0000000..be845a5
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform the SpaceToDepth operation */
+class CLSpaceToDepthKernel : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLSpaceToDepthKernel();
+  /** Prevent instances of this class from being copy-constructed (the class holds pointers) */
+  CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
+  /** Prevent instances of this class from being copy-assigned (the class holds pointers) */
+  CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
+  /** Allow instances of this class to be move-constructed */
+  CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
+  /** Allow instances of this class to be move-assigned */
+  CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
+  /** Default destructor */
+  ~CLSpaceToDepthKernel() = default;
+  /** Initialise the kernel's input, output and block size.
+   * @param[in]  input      Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output     Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  block_size Edge length of the square spatial block folded into the depth axis.
+   */
+  void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input; /**< Source tensor */
+  ICLTensor *_output;      /**< Destination tensor */
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
new file mode 100644
index 0000000..f9ec13c
--- /dev/null
+++ b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
+#define __ARM_COMPUTE_CLSPACETODEPTH_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToDepthKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
+ * @note The function moves spatial blocks of the input into the depth dimension of the output.
+ */
+class CLSpaceToDepth : public ICLSimpleFunction
+{
+public:
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input      Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[out] output     Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
+   * @param[in]  block_size Block size, integer only.
+   */
+  void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
diff --git a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 0d7f1e1..9f2a3fc 100644
--- a/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -306,6 +306,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
     {"radixsort_reorder", "topkv2_radixsort.cl"},
     {"topkv2_quicksort", "topkv2_quicksort.cl"},
+    {"space_to_depth", "space_to_depth.cl"},
 };
 
 const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
@@ -383,6 +384,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/reduction_mean.clembed"
     },
     {
+        "space_to_depth.cl",
+#include "./cl_kernels/space_to_depth.clembed"
+    },
+    {
         "squared_difference.cl",
 #include "./cl_kernels/squared_difference.clembed"
     },
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
new file mode 100644
index 0000000..fb4f3e5
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
+/** Rearrange blocks of spatial data of a tensor into its depth dimension (space to depth)
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g. -DBLOCK_SIZE=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_depth(TENSOR4D_DECLARATION(input),
+                             TENSOR4D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    // Input coordinates; global id 2 carries channel and batch as (batch * DEPTH_IN + channel).
+    const int in_w = get_global_id(0);
+    const int in_h = get_global_id(1);
+    const int in_c = get_global_id(2) % DEPTH_IN;
+    const int in_b = get_global_id(2) / DEPTH_IN;
+
+    // Every BLOCK_SIZE x BLOCK_SIZE spatial block maps to one output position; the offset
+    // inside the block selects which group of DEPTH_IN output channels receives the value.
+    const int out_w = in_w / BLOCK_SIZE;
+    const int out_h = in_h / BLOCK_SIZE;
+    const int out_c = in_c + ((in_h % BLOCK_SIZE) * BLOCK_SIZE + in_w % BLOCK_SIZE) * DEPTH_IN;
+    const int out_b = in_b;
+
+    __global DATA_TYPE *dst = (__global DATA_TYPE *)tensor4D_offset(&out, out_w, out_h, out_c, out_b);
+    *dst = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
new file mode 100644
index 0000000..6602956
--- /dev/null
+++ b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <set>
+#include <string>
+
+using namespace arm_compute;
+
+namespace
+{
+/** Validate the SpaceToDepth arguments; returns an empty Status when every check passes.
+ *  ARM_COMPUTE_RETURN_ERROR_ON_MSG returns an error when its condition is TRUE, so each check
+ *  states the failure case. Rejecting block_size < 1 first keeps the modulo checks well defined. */
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+                          const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                DataType::S16, DataType::S32, DataType::F16,
+                                                DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
+                                  "Block size should be greater than or equal to 1.");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
+                                  "Input batch should be equal to Output batch");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+      input->dimension(2) * block_size * block_size != output->dimension(2),
+      "Output depth should be equal to (input depth * block size *block size)");
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(0) % block_size) != 0 ||
+                                      (input->dimension(1) % block_size) != 0,
+                                  "Input height and width should be divisible by block size");
+
+  return Status{};
+}
+
+} // namespace
+// Both tensor pointers start out null; configure() is the method that assigns them.
+CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
+
+// Validates the arguments, creates the "space_to_depth" CL kernel and sets the execution window.
+void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
+                                     const int32_t block_size)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  const ITensorInfo *in_info = input->info();
+  ITensorInfo *out_info = output->info();
+  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(in_info, out_info, block_size));
+
+  _input = input;
+  _output = output;
+
+  // Preprocessor definitions consumed by space_to_depth.cl
+  std::set<std::string> options;
+  options.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(in_info->data_type()));
+  options.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
+  options.emplace("-DDEPTH_IN=" + support::cpp11::to_string(in_info->dimension(2)));
+
+  // Create the kernel through the extended kernel library
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("space_to_depth", options));
+
+  // The window is derived from the input; the whole output shape is marked as valid.
+  Window max_window = calculate_max_window(*in_info, Steps());
+  Coordinates origin;
+  origin.set_num_dimensions(out_info->num_dimensions());
+  out_info->set_valid_region(ValidRegion(origin, out_info->tensor_shape()));
+  ICLKernel::configure(max_window);
+}
+
+// Enqueues the kernel once for every 4D slice of the execution window.
+void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+  Window in_slice = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+  // Output slice: every dimension is zeroed because the kernel computes output offsets itself.
+  const Window::Dimension zero_dim(0, 0, 0);
+  Window out_slice(in_slice);
+  out_slice.set(Window::DimX, zero_dim);
+  out_slice.set(Window::DimY, zero_dim);
+  out_slice.set(Window::DimZ, zero_dim);
+  out_slice.set(3, zero_dim);
+
+  do
+  {
+    unsigned int arg_idx = 0;
+    add_4D_tensor_argument(arg_idx, _input, in_slice);
+    add_4D_tensor_argument(arg_idx, _output, out_slice);
+    enqueue(queue, *this, in_slice);
+  } while (window.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
new file mode 100644
index 0000000..b0daf48
--- /dev/null
+++ b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
+{
+  auto kernel = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
+  kernel->configure(input, output, block_size); // validates arguments and creates the CL kernel
+  _kernel = std::move(kernel);                  // ownership moves into the inherited _kernel
+}
-- 
2.7.4