Implement CL kernel of SpaceToBatchND for NCHW (#3633)
author장지섭/동작제어Lab(SR)/Engineer/삼성전자 <jiseob.jang@samsung.com>
Tue, 27 Nov 2018 10:33:46 +0000 (19:33 +0900)
committer오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Tue, 27 Nov 2018 10:33:46 +0000 (19:33 +0900)
This commit implements CL kernel of SpaceToBatchND for NCHW.

Signed-off-by: jiseob.jang <jiseob.jang@samsung.com>
libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h [new file with mode: 0644]
libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h [new file with mode: 0644]
libs/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl [new file with mode: 0644]
libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp [new file with mode: 0644]
libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp [new file with mode: 0644]

diff --git a/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h b/libs/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h
new file mode 100644 (file)
index 0000000..31e4c86
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** OpenCL kernel to perform SPACE_TO_BATCH_ND operation.
+ *
+ * Rearranges blocks of spatial data into the batch dimension; the actual
+ * computation is done by the "space_to_batch_4d_nchw" CL kernel.
+ */
+class CLSpaceToBatchNDKernel final : public ICLKernel
+{
+public:
+  /** Default constructor */
+  CLSpaceToBatchNDKernel();
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToBatchNDKernel(const CLSpaceToBatchNDKernel &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers) */
+  CLSpaceToBatchNDKernel &operator=(const CLSpaceToBatchNDKernel &) = delete;
+  /** Allow instances of this class to be moved */
+  CLSpaceToBatchNDKernel(CLSpaceToBatchNDKernel &&) = default;
+  /** Allow instances of this class to be moved */
+  CLSpaceToBatchNDKernel &operator=(CLSpaceToBatchNDKernel &&) = default;
+  /** Default destructor */
+  ~CLSpaceToBatchNDKernel() = default;
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input        Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   * @param[in]  block_size   Block size tensor. Data types supported: S32.
+   * @param[in]  padding_size Padding size tensor. Data types supported: S32.
+   * @param[out] output       Output tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   */
+  void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+                 ICLTensor *output);
+
+  // Inherited methods overridden:
+  void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+  const ICLTensor *_input;        /**< Source tensor */
+  const ICLTensor *_block_size;   /**< Block size tensor (S32), set by configure() */
+  const ICLTensor *_padding_size; /**< Padding size tensor (S32), set by configure() */
+  ICLTensor *_output;             /**< Destination tensor */
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_KERNEL_H__ */
diff --git a/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h b/libs/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToBatchND.h
new file mode 100644 (file)
index 0000000..4f42391
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+#define __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__
+
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to run @ref CLSpaceToBatchNDKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/S32/F32.
+ * @note The function divides "spatial" dimensions of the input into a grid of blocks of shape
+ * block_shape, and interleaves these blocks with the "batch" dimension, so that the output's
+ * batch dimension is the input batch multiplied by the product of the block sizes.
+ */
+class CLSpaceToBatchND : public ICLSimpleFunction
+{
+public:
+  /** Initialise the kernel's input and output.
+   *
+   * @param[in]  input          Input tensor. Data types supported: U8/QASYMM8/S16/F16/S32/F32.
+   * @param[in]  block_size     Tensor of integer values specifying block sizes for spatial
+   * dimension.
+   *                            Data types supported: S32
+   * @param[in]  padding_size   Tensor of integer values specifying padding sizes for spatial
+   * dimension.
+   *                            Data types supported: S32
+   * @param[out] output         Output tensor. Data types supported: same as @p input.
+   */
+  void configure(const ICLTensor *input, const ICLTensor *block_size, const ICLTensor *padding_size,
+                 ICLTensor *output);
+};
+
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLSPACE_TO_BATCH_ND_H__ */
index c2cbe5e..cbe5eff 100644 (file)
@@ -313,6 +313,7 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
     {"radixsort_pastehistograms", "topkv2_radixsort.cl"},
     {"radixsort_reorder", "topkv2_radixsort.cl"},
     {"topkv2_quicksort", "topkv2_quicksort.cl"},
+    {"space_to_batch_4d_nchw", "space_to_batch.cl"},
     {"space_to_depth", "space_to_depth.cl"},
 };
 
@@ -395,6 +396,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
 #include "./cl_kernels/reduce_operation.clembed"
     },
     {
+        "space_to_batch.cl",
+#include "./cl_kernels/space_to_batch.clembed"
+    },
+    {
         "space_to_depth.cl",
 #include "./cl_kernels/space_to_depth.clembed"
     },
diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/space_to_batch.cl
new file mode 100644 (file)
index 0000000..4460677
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
+/** Perform space to batch with input of 4D and NCHW format
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @attention Input tensor batch should be given as a preprocessor argument using -DBATCH_IN=size. e.g. -DBATCH_IN=16
+ * @attention Input tensor height should be given as a preprocessor argument using -DHEIGHT_IN=size. e.g. -DHEIGHT_IN=16
+ * @attention Input tensor width should be given as a preprocessor argument using -DWIDTH_IN=size. e.g. -DWIDTH_IN=16
+ * @attention The value to be set by pad value using -DZERO_VALUE=value. e.g. -DZERO_VALUE=0
+ *
+ * @param[in]  input_ptr                                   Pointer to the source tensor. Supported data types: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in]  input_stride_x                              Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                                input_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  input_stride_y                              Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                                input_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  input_stride_z                              Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                                input_stride_z * number of elements along Z processed per workitem(in  bytes)
+ * @param[in]  input_stride_w                              Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                                input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes         The offset of the first element in the source tensor
+ * @param[out] output_ptr                                  Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                             Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                               output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                             Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                               output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                             Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                               output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                             Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                               output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes        The offset of the first element in the destination tensor
+ * @param[in]  block_size_ptr                              Pointer to the block size vector. Supported data types: S32
+ * @param[in]  block_size_stride_x                         Stride of the block size vector in X dimension (in bytes)
+ * @param[in]  block_size_step_x                           block_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  block_size_offset_first_element_in_bytes    The offset of the first element in the block size vector
+ * @param[in]  padding_size_ptr                            Pointer to the padding size tensor. Supported data types: S32
+ * @param[in]  padding_size_stride_x                       Stride of the padding size tensor in X dimension (in bytes)
+ * @param[in]  padding_size_step_x                         padding_size_stride_x * number of elements along X processed per workitem(in  bytes)
+ * @param[in]  padding_size_stride_y                       Stride of the padding size tensor in Y dimension (in bytes)
+ * @param[in]  padding_size_step_y                         padding_size_stride_y * number of elements along Y processed per workitem(in  bytes)
+ * @param[in]  padding_size_offset_first_element_in_bytes  The offset of the first element in the padding size tensor
+ */
+__kernel void space_to_batch_4d_nchw(TENSOR4D_DECLARATION(input),
+                                     TENSOR4D_DECLARATION(output),
+                                     VECTOR_DECLARATION(block_size),
+                                     IMAGE_DECLARATION(padding_size))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
+
+    // block_size holds {y, x}: element 0 is the vertical block size, element 1 the horizontal one.
+    int block_size_x = *((__global int *)(block_size_ptr + block_size_stride_x));
+    int block_size_y = *((__global int *)block_size_ptr);
+    // The output Z axis packs depth * batch * block_y * block_x; decode the block offsets.
+    int shift_x = (get_global_id(2) / DEPTH_OUT / BATCH_IN) % block_size_x;
+    int shift_y = (get_global_id(2) / DEPTH_OUT / BATCH_IN) / block_size_x;
+
+    // Map this output element back to its input coordinate (x, y, depth, batch).
+    int in_index[4] = {0, };
+    in_index[0] = get_global_id(0) * block_size_x + shift_x - *((__global int *)(padding_size_ptr + padding_size_stride_y));
+    in_index[1] = get_global_id(1) * block_size_y + shift_y - *((__global int *)(padding_size_ptr));
+    in_index[2] = get_global_id(2) % DEPTH_OUT;
+    in_index[3] = (get_global_id(2) / DEPTH_OUT) % BATCH_IN;
+
+    // Coordinates that fall inside the padded border read the pad value instead of the input.
+    if (in_index[0] < 0 || in_index[0] >= WIDTH_IN || in_index[1] < 0 || in_index[1] >= HEIGHT_IN)
+    {
+        *((__global DATA_TYPE *)out.ptr) = (DATA_TYPE)ZERO_VALUE;
+    }
+    else
+    {
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_index[0], in_index[1], in_index[2], in_index[3]));
+    }
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BATCH_IN) && defined(HEIGHT_IN) && defined(WIDTH_IN) && defined(ZERO_VALUE)
diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLSpaceToBatchNDKernel.cpp
new file mode 100644 (file)
index 0000000..03f7959
--- /dev/null
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_size,
+                          const ITensorInfo *padding_size, const ITensorInfo *output)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(padding_size, 1, DataType::S32);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
+                                                       DataType::S16, DataType::F16, DataType::S32,
+                                                       DataType::F32);
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != output->num_dimensions(),
+                                  "The number of dimensions of input should be equal to output");
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != output->data_layout(),
+                                  "The input and output layouts are different!");
+
+  // TODO Support other cases
+  if (input->num_dimensions() == 4 && input->data_layout() == DataLayout::NCHW)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != output->dimension(2),
+                                    "Input Depth should be equal to Output Depth");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size->dimension(0) != 2 ||
+                                        padding_size->dimension(1) != 2,
+                                    "Only 2-dimensional spatial block's size was wrong");
+  }
+  else
+  {
+    ARM_COMPUTE_RETURN_ERROR_MSG(
+        "CLSpaceToBatchNDKernel supports only 4-dimensional input and DataLayout::NCHW");
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 2 && input->num_dimensions() > 4,
+                                  "CLSpaceToBatchNDKernel supports dimensions up to 4");
+
+  if (input->data_type() == DataType::QASYMM8)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->quantization_info() != output->quantization_info(),
+                                    "The input and output quantization info are different!");
+  }
+
+  return Status{};
+}
+
+} // namespace
+
+CLSpaceToBatchNDKernel::CLSpaceToBatchNDKernel() : _input(nullptr), _output(nullptr) {}
+
+void CLSpaceToBatchNDKernel::configure(const ICLTensor *input, const ICLTensor *block_size,
+                                       const ICLTensor *padding_size, ICLTensor *output)
+{
+
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+  ARM_COMPUTE_ERROR_THROW_ON(
+      validate_arguments(input->info(), block_size->info(), padding_size->info(), output->info()));
+
+  _input = input;
+  _block_size = block_size;
+  _padding_size = padding_size;
+  _output = output;
+
+  // Set kernel build options
+  // TODO Support other cases
+  std::string kernel_name = "space_to_batch_4d_nchw";
+  std::set<std::string> build_opts;
+
+  build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+  build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output->info()->dimension(2)));
+  build_opts.emplace("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(3)));
+  build_opts.emplace("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(1)));
+  build_opts.emplace("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(0)));
+  if (input->info()->data_type() == DataType::QASYMM8)
+  {
+    build_opts.emplace("-DZERO_VALUE=" +
+                       support::cpp11::to_string(input->info()->quantization_info().offset));
+  }
+  else
+  {
+    build_opts.emplace("-DZERO_VALUE=" + support::cpp11::to_string(0));
+  }
+
+  // Create kernel
+  _kernel =
+      static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
+
+  // Configure kernel window
+  Window win = calculate_max_window(*output->info(), Steps());
+
+  Coordinates coord;
+  coord.set_num_dimensions(output->info()->num_dimensions());
+  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+
+  ICLKernel::configure(win);
+}
+
+void CLSpaceToBatchNDKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+#if defined(DEBUG)
+  const_cast<ICLTensor *>(_block_size)->map(queue);
+  const_cast<ICLTensor *>(_padding_size)->map(queue);
+
+  const size_t num_dimensions = _input->info()->num_dimensions();
+  const size_t num_spacial_dimensions = _block_size->info()->dimension(0);
+  int32_t batch_size = _input->info()->dimension(num_dimensions - 1);
+  for (size_t i = 0; i < num_spacial_dimensions; ++i)
+  {
+    const int32_t block_size = *reinterpret_cast<int32_t *>(_block_size->ptr_to_element({i}));
+    const int32_t padding_size_pre =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({0, i}));
+    const int32_t padding_size_post =
+        *reinterpret_cast<int32_t *>(_padding_size->ptr_to_element({1, i}));
+
+    ARM_COMPUTE_ERROR_ON_MSG(block_size < 1, "Block size should be greater than or equal to 1");
+    ARM_COMPUTE_ERROR_ON_MSG(padding_size_pre < 0 && padding_size_post < 0,
+                             "Padding size should be greater than or equal to 0");
+
+    if (num_dimensions == 4 && _input->info()->data_layout() == DataLayout::NCHW)
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(1 - i) !=
+              (_input->info()->dimension(1 - i) + padding_size_pre + padding_size_post) /
+                  block_size,
+          "Dimension value of spatial block does not match output's dimension value");
+    }
+    else
+    {
+      ARM_COMPUTE_ERROR_ON_MSG(
+          _output->info()->dimension(num_dimensions - num_spacial_dimensions - i) !=
+              (_input->info()->dimension(num_dimensions - num_dimensions - i) + padding_size_pre +
+               padding_size_post) /
+                  block_size,
+          "Dimension value of spatial block does not match output's dimension value");
+    }
+
+    batch_size *= block_size;
+  }
+  ARM_COMPUTE_ERROR_ON_MSG(
+      _output->info()->dimension(num_dimensions - 1) != batch_size,
+      "Output batch size should be equal to input batch size * (multiplication of all block size)");
+
+  const_cast<ICLTensor *>(_block_size)->unmap(queue);
+  const_cast<ICLTensor *>(_padding_size)->unmap(queue);
+#endif // defined(DEBUG)
+
+  Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+  // Setup output slice
+  Window slice_in(slice_out);
+  slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+  slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+  slice_in.set(3, Window::Dimension(0, 0, 0));
+
+  // Set block size window
+  Window win_block = calculate_max_window(*_block_size->info(), Steps());
+
+  // Set padding size window
+  Window win_padding = calculate_max_window(*_padding_size->info(), Steps());
+
+  do
+  {
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, _input, slice_in);
+    add_4D_tensor_argument(idx, _output, slice_out);
+    add_1D_tensor_argument(idx, _block_size, win_block);
+    add_2D_tensor_argument(idx, _padding_size, win_padding);
+    enqueue(queue, *this, slice_out);
+  } while (window.slide_window_slice_4D(slice_out) && window.slide_window_slice_4D(slice_in));
+}
diff --git a/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp b/libs/ARMComputeEx/src/runtime/CL/functions/CLSpaceToBatchND.cpp
new file mode 100644 (file)
index 0000000..c038268
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2016-2018 ARM Limited.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "arm_compute/runtime/CL/functions/CLSpaceToBatchND.h"
+
+#include "arm_compute/core/CL/kernels/CLSpaceToBatchNDKernel.h"
+
+using namespace arm_compute;
+
+void CLSpaceToBatchND::configure(const ICLTensor *input, const ICLTensor *block_size,
+                                 const ICLTensor *padding_size, ICLTensor *output)
+{
+  // Build and configure the underlying kernel, then hand ownership to the
+  // ICLSimpleFunction base class, which runs it.
+  auto kernel = arm_compute::support::cpp14::make_unique<CLSpaceToBatchNDKernel>();
+  kernel->configure(input, block_size, padding_size, output);
+  _kernel = std::move(kernel);
+}