From: 윤지영/On-Device Lab(SR)/Engineer/삼성전자 Date: Tue, 29 Jan 2019 07:19:04 +0000 (+0900) Subject: Support multiple indices up to 4D output ranks in Gather (#4339) X-Git-Tag: submit/tizen/20190325.013700~296 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=81e316dc724b7e0606e4fd588a26a537fc062c08;p=platform%2Fcore%2Fml%2Fnnfw.git Support multiple indices up to 4D output ranks in Gather (#4339) This commit supports the following test cases: 2D input + 2D indices => 3D output 2D input + 3D indices => 4D output 3D input + 2D indices => 4D output Signed-off-by: Jiyoung Yun --- diff --git a/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h b/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h new file mode 100644 index 0000000..0230fa1 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file        GenericGather.h + * @brief       This file contains GenericGather class + * @ingroup     COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ + +#include +#include + +#include +#include + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Gather with both CPU and GPU + */ +class GenericGather : public arm_compute::IFunction +{ +public: + GenericGather(void) + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0), _cl_permuted{}, _cl_permute{}, _cl_gather{} + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] indices The indices tensor + * @param[in] output The destination tensor + * @param[in] axis (Optional) The axis in input to gather indices from + * @return N/A + */ + void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis = 0); + +public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ + void run(void) override; + +private: + arm_compute::ITensor *_input; + arm_compute::ITensor *_indices; + arm_compute::ITensor *_output; + int _axis; + arm_compute::CLTensor _cl_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLGather _cl_gather; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl index 9f616e4..05560e8 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl @@ -16,7 +16,7 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(AXIS) +#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) /** Performs the Gather operation along the chosen axis * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short @@ -35,10 +35,14 @@ * @param[in] input_stride_w Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes) * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor - * @param[in] indices_ptr Pointer to the indices vector. Supported data types: U32. - * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes) - * @param[in] indices_step_x input_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector + * @param[in] indices_ptr Pointer to the source tensor. 
Supported data types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes) @@ -52,7 +56,7 @@ */ __kernel void gather( TENSOR4D_DECLARATION(input), - VECTOR_DECLARATION(indices), + TENSOR3D_DECLARATION(indices), TENSOR4D_DECLARATION(output)) { const int px = get_global_id(0); @@ -61,24 +65,47 @@ __kernel void gather( const int pw = get_global_id(2) / OUTPUT_DIM_Z; const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z); - const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices); + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); #if AXIS == 0 - const uint index = *(__global const uint *)vector_offset(&indices, px); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0); + __global const uchar 
*input_addr = tensor4D_offset(&input, index, pz, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); + __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0); +#endif #elif AXIS == 1 - const uint index = *(__global const uint *)vector_offset(&indices, py); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0); +#endif #elif AXIS == 2 - const uint index = *(__global const uint *)vector_offset(&indices, pz); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0); +#endif #elif AXIS == 3 - const uint index = *(__global const uint *)vector_offset(&indices, pw); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index); +#endif #endif //AXIS *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr); } -#endif //defined(DATA_TYPE) && defined(AXIS) +#endif //defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp 
index aa8a75f..24e3dc9 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp @@ -29,13 +29,36 @@ namespace inline TensorShape compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis) { - ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4); ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions()); TensorShape output_shape = input_shape; - output_shape[actual_axis] = indices_shape[0]; - + if (indices_shape.num_dimensions() == 1) + { + output_shape[actual_axis] = indices_shape[0]; + } + else if (indices_shape.num_dimensions() > 1) + { + output_shape.shift_right(indices_shape.num_dimensions() - 1); + + for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i) + { + if (o == actual_axis) + { + ++i; + for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o) + { + output_shape[o] = indices_shape[in]; + } + } + else + { + output_shape[o] = input_shape[i]; + } + } + } return output_shape; } @@ -52,8 +75,9 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in const ITensorInfo *output, int axis) { const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); - ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( @@ -79,10 +103,11 @@ std::pair 
validate_and_configure_window(ITensorInfo *input, ITen { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); const uint32_t actual_axis = wrap_around(axis, static_cast(input->num_dimensions())); + std::unique_ptr output_info = input->clone(); + output_info->set_tensor_shape( + compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis)); // Output auto initialization if not yet initialized - TensorShape output_shape = - compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); - auto_init_if_empty((*output), output_shape, 1, input->data_type()); + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); // Create window Window win = calculate_max_window(*output, Steps()); @@ -119,6 +144,8 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); // Create kernel _kernel = static_cast( @@ -142,10 +169,10 @@ void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); unsigned int idx = 0; add_4D_tensor_argument(idx, _input, window_collapsed); - add_1D_tensor_argument(idx, _indices, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); add_4D_tensor_argument(idx, _output, window_collapsed); enqueue(queue, *this, window_collapsed); } diff --git a/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp 
b/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp new file mode 100644 index 0000000..a3e6cce --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/misc/functions/GenericGather.h" + +namespace arm_compute +{ +namespace misc +{ + +bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() != 4 && output->num_dimensions() == 4); +} + +void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis) +{ + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + arm_compute::PermutationVector pv; + if (shouldPermute(input->info(), output->info())) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H / + // C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + if (shouldPermute(input->info(), output->info())) + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis); + _cl_permute.configure(&_cl_permuted, 
CAST_CL(output), pv); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_permuted.allocator()->allocate(); + } + else + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +void GenericGather::run(void) +{ + if (utils::isGpuMode()) + { + _cl_gather.run(); + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc index 36d9c5e..ba4964e 100644 --- a/runtimes/pure_arm_compute/src/compilation.cc +++ b/runtimes/pure_arm_compute/src/compilation.cc @@ -113,6 +113,7 @@ #include #include #include +#include #include "misc/matrix/IndexIterator.h" #include "misc/kernel/IndexIterator.h" @@ -2975,9 +2976,11 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node) const auto ifm_shape = _ctx.at(ifm_index).shape(); const auto indices_shape = _ctx.at(indices_index).shape(); const auto axis_shape = _ctx.at(axis_index).shape(); + const auto ofm_shape = _ctx.at(ofm_index).shape(); assert(ifm_shape.rank() <= 4); - assert(indices_shape.rank() <= 1); + assert(indices_shape.rank() <= 3); + assert(ofm_shape.rank() <= 4); assert(_ctx.at(axis_index).hasData()); assert(axis_shape.rank() == 0); @@ -3025,7 +3028,7 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node) { std::unique_ptr<::arm_compute::IFunction> fn; - auto l = nnfw::cpp14::make_unique<::arm_compute::CLGather>(); + auto l = nnfw::cpp14::make_unique(); l->configure(CAST_CL(ifm_alloc), CAST_CL(indices_alloc), CAST_CL(ofm_alloc), param.axis); fn = std::move(l); builder.append("Gather", std::move(fn));