Support multiple indices up to 4D output ranks in Gather (#4339)
author윤지영/On-Device Lab(SR)/Engineer/삼성전자 <jy910.yun@samsung.com>
Tue, 29 Jan 2019 07:19:04 +0000 (16:19 +0900)
committer오형석/On-Device Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Tue, 29 Jan 2019 07:19:04 +0000 (16:19 +0900)
This commit supports below test cases:
2D input + 2D indices => 3D output
2D input + 3D indices => 4D output
3D input + 2D indices => 4D output

Signed-off-by: Jiyoung Yun <jy910.yun@samsung.com>
libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h [new file with mode: 0644]
libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl
libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp
libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp [new file with mode: 0644]
runtimes/pure_arm_compute/src/compilation.cc

diff --git a/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h b/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h
new file mode 100644 (file)
index 0000000..0230fa1
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file        GenericGather.h
+ * @brief       This file contains GenericGather class
+ * @ingroup     COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
+#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+#include <arm_compute/runtime/CL/functions/CLGather.h>
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+/**
+ * @brief Class to run Gather with both CPU and GPU
+ */
+class GenericGather : public arm_compute::IFunction
+{
+public:
+  GenericGather(void)
+      : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0), _cl_permuted{}, _cl_permute{}, _cl_gather{}
+  {
+    // DO NOTHING
+  }
+
+public:
+  /**
+   * @brief Configure the layer
+   * @param[in] input The source tensor
+   * @param[in] indices The indices tensor
+   * @param[in] output The destination tensor
+   * @param[in] axis (Optional) The axis in input to gather indices from
+   * @return N/A
+   */
+  void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices,
+                 arm_compute::ITensor *output, int axis = 0);
+
+public:
+  /**
+   * @brief Run the operation. Must be called after configure().
+   * @return N/A
+   */
+  void run(void) override;
+
+private:
+  arm_compute::ITensor *_input;
+  arm_compute::ITensor *_indices;
+  arm_compute::ITensor *_output;
+  int _axis;
+  arm_compute::CLTensor _cl_permuted;
+
+private:
+  arm_compute::CLPermute _cl_permute;
+  arm_compute::CLGather _cl_gather;
+};
+
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
index 9f616e4..05560e8 100644 (file)
@@ -16,7 +16,7 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(AXIS)
+#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
 
 /** Performs the Gather operation along the chosen axis
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
  * @param[in]  input_stride_w                        Stride of the source tensor in Z dimension (in bytes)
  * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per work item (in bytes)
  * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source tensor
- * @param[in]  indices_ptr                           Pointer to the indices vector. Supported data types: U32.
- * @param[in]  indices_stride_x                      Stride of the indices vector in X dimension (in bytes)
- * @param[in]  indices_step_x                        input_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: S32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
  * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
  * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per work item (in bytes)
@@ -52,7 +56,7 @@
  */
 __kernel void gather(
     TENSOR4D_DECLARATION(input),
-    VECTOR_DECLARATION(indices),
+    TENSOR3D_DECLARATION(indices),
     TENSOR4D_DECLARATION(output))
 {
     const int px = get_global_id(0);
@@ -61,24 +65,47 @@ __kernel void gather(
     const int pw = get_global_id(2) / OUTPUT_DIM_Z;
 
     const Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
-    const Vector   indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+    const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
     Tensor4D       output  = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
 
 #if AXIS == 0
-    const uint index                 = *(__global const uint *)vector_offset(&indices, px);
+#if INDICES_DIM == 1
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
     __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif INDICES_DIM == 2
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+    __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+#elif INDICES_DIM == 3
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+    __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+#endif
 #elif AXIS == 1
-    const uint index                 = *(__global const uint *)vector_offset(&indices, py);
+#if INDICES_DIM == 1
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
     __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif INDICES_DIM == 2
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+#elif INDICES_DIM == 3
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+#endif
 #elif AXIS == 2
-    const uint index                 = *(__global const uint *)vector_offset(&indices, pz);
+#if INDICES_DIM == 1
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
     __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif INDICES_DIM == 2
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+#endif
 #elif AXIS == 3
-    const uint index                 = *(__global const uint *)vector_offset(&indices, pw);
+#if INDICES_DIM == 1
+    const uint index                 = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
     __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif
 #endif //AXIS
 
     *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
 }
 
-#endif //defined(DATA_TYPE) && defined(AXIS)
+#endif //defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
index aa8a75f..24e3dc9 100644 (file)
@@ -29,13 +29,36 @@ namespace
 inline TensorShape compute_gather_shape(const TensorShape &input_shape,
                                         const TensorShape &indices_shape, uint32_t actual_axis)
 {
-  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 1);
+  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
   ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
   ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
 
   TensorShape output_shape = input_shape;
-  output_shape[actual_axis] = indices_shape[0];
-
+  if (indices_shape.num_dimensions() == 1)
+  {
+    output_shape[actual_axis] = indices_shape[0];
+  }
+  else if (indices_shape.num_dimensions() > 1)
+  {
+    output_shape.shift_right(indices_shape.num_dimensions() - 1);
+
+    for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
+    {
+      if (o == actual_axis)
+      {
+        ++i;
+        for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
+        {
+          output_shape[o] = indices_shape[in];
+        }
+      }
+      else
+      {
+        output_shape[o] = input_shape[i];
+      }
+    }
+  }
   return output_shape;
 }
 
@@ -52,8 +75,9 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
                                  const ITensorInfo *output, int axis)
 {
   const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
-  ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
+  ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
   ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
   ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
   ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
   ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
@@ -79,10 +103,11 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 {
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
   const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+  std::unique_ptr<ITensorInfo> output_info = input->clone();
+  output_info->set_tensor_shape(
+      compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis));
   // Output auto initialization if not yet initialized
-  TensorShape output_shape =
-      compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
-  auto_init_if_empty((*output), output_shape, 1, input->data_type());
+  auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
 
   // Create window
   Window win = calculate_max_window(*output, Steps());
@@ -119,6 +144,8 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices,
                         support::cpp11::to_string(output->info()->dimension(2)));
   build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
   build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+  build_opts.add_option("-DINDICES_DIM=" +
+                        support::cpp11::to_string(indices->info()->num_dimensions()));
 
   // Create kernel
   _kernel = static_cast<cl::Kernel>(
@@ -142,10 +169,10 @@ void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue)
   ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
   ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+  Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4);
   unsigned int idx = 0;
   add_4D_tensor_argument(idx, _input, window_collapsed);
-  add_1D_tensor_argument(idx, _indices, window_collapsed);
+  add_3D_tensor_argument(idx, _indices, window_collapsed);
   add_4D_tensor_argument(idx, _output, window_collapsed);
   enqueue(queue, *this, window_collapsed);
 }
diff --git a/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp b/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp
new file mode 100644 (file)
index 0000000..a3e6cce
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/misc/functions/GenericGather.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output)
+{
+  return (input->num_dimensions() != 4 && output->num_dimensions() == 4);
+}
+
+void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices,
+                              arm_compute::ITensor *output, int axis)
+{
+  _input = input;
+  _indices = indices;
+  _output = output;
+  _axis = axis;
+
+  arm_compute::PermutationVector pv;
+  if (shouldPermute(input->info(), output->info()))
+  {
+    // NOTE This vector comes from CLPermuteKernel implementation
+    //
+    // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H /
+    // C
+    //
+    //     Original | Permuted
+    // 0 | C        | W (from 1)
+    // 1 | W        | H (from 2)
+    // 2 | H        | C (from 0)
+    //
+    pv = arm_compute::PermutationVector{1, 2, 0};
+  }
+
+  if (utils::isGpuMode())
+  {
+    if (shouldPermute(input->info(), output->info()))
+    {
+      _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis);
+      _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv);
+
+      // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+      _cl_permuted.allocator()->allocate();
+    }
+    else
+    {
+      _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis);
+    }
+  }
+  else
+  {
+    throw std::runtime_error("Not supported, yet");
+  }
+}
+
+void GenericGather::run(void)
+{
+  if (utils::isGpuMode())
+  {
+    _cl_gather.run();
+    if (shouldPermute(_input->info(), _output->info()))
+    {
+      _cl_permute.run();
+    }
+  }
+  else
+  {
+    throw std::runtime_error("Not supported, yet");
+  }
+}
+
+} // namespace misc
+} // namespace arm_compute
index 36d9c5e..ba4964e 100644 (file)
 #include <arm_compute/runtime/misc/functions/SimpleUnpackLayer.h>
 #include <arm_compute/runtime/misc/functions/SimpleSQRT.h>
 #include <arm_compute/runtime/misc/functions/SimpleArgOperation.h>
+#include <arm_compute/runtime/misc/functions/GenericGather.h>
 
 #include "misc/matrix/IndexIterator.h"
 #include "misc/kernel/IndexIterator.h"
@@ -2975,9 +2976,11 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node)
   const auto ifm_shape = _ctx.at(ifm_index).shape();
   const auto indices_shape = _ctx.at(indices_index).shape();
   const auto axis_shape = _ctx.at(axis_index).shape();
+  const auto ofm_shape = _ctx.at(ofm_index).shape();
 
   assert(ifm_shape.rank() <= 4);
-  assert(indices_shape.rank() <= 1);
+  assert(indices_shape.rank() <= 3);
+  assert(ofm_shape.rank() <= 4);
   assert(_ctx.at(axis_index).hasData());
   assert(axis_shape.rank() == 0);
 
@@ -3025,7 +3028,7 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node)
     {
       std::unique_ptr<::arm_compute::IFunction> fn;
 
-      auto l = nnfw::cpp14::make_unique<::arm_compute::CLGather>();
+      auto l = nnfw::cpp14::make_unique<GenericGather>();
       l->configure(CAST_CL(ifm_alloc), CAST_CL(indices_alloc), CAST_CL(ofm_alloc), param.axis);
       fn = std::move(l);
       builder.append("Gather", std::move(fn));