From: Jiseob Jang/On-Device Lab(SR)/Engineer/Samsung Electronics
Date: Mon, 30 Sep 2019 06:58:48 +0000 (+0900)
Subject: Make to support Gather op for acl neon (#7746)
X-Git-Tag: submit/tizen/20191205.083104~1018
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=52c73854d4d2a4a08a2ccea87e659f35a7188735;p=platform%2Fcore%2Fml%2Fnnfw.git

Make to support Gather op for acl neon (#7746)

This commit adds support for the Gather op on the ACL NEON backend.
- Introduce NEGatherEx and NEGatherKernelEx
- Apply the NEGatherEx layer to neurun

Signed-off-by: jiseob.jang
---

diff --git a/runtimes/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/runtimes/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
new file mode 100644
index 0000000..3fa9c6e
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
+#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to perform the gather operation on NEON */
+class NEGatherKernelEx : public INEKernel
+{
+public:
+  /** Default constructor. */
+  NEGatherKernelEx();
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEGatherKernelEx(const NEGatherKernelEx &) = delete;
+  /** Prevent instances of this class from being copied (As this class contains pointers). */
+  NEGatherKernelEx &operator=(const NEGatherKernelEx &) = delete;
+  /** Allow instances of this class to be moved. */
+  NEGatherKernelEx(NEGatherKernelEx &&) = default;
+  /** Allow instances of this class to be moved. */
+  NEGatherKernelEx &operator=(NEGatherKernelEx &&) = default;
+  /** Default destructor */
+  ~NEGatherKernelEx() = default;
+
+  /** Name of the kernel
+   *
+   * @return Kernel name
+   */
+  const char *name() const override { return "NEGatherKernelEx"; }
+  /** Initialise the kernel's inputs and outputs
+   *
+   * @param[in] input   Source tensor. Supported tensor rank: up to 4. Data type supported:
+   *                    U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   *                    following types: U32/S32.
+   *                    Each value must be in range [0, input.shape[@p axis])
+   * @param[out] output Destination tensor. Data type supported: Same as @p input
+   * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Negative values
+   *                    wrap around. Defaults to 0
+   */
+  void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEGatherKernelEx
+   *
+   * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported:
+   *                    U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+   *                    following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+   * @param[in] output  Destination tensor info. Data type supported: Same as @p input
+   * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Negative values
+   *                    wrap around. Defaults to 0
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+                         const ITensorInfo *output, int axis);
+
+  // Inherited methods overridden:
+  void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+  /** Implementation of the gather operation for 0 axis.
+   *
+   * For gather on the 0 axis an element by element copy is performed.
+   *
+   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+   *                   returned by window())
+   * @param[in] info   Info about executing thread and CPU.
+   */
+  template <typename U> void gather_0_axis(const Window &window, const ThreadInfo &info);
+
+  /** Implementation of the gather operation.
+   *
+   * For axis >= 1, a row-wise copy takes place.
+   *
+   * @param[in] window Region on which to execute the kernel. (Must be a region of the window
+   *                   returned by window())
+   * @param[in] info   Info about executing thread and CPU.
+   */
+  template <typename U> void gather_n_axis(const Window &window, const ThreadInfo &info);
+
+  using kernel_ptr = void (NEGatherKernelEx::*)(const Window &window, const ThreadInfo &info);
+
+  const ITensor *_input;
+  const ITensor *_indices;
+  int _axis;
+  ITensor *_output;
+  kernel_ptr _func;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_NEGATHERKERNELEX_H__ */
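
The private section above binds one of the two templated implementations to the kernel_ptr member once, in configure(), so run() dispatches without re-branching on axis or index type. A minimal standalone sketch of this member-function-pointer pattern (hypothetical class and method names, not the ACL types):

#include <iostream>

class Kernel
{
public:
  void configure(bool zero_axis)
  {
    // Bind the concrete implementation once; run() stays branch-free afterwards.
    _func = zero_axis ? &Kernel::impl_axis0 : &Kernel::impl_axisN;
  }
  void run() { (this->*_func)(); }

private:
  void impl_axis0() { std::cout << "element-wise copy\n"; }
  void impl_axisN() { std::cout << "row-wise copy\n"; }

  using kernel_ptr = void (Kernel::*)();
  kernel_ptr _func = nullptr;
};

int main()
{
  Kernel k;
  k.configure(true);
  k.run(); // prints "element-wise copy"
}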
diff --git a/runtimes/libs/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h b/runtimes/libs/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
index 1a6978a..16fd40e 100644
--- a/runtimes/libs/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
+++ b/runtimes/libs/ARMComputeEx/arm_compute/core/utils/misc/ShapeCalculatorEx.h
@@ -171,6 +171,50 @@ inline TensorShape compute_space_to_depth_shape_ex(const ITensorInfo *input, int
   return output_shape;
 }
 
+/** Calculate the gather output shape of a tensor
+ *
+ * @param[in] input_shape   Input tensor shape
+ * @param[in] indices_shape Indices tensor shape
+ * @param[in] actual_axis   The axis to be gathered
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_gather_shape_ex(const TensorShape &input_shape,
+                                           const TensorShape &indices_shape, uint32_t actual_axis)
+{
+  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
+  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
+  ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
+
+  TensorShape output_shape = input_shape;
+  if (indices_shape.num_dimensions() == 1)
+  {
+    output_shape[actual_axis] = indices_shape[0];
+  }
+  else if (indices_shape.num_dimensions() > 1)
+  {
+    output_shape.shift_right(indices_shape.num_dimensions() - 1);
+
+    for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
+    {
+      if (o == actual_axis)
+      {
+        ++i;
+        for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
+        {
+          output_shape[o] = indices_shape[in];
+        }
+      }
+      else
+      {
+        output_shape[o] = input_shape[i];
+      }
+    }
+  }
+  return output_shape;
+}
+
 } // namespace shape_calculator
 } // namespace misc
 } // namespace arm_compute
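
For the common rank-1 indices case, compute_gather_shape_ex simply replaces the gathered axis length with the number of indices; higher-rank indices splice their whole shape in at that axis (subject to the overall rank-4 limit). A standalone sketch of the rank-1 case on plain vectors (hypothetical helper name; ACL shapes are written innermost-first):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<uint32_t> gather_shape_1d_indices(const std::vector<uint32_t> &input_shape,
                                              uint32_t num_indices, uint32_t actual_axis)
{
  assert(actual_axis < input_shape.size());
  std::vector<uint32_t> output_shape = input_shape;
  output_shape[actual_axis] = num_indices; // the gathered axis length becomes the index count
  return output_shape;
}

int main()
{
  // Gathering 5 entries along axis 1 of a {16, 10, 3} tensor yields {16, 5, 3}.
  for (uint32_t d : gather_shape_1d_indices({16, 10, 3}, 5, 1))
    std::cout << d << ' ';
  std::cout << '\n';
}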
diff --git a/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
index 208d5df..4ea7b97 100644
--- a/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
 #include
 #include
 #include
diff --git a/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
new file mode 100644
index 0000000..d95e6a8
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEGATHEREX_H__
+#define __ARM_COMPUTE_NEGATHEREX_H__
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref NEGatherKernelEx */
+class NEGatherEx : public INESimpleFunctionNoBorder
+{
+public:
+  /** Initialise the kernel's inputs and outputs
+   *
+   * @param[in] input   Source tensor. Supported tensor rank: up to 4. Data type supported:
+   *                    U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the
+   *                    following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+   * @param[out] output Destination tensor. Data type supported: Same as @p input
+   * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+   */
+  void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
+
+  /** Static function to check if given info will lead to a valid configuration of @ref
+   * NEGatherKernelEx
+   *
+   * @param[in] input   Source tensor info. Supported tensor rank: up to 4. Data type supported:
+   *                    U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+   * @param[in] indices Indices tensor info. Supported tensor rank: up to 3. Must be one of the
+   *                    following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+   * @param[in] output  Destination tensor info. Data type supported: Same as @p input
+   * @param[in] axis    (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+   *
+   * @return a status
+   */
+  static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
+                         const ITensorInfo *output, int axis);
+};
+
+} // namespace arm_compute
+
+#endif /* __ARM_COMPUTE_NEGATHEREX_H__ */
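
A minimal usage sketch of the new function, assuming the usual ACL runtime setup (the shapes, axis of 1, and allocation order are illustrative choices, not part of this commit):

#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
  // Gather 5 entries along axis 1 of a {16, 10, 3} F32 tensor -> {16, 5, 3}.
  Tensor input, indices, output;
  input.allocator()->init(TensorInfo(TensorShape(16U, 10U, 3U), 1, DataType::F32));
  indices.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::S32));
  output.allocator()->init(TensorInfo(TensorShape(16U, 5U, 3U), 1, DataType::F32));

  // Check the configuration first, as the runtime backends do.
  auto s = NEGatherEx::validate(input.info(), indices.info(), output.info(), 1);
  if (!bool(s))
    return 1;

  NEGatherEx gather;
  gather.configure(&input, &indices, &output, /*axis=*/1);

  input.allocator()->allocate();
  indices.allocator()->allocate();
  output.allocator()->allocate();
  // ... fill input and indices buffers here ...
  gather.run();
}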
diff --git a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index c83ece0..718f615 100644
--- a/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/runtimes/libs/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -19,6 +19,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibraryEx.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
 #include "arm_compute/core/UtilsEx.h"
 
 using namespace arm_compute;
@@ -26,51 +27,6 @@ using namespace arm_compute;
 
 namespace
 {
-inline TensorShape compute_gather_shape(const TensorShape &input_shape,
-                                        const TensorShape &indices_shape, uint32_t actual_axis)
-{
-  ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
-  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
-  ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
-  ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
-
-  TensorShape output_shape = input_shape;
-  if (indices_shape.num_dimensions() == 1)
-  {
-    output_shape[actual_axis] = indices_shape[0];
-  }
-  else if (indices_shape.num_dimensions() > 1)
-  {
-    output_shape.shift_right(indices_shape.num_dimensions() - 1);
-
-    for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
-    {
-      if (o == actual_axis)
-      {
-        ++i;
-        for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
-        {
-          output_shape[o] = indices_shape[in];
-        }
-      }
-      else
-      {
-        output_shape[o] = input_shape[i];
-      }
-    }
-  }
-  return output_shape;
-}
-
-/** Wrap-around a number within the range 0 <= x < m
- *
- * @param[in] x Input value
- * @param[in] m Range
- *
- * @return the wrapped-around number
- */
-template <typename T> inline T wrap_around(T x, T m) { return x >= 0 ?
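
The CL kernel now uses the shared compute_gather_shape_ex above instead of its local copy, and keeps calling wrap_around from the headers it already includes. For reference, the removed helper implements a Python-style modulo, which is what makes negative axis values wrap; a standalone equivalent:

#include <cassert>

template <typename T> inline T wrap_around(T x, T m) { return x >= 0 ? x % m : (x % m + m) % m; }

int main()
{
  assert(wrap_around(-1, 4) == 3); // axis -1 of a rank-4 tensor is axis 3
  assert(wrap_around(2, 4) == 2);  // non-negative axes pass through unchanged
}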
x % m : (x % m + m) % m; }
-
 inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices,
                                  const ITensorInfo *output, int axis)
 {
@@ -88,8 +44,8 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in
   {
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-    TensorShape output_shape =
-        compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+        input->tensor_shape(), indices->tensor_shape(), actual_axis);
     ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
   }
 
@@ -104,8 +60,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
   ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
   const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
   std::unique_ptr<ITensorInfo> output_info = input->clone();
-  output_info->set_tensor_shape(
-      compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis));
+  output_info->set_tensor_shape(arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+      input->tensor_shape(), indices->tensor_shape(), actual_axis));
 
   // Output auto initialization if not yet initialized
   auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
diff --git a/runtimes/libs/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/runtimes/libs/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
new file mode 100644
index 0000000..ce2413d
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+
+#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+namespace arm_compute
+{
+namespace
+{
+/** Validate the indices
+ *
+ * Validate that indices are not negative
+ *
+ * @param[in] indices Indices tensor.
+ */
+template <typename U> void validate_indices(const ITensor *indices)
+{
+  for (size_t i = 0; i < indices->info()->tensor_shape()[0]; ++i)
+  {
+    ARM_COMPUTE_ERROR_ON(*(reinterpret_cast<U *>(indices->ptr_to_element(Coordinates(i)))) < 0);
+  }
+}
+
+} // namespace
+
+NEGatherKernelEx::NEGatherKernelEx() : _input{}, _indices{}, _axis{}, _output{}, _func{} {}
+
+template <typename U>
+inline void NEGatherKernelEx::gather_0_axis(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+
+  // Validate that the indices are not negative
+  validate_indices<U>(_indices);
+
+  Iterator output_it(_output, window);
+  execute_window_loop(
+      window,
+      [&](const Coordinates &id) {
+        Coordinates gather_id(id);
+        gather_id.collapse(_indices->info()->num_dimensions(), 0);
+
+        U new_index;
+        switch (_indices->info()->num_dimensions())
+        {
+          case 1:
+            new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0]))));
+            break;
+          case 2:
+            new_index =
+                *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1]))));
+            break;
+          case 3:
+            new_index = *(
+                reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[0], id[1], id[2]))));
+            break;
+          default:
+            ARM_COMPUTE_ERROR("Wrong num of dimensions");
+            break;
+        }
+
+        gather_id.set(0, new_index);
+
+        std::copy_n(_input->ptr_to_element(gather_id), _output->info()->element_size(),
+                    output_it.ptr());
+      },
+      output_it);
+}
+
+template <typename U>
+void NEGatherKernelEx::gather_n_axis(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+
+  // Validate that the indices are not negative
+  validate_indices<U>(_indices);
+
+  Window output_window{window};
+  output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+  Iterator output_it(_output, output_window);
+  execute_window_loop(
+      output_window,
+      [&](const Coordinates &id) {
+        Coordinates gather_id(id);
+        gather_id.collapse(_indices->info()->num_dimensions(), _axis);
+
+        U new_index;
+        switch (_indices->info()->num_dimensions())
+        {
+          case 1:
+            new_index = *(reinterpret_cast<U *>(_indices->ptr_to_element(Coordinates(id[_axis]))));
+            break;
+          case 2:
+            new_index = *(reinterpret_cast<U *>(
+                _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1]))));
+            break;
+          case 3:
+            new_index = *(reinterpret_cast<U *>(
+                _indices->ptr_to_element(Coordinates(id[_axis], id[_axis + 1], id[_axis + 2]))));
+            break;
+          default:
+            ARM_COMPUTE_ERROR("Wrong num of dimensions");
+            break;
+        }
+
+        gather_id.set(_axis, new_index);
+
+        std::copy_n(_input->ptr_to_element(gather_id),
+                    _input->info()->dimension(0) * _output->info()->element_size(),
+                    output_it.ptr());
+      },
+      output_it);
+}
+
+void NEGatherKernelEx::configure(const ITensor *input, const ITensor *indices, ITensor *output,
+                                 int axis)
+{
+  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
+  ARM_COMPUTE_ERROR_ON(indices->info()->num_dimensions() > 3);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+  ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+  _input = input;
+  _indices = indices;
+  _output = output;
+  _axis = axis;
+
+  if (_axis < 0)
+  {
+    _axis += input->info()->num_dimensions();
+  }
+  ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
+
+  if (0 == _axis)
+  {
+    switch (_indices->info()->data_type())
+    {
+      case DataType::U32:
+        _func = &NEGatherKernelEx::gather_0_axis<uint32_t>;
+        break;
+      case DataType::S32:
+        _func = &NEGatherKernelEx::gather_0_axis<int32_t>;
+        break;
+      default:
+        ARM_COMPUTE_ERROR("Not supported");
+        break;
+    }
+  }
+  else
+  {
+    switch (_indices->info()->data_type())
+    {
+      case DataType::U32:
+        _func = &NEGatherKernelEx::gather_n_axis<uint32_t>;
+        break;
+      case DataType::S32:
+        _func = &NEGatherKernelEx::gather_n_axis<int32_t>;
+        break;
+      default:
+        ARM_COMPUTE_ERROR("Not supported");
+        break;
+    }
+  }
+  // Output auto initialization if not yet initialized
+  TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+      input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+  auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
+
+  // Create window
+  Window win = calculate_max_window(*output->info(), Steps());
+  output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+  INEKernel::configure(win);
+}
+
+Status NEGatherKernelEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+                                  const ITensorInfo *output, int axis)
+{
+  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
+  ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+  ARM_COMPUTE_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
+
+  if (axis < 0)
+  {
+    axis += input->num_dimensions();
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
+  ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+      input, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::U16, DataType::S16,
+      DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+
+  if (output->total_size() != 0)
+  {
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape_ex(
+        input->tensor_shape(), indices->tensor_shape(), axis);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
+  }
+
+  ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32, DataType::S32);
+
+  return Status{};
+}
+
+void NEGatherKernelEx::run(const Window &window, const ThreadInfo &info)
+{
+  ARM_COMPUTE_UNUSED(info);
+  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+  ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+  (this->*_func)(window, info);
+}
+
+} // namespace arm_compute
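
To make the two code paths concrete: gather_0_axis remaps coordinate 0 of every output element through the index tensor and copies element by element, while gather_n_axis copies a whole innermost row (dimension(0) * element_size bytes) per gathered index. A scalar model of the axis-0 remapping on plain arrays (illustrative only, not the ACL windowed loop):

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  const std::vector<float> input = {10.f, 11.f, 12.f, 13.f};
  const std::vector<int32_t> indices = {3, 0, 3};

  // output[i] = input[indices[i]] -- the coordinate on the gathered axis is remapped.
  std::vector<float> output(indices.size());
  for (size_t i = 0; i < indices.size(); ++i)
    output[i] = input[indices[i]];

  for (float v : output)
    std::cout << v << ' '; // prints: 13 10 13
  std::cout << '\n';
}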
diff --git a/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
new file mode 100644
index 0000000..90dabb3
--- /dev/null
+++ b/runtimes/libs/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
+
+#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
+#include "support/ToolchainSupport.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
+{
+  auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+  k->configure(input, indices, output, axis);
+  _kernel = std::move(k);
+}
+
+Status NEGatherEx::validate(const ITensorInfo *input, const ITensorInfo *indices,
+                            const ITensorInfo *output, int axis)
+{
+  return NEGatherKernelEx::validate(input, indices, output, axis);
+}
+
+} // namespace arm_compute
diff --git a/runtimes/neurun/backend/acl_neon/KernelGenerator.cc b/runtimes/neurun/backend/acl_neon/KernelGenerator.cc
index b714f7e..0508a95 100644
--- a/runtimes/neurun/backend/acl_neon/KernelGenerator.cc
+++ b/runtimes/neurun/backend/acl_neon/KernelGenerator.cc
@@ -716,6 +716,49 @@ void KernelGenerator::visit(const model::operation::HashtableLookupNode &node)
   _execution_builder->append(std::move(acl_fn));
 }
 
+void KernelGenerator::visit(const model::operation::GatherNode &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+
+  const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)};
+  const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)};
+
+  const auto axis_index{node.param().axis_index};
+
+  const auto ifm_shape = _ctx.at(ifm_index).shape();
+
+  const auto axis_value = static_cast<int>(_ctx.at(axis_index).asScalar<int32_t>());
+  // Converting in reverse order
+  const int axis =
+      ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value();
+
+  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
+  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+  auto indices_alloc = _tensor_builder->at(indices_index).get();
+  const auto backend_layout = ofm_alloc->layout();
+  UNUSED_RELEASE(backend_layout);
+
+  // NOTE The frontend layout and backend layout must be the same for this operation.
+  //      If they are not the same, we would have to add a stage(?) to permute the output
+  //      tensor, which is not efficient even if it works. In that case it would be better
+  //      to set these backend tensors to the same layout.
+  //      There is one more thing to consider. This operation depends on the layout of the
+  //      model. For example, if a model in NHWC has this operation with output rank == 4,
+  //      indices rank == 2 and axis == 2, it should gather along the W and C axes, but W
+  //      and C are not sequential in NCHW. So a backend in NCHW cannot handle this case.
+  assert(backend_layout == ifm_alloc->layout());
+  assert(backend_layout == indices_alloc->layout());
+  assert(ifm_shape.rank() < 4 || _current_subg_layout == backend_layout);
+
+  auto fn = nnfw::cpp14::make_unique<::arm_compute::NEGatherEx>();
+
+  fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
+
+  auto acl_fn = asAclFunction(std::move(fn));
+
+  _execution_builder->append(std::move(acl_fn));
+}
+
 void KernelGenerator::visit(const model::operation::L2NormalizationNode &node)
 {
   const auto ofm_index{node.getOutputs().at(0)};
diff --git a/runtimes/neurun/backend/acl_neon/KernelGenerator.h b/runtimes/neurun/backend/acl_neon/KernelGenerator.h
index fe1ff7c..2603860 100644
--- a/runtimes/neurun/backend/acl_neon/KernelGenerator.h
+++ b/runtimes/neurun/backend/acl_neon/KernelGenerator.h
@@ -50,6 +50,7 @@ public:
   void visit(const model::operation::EmbeddingLookupNode &) override;
   void visit(const model::operation::FloorNode &) override;
   void visit(const model::operation::FullyConnectedNode &) override;
+  void visit(const model::operation::GatherNode &) override;
   void visit(const model::operation::HashtableLookupNode &) override;
   void visit(const model::operation::L2NormalizationNode &) override;
   void visit(const model::operation::L2Pool2DNode &) override;
diff --git a/runtimes/neurun/backend/acl_neon/ShapeFixer.cc b/runtimes/neurun/backend/acl_neon/ShapeFixer.cc
index 915c2e9..6ba16c8 100644
--- a/runtimes/neurun/backend/acl_neon/ShapeFixer.cc
+++ b/runtimes/neurun/backend/acl_neon/ShapeFixer.cc
@@ -122,6 +122,16 @@ void ShapeFixer::visit(const model::operation::HashtableLookupNode &node)
   _tensor_builder->dimCorrection(output_index, false);
 }
 
+void ShapeFixer::visit(const model::operation::GatherNode &node)
+{
+  const auto ofm_index{node.getOutputs().at(0)};
+  const auto ifm_index{node.getInputs().at(model::operation::GatherNode::Input::INPUT)};
+  const auto indices_index{node.getInputs().at(model::operation::GatherNode::Input::INDICES)};
+  _tensor_builder->dimCorrection(ofm_index, false);
+  _tensor_builder->dimCorrection(ifm_index, false);
+  _tensor_builder->dimCorrection(indices_index, false);
+}
+
 void ShapeFixer::visit(const model::operation::L2NormalizationNode &) { /* DO NOTHING */}
 
 void ShapeFixer::visit(const model::operation::L2Pool2DNode &) { /* DO NOTHING */}
diff --git a/runtimes/neurun/backend/acl_neon/ShapeFixer.h b/runtimes/neurun/backend/acl_neon/ShapeFixer.h
index 4a83bc6..ab7bd20 100644
--- a/runtimes/neurun/backend/acl_neon/ShapeFixer.h
+++ b/runtimes/neurun/backend/acl_neon/ShapeFixer.h
@@ -52,6 +52,7 @@ public:
   void visit(const model::operation::ExpNode &) override;
   void visit(const model::operation::FloorNode &) override;
   void visit(const model::operation::FullyConnectedNode &) override;
+  void visit(const model::operation::GatherNode &) override;
   void visit(const model::operation::HashtableLookupNode &) override;
   void visit(const model::operation::L2NormalizationNode &) override;
   void visit(const model::operation::L2Pool2DNode &) override;
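
ACL's TensorShape numbers dimension 0 as the innermost, while the neurun frontend counts axes from the outermost dimension, hence the "Converting in reverse order" step in visit(GatherNode) above. Assuming no layout permutation, ToARMComputeAxis reduces to rank - 1 - axis; a minimal standalone equivalent (hypothetical function name, and the negative-axis wrap is an added assumption, since NEGatherKernelEx also accepts negative axes):

#include <cassert>

// Map a frontend axis (outermost-first) to an ACL axis (innermost-first).
inline int to_acl_axis(int rank, int axis)
{
  if (axis < 0)
    axis += rank;         // wrap negative axes first (assumption)
  return rank - 1 - axis; // then reverse the axis order
}

int main()
{
  assert(to_acl_axis(4, 0) == 3);  // outermost NHWC "N" becomes ACL dimension 3
  assert(to_acl_axis(4, -1) == 0); // the innermost axis becomes ACL dimension 0
}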
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
index cdec336..df06758 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
@@ -7,7 +7,6 @@ GeneratedTests.mobilenet*
 GeneratedTests.svdf*
 GeneratedTests.batch_to_space*
 GeneratedTests.space_to_batch*
-GeneratedTests.gather_ex*
 GeneratedTests.topk_v2*
 # Unexpected result
 GeneratedTests.pack*
diff --git a/tests/scripts/neurun_frameworktest_list.armv7l.acl_neon.txt b/tests/scripts/neurun_frameworktest_list.armv7l.acl_neon.txt
index fd3410e..8bcdaef 100644
--- a/tests/scripts/neurun_frameworktest_list.armv7l.acl_neon.txt
+++ b/tests/scripts/neurun_frameworktest_list.armv7l.acl_neon.txt
@@ -8,6 +8,7 @@ div
 embedding_lookup
 floor
 fullyconnected
+gather
 hashtable_lookup
 l2_normalization
 l2_pool_2d