--- /dev/null
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @brief This file contains GenericGather class
+ * @ingroup COM_AI_RUNTIME
+ */
+
+#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
+#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
+
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+#include <arm_compute/runtime/CL/functions/CLGather.h>
+
+#include "Utils.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
+/**
+ * @brief Class to run Gather on GPU (CPU mode is not supported yet), permuting
+ *        the output to a 4-D layout when needed
+ *
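+ * A minimal usage sketch (tensor creation, allocation and data filling are
+ * elided; GPU mode is assumed):
+ * @code
+ *   arm_compute::misc::GenericGather gather;
+ *   gather.configure(&input, &indices, &output, 0); // gather along axis 0
+ *   gather.run();
+ * @endcode
+ */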
+class GenericGather : public arm_compute::IFunction
+{
+public:
+  GenericGather(void)
+      : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0), _cl_permuted{},
+        _cl_permute{}, _cl_gather{}
+ {
+ // DO NOTHING
+ }
+
+public:
+ /**
+ * @brief Configure the layer
+ * @param[in] input The source tensor
+ * @param[in] indices The indices tensor
+   * @param[out] output The destination tensor
+   * @param[in] axis (Optional) The axis in input to gather indices from (negative values wrap around)
+ * @return N/A
+ */
+ void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices,
+ arm_compute::ITensor *output, int axis = 0);
+
+public:
+ /**
+ * @brief Run the operation. Must be called after configure().
+ * @return N/A
+ */
+ void run(void) override;
+
+private:
+ arm_compute::ITensor *_input;
+ arm_compute::ITensor *_indices;
+ arm_compute::ITensor *_output;
+ int _axis;
+ arm_compute::CLTensor _cl_permuted;
+
+private:
+ arm_compute::CLPermute _cl_permute;
+ arm_compute::CLGather _cl_gather;
+};
+
+} // namespace misc
+} // namespace arm_compute
+
+#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(AXIS)
+#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
/** Performs the Gather operation along the chosen axis
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
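 * @note The rank of the indices tensor must be given as a preprocessor argument using -DINDICES_DIM=rank. e.g. -DINDICES_DIM=2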
 * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
* @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes)
* @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor
- * @param[in] indices_ptr Pointer to the indices vector. Supported data types: U32.
- * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes)
- * @param[in] indices_step_x input_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: S32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices tensor
* @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes)
*/
__kernel void gather(
TENSOR4D_DECLARATION(input),
- VECTOR_DECLARATION(indices),
+ TENSOR3D_DECLARATION(indices),
TENSOR4D_DECLARATION(output))
{
const int px = get_global_id(0);
const int py = get_global_id(1);
const int pz = get_global_id(2) % OUTPUT_DIM_Z;
const int pw = get_global_id(2) / OUTPUT_DIM_Z;
const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
- const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+ const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices);
Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
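+// Each work item copies one element: the INDICES_DIM output coordinates
+// starting at AXIS address the indices tensor, the fetched index replaces the
+// input coordinate at AXIS, and the remaining output coordinates map to the
+// remaining input dimensions. Only combinations that keep the output within
+// 4 dimensions are provided (e.g. AXIS == 3 supports INDICES_DIM == 1 only).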
#if AXIS == 0
- const uint index = *(__global const uint *)vector_offset(&indices, px);
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, pz, pw, 0);
+#elif INDICES_DIM == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz);
+ __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0);
+#endif
#elif AXIS == 1
- const uint index = *(__global const uint *)vector_offset(&indices, py);
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0);
+#elif INDICES_DIM == 3
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0);
+#endif
#elif AXIS == 2
- const uint index = *(__global const uint *)vector_offset(&indices, pz);
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif INDICES_DIM == 2
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0);
+ __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0);
+#endif
#elif AXIS == 3
- const uint index = *(__global const uint *)vector_offset(&indices, pw);
+#if INDICES_DIM == 1
+ const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0);
__global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif
#endif //AXIS
*(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
}
-#endif //defined(DATA_TYPE) && defined(AXIS)
+#endif //defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM)
inline TensorShape compute_gather_shape(const TensorShape &input_shape,
const TensorShape &indices_shape, uint32_t actual_axis)
{
- ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 1);
+ ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3);
ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
+ ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4);
ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
TensorShape output_shape = input_shape;
- output_shape[actual_axis] = indices_shape[0];
-
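+  // The axis dimension of the input is replaced by all dimensions of the
+  // indices tensor, e.g. input [3, 4, 5], indices [2, 6], axis 1
+  // -> output [3, 2, 6, 5].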
+ if (indices_shape.num_dimensions() == 1)
+ {
+ output_shape[actual_axis] = indices_shape[0];
+ }
+ else if (indices_shape.num_dimensions() > 1)
+ {
+ output_shape.shift_right(indices_shape.num_dimensions() - 1);
+
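+    // shift_right moved input dimension d to position d + (indices rank - 1),
+    // so any position the loop below skips already holds the matching input
+    // dimension.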
+ for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i)
+ {
+ if (o == actual_axis)
+ {
+ ++i;
+ for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o)
+ {
+ output_shape[o] = indices_shape[in];
+ }
+ }
+ else
+ {
+ output_shape[o] = input_shape[i];
+ }
+ }
+ }
return output_shape;
}
const ITensorInfo *output, int axis)
{
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
- ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3);
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+  ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4);
ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
+ std::unique_ptr<ITensorInfo> output_info = input->clone();
+ output_info->set_tensor_shape(
+ compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis));
// Output auto initialization if not yet initialized
- TensorShape output_shape =
- compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
- auto_init_if_empty((*output), output_shape, 1, input->data_type());
+ auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type());
// Create window
Window win = calculate_max_window(*output, Steps());
support::cpp11::to_string(output->info()->dimension(2)));
build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis));
+ build_opts.add_option("-DINDICES_DIM=" +
+ support::cpp11::to_string(indices->info()->num_dimensions()));
// Create kernel
_kernel = static_cast<cl::Kernel>(
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
- Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+ Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4);
unsigned int idx = 0;
add_4D_tensor_argument(idx, _input, window_collapsed);
- add_1D_tensor_argument(idx, _indices, window_collapsed);
+ add_3D_tensor_argument(idx, _indices, window_collapsed);
add_4D_tensor_argument(idx, _output, window_collapsed);
enqueue(queue, *this, window_collapsed);
}
--- /dev/null
+/*
+ * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_compute/runtime/misc/functions/GenericGather.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+
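+/**
+ * @brief Tell whether the gathered result needs a final CLPermute: the case
+ *        where Gather raises a sub-4-D input to a 4-D output
+ */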
+bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output)
+{
+ return (input->num_dimensions() != 4 && output->num_dimensions() == 4);
+}
+
+void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices,
+ arm_compute::ITensor *output, int axis)
+{
+ _input = input;
+ _indices = indices;
+ _output = output;
+ _axis = axis;
+
+ arm_compute::PermutationVector pv;
+ if (shouldPermute(input->info(), output->info()))
+ {
+ // NOTE This vector comes from CLPermuteKernel implementation
+ //
+    // This implementation permutes a tensor of shape C / W / H into another
+    // tensor of shape W / H / C
+ //
+ // Original | Permuted
+ // 0 | C | W (from 1)
+ // 1 | W | H (from 2)
+ // 2 | H | C (from 0)
+ //
+ pv = arm_compute::PermutationVector{1, 2, 0};
+ }
+
+ if (utils::isGpuMode())
+ {
+ if (shouldPermute(input->info(), output->info()))
+ {
+ _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis);
+ _cl_permute.configure(&_cl_permuted, CAST_CL(output), pv);
+
+      // NOTE _cl_permuted is inaccessible from outside, and thus it is safe to invoke allocate here.
+ _cl_permuted.allocator()->allocate();
+ }
+ else
+ {
+ _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis);
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Not supported, yet");
+ }
+}
+
+void GenericGather::run(void)
+{
+ if (utils::isGpuMode())
+ {
+ _cl_gather.run();
+ if (shouldPermute(_input->info(), _output->info()))
+ {
+ _cl_permute.run();
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Not supported, yet");
+ }
+}
+
+} // namespace misc
+} // namespace arm_compute
#include <arm_compute/runtime/misc/functions/SimpleUnpackLayer.h>
#include <arm_compute/runtime/misc/functions/SimpleSQRT.h>
#include <arm_compute/runtime/misc/functions/SimpleArgOperation.h>
+#include <arm_compute/runtime/misc/functions/GenericGather.h>
#include "misc/matrix/IndexIterator.h"
#include "misc/kernel/IndexIterator.h"
const auto ifm_shape = _ctx.at(ifm_index).shape();
const auto indices_shape = _ctx.at(indices_index).shape();
const auto axis_shape = _ctx.at(axis_index).shape();
+ const auto ofm_shape = _ctx.at(ofm_index).shape();
assert(ifm_shape.rank() <= 4);
- assert(indices_shape.rank() <= 1);
+ assert(indices_shape.rank() <= 3);
+ assert(ofm_shape.rank() <= 4);
assert(_ctx.at(axis_index).hasData());
assert(axis_shape.rank() == 0);
{
std::unique_ptr<::arm_compute::IFunction> fn;
- auto l = nnfw::cpp14::make_unique<::arm_compute::CLGather>();
+ auto l = nnfw::cpp14::make_unique<GenericGather>();
l->configure(CAST_CL(ifm_alloc), CAST_CL(indices_alloc), CAST_CL(ofm_alloc), param.axis);
fn = std::move(l);
builder.append("Gather", std::move(fn));