From: 윤지영/On-Device Lab(SR)/Engineer/삼성전자 Date: Tue, 29 Jan 2019 07:19:04 +0000 (+0900) Subject: Support multiple indices up to 4D output ranks in Gather (#4339) X-Git-Tag: submit/tizen/20190325.013700~296 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=81e316dc724b7e0606e4fd588a26a537fc062c08;p=platform%2Fcore%2Fml%2Fnnfw.git Support multiple indices up to 4D output ranks in Gather (#4339) This commit supports the following test cases: 2D input + 2D indices => 3D output 2D input + 3D indices => 4D output 3D input + 2D indices => 4D output Signed-off-by: Jiyoung Yun --- diff --git a/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h b/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h new file mode 100644 index 0000000..0230fa1 --- /dev/null +++ b/libs/ARMComputeEx/arm_compute/runtime/misc/functions/GenericGather.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * @file        GenericGather.h + * @brief       This file contains GenericGather class + * @ingroup     COM_AI_RUNTIME + */ + +#ifndef __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ +#define __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ + +#include +#include + +#include +#include + +#include "Utils.h" + +namespace arm_compute +{ +namespace misc +{ + +/** + * @brief Class to run Gather with both CPU and GPU + */ +class GenericGather : public arm_compute::IFunction +{ +public: + GenericGather(void) + : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0), _cl_permuted{}, _cl_permute{}, _cl_gather{} + { + // DO NOTHING + } + +public: + /** + * @brief Configure the layer + * @param[in] input The source tensor + * @param[in] indices The indices tensor + * @param[in] output The destination tensor + * @param[in] axis (Optional) The axis in input to gather indices from + * @return N/A + */ + void configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis = 0); + +public: + /** + * @brief Run the operation. Must be called after configure(). 
+ * @return N/A + */ + void run(void) override; + +private: + arm_compute::ITensor *_input; + arm_compute::ITensor *_indices; + arm_compute::ITensor *_output; + int _axis; + arm_compute::CLTensor _cl_permuted; + +private: + arm_compute::CLPermute _cl_permute; + arm_compute::CLGather _cl_gather; +}; + +} // namespace misc +} // namespace arm_compute + +#endif // __ARM_COMPUTE_MISC_GENERIC_GATHER_H__ diff --git a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl index 9f616e4..05560e8 100644 --- a/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl +++ b/libs/ARMComputeEx/src/core/CL/cl_kernels/gather.cl @@ -16,7 +16,7 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(AXIS) +#if defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) /** Performs the Gather operation along the chosen axis * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short @@ -35,10 +35,14 @@ * @param[in] input_stride_w Stride of the source tensor in Z dimension (in bytes) * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes) * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor - * @param[in] indices_ptr Pointer to the indices vector. Supported data types: U32. - * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes) - * @param[in] indices_step_x input_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector + * @param[in] indices_ptr Pointer to the source tensor. 
Supported data types: S32 + * @param[in] indices_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes) @@ -52,7 +56,7 @@ */ __kernel void gather( TENSOR4D_DECLARATION(input), - VECTOR_DECLARATION(indices), + TENSOR3D_DECLARATION(indices), TENSOR4D_DECLARATION(output)) { const int px = get_global_id(0); @@ -61,24 +65,47 @@ __kernel void gather( const int pw = get_global_id(2) / OUTPUT_DIM_Z; const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z); - const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices); + const Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(indices); Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); #if AXIS == 0 - const uint index = *(__global const uint *)vector_offset(&indices, px); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, 0); + __global const uchar 
*input_addr = tensor4D_offset(&input, index, pz, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, px, py, pz); + __global const uchar *input_addr = tensor4D_offset(&input, index, pw, 0, 0); +#endif #elif AXIS == 1 - const uint index = *(__global const uint *)vector_offset(&indices, py); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, pw, 0); +#elif INDICES_DIM == 3 + const uint index = *(__global const uint *)tensor3D_offset(&indices, py, pz, pw); + __global const uchar *input_addr = tensor4D_offset(&input, px, index, 0, 0); +#endif #elif AXIS == 2 - const uint index = *(__global const uint *)vector_offset(&indices, pz); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw); +#elif INDICES_DIM == 2 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pz, pw, 0); + __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, 0); +#endif #elif AXIS == 3 - const uint index = *(__global const uint *)vector_offset(&indices, pw); +#if INDICES_DIM == 1 + const uint index = *(__global const uint *)tensor3D_offset(&indices, pw, 0, 0); __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index); +#endif #endif //AXIS *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr); } -#endif //defined(DATA_TYPE) && defined(AXIS) +#endif //defined(DATA_TYPE) && defined(AXIS) && defined(INDICES_DIM) diff --git a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp 
index aa8a75f..24e3dc9 100644 --- a/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp +++ b/libs/ARMComputeEx/src/core/CL/kernels/CLGatherKernel.cpp @@ -29,13 +29,36 @@ namespace inline TensorShape compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis) { - ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 1); + ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 3); ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4); + ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() + indices_shape.num_dimensions() - 1 > 4); ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions()); TensorShape output_shape = input_shape; - output_shape[actual_axis] = indices_shape[0]; - + if (indices_shape.num_dimensions() == 1) + { + output_shape[actual_axis] = indices_shape[0]; + } + else if (indices_shape.num_dimensions() > 1) + { + output_shape.shift_right(indices_shape.num_dimensions() - 1); + + for (uint32_t i = 0, o = 0; o < output_shape.num_dimensions(); ++o, ++i) + { + if (o == actual_axis) + { + ++i; + for (uint32_t in = 0; in < indices_shape.num_dimensions(); ++in, ++o) + { + output_shape[o] = indices_shape[in]; + } + } + else + { + output_shape[o] = input_shape[i]; + } + } + } return output_shape; } @@ -52,8 +75,9 @@ inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *in const ITensorInfo *output, int axis) { const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions())); - ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(indices->num_dimensions() > 3); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > 4); ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions()); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( @@ -79,10 +103,11 @@ std::pair 
validate_and_configure_window(ITensorInfo *input, ITen { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices); const uint32_t actual_axis = wrap_around(axis, static_cast(input->num_dimensions())); + std::unique_ptr output_info = input->clone(); + output_info->set_tensor_shape( + compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis)); // Output auto initialization if not yet initialized - TensorShape output_shape = - compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis); - auto_init_if_empty((*output), output_shape, 1, input->data_type()); + auto_init_if_empty((*output), output_info->tensor_shape(), 1, input->data_type()); // Create window Window win = calculate_max_window(*output, Steps()); @@ -119,6 +144,8 @@ void CLGatherKernel::configure(const ICLTensor *input, const ICLTensor *indices, support::cpp11::to_string(output->info()->dimension(2))); build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2))); build_opts.add_option("-DAXIS=" + support::cpp11::to_string(_axis)); + build_opts.add_option("-DINDICES_DIM=" + + support::cpp11::to_string(indices->info()->num_dimensions())); // Create kernel _kernel = static_cast( @@ -142,10 +169,10 @@ void CLGatherKernel::run(const Window &window, cl::CommandQueue &queue) ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, 4); unsigned int idx = 0; add_4D_tensor_argument(idx, _input, window_collapsed); - add_1D_tensor_argument(idx, _indices, window_collapsed); + add_3D_tensor_argument(idx, _indices, window_collapsed); add_4D_tensor_argument(idx, _output, window_collapsed); enqueue(queue, *this, window_collapsed); } diff --git a/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp 
b/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp new file mode 100644 index 0000000..a3e6cce --- /dev/null +++ b/libs/ARMComputeEx/src/runtime/misc/functions/GenericGather.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arm_compute/runtime/misc/functions/GenericGather.h" + +namespace arm_compute +{ +namespace misc +{ + +bool shouldPermute(arm_compute::ITensorInfo *input, arm_compute::ITensorInfo *output) +{ + return (input->num_dimensions() != 4 && output->num_dimensions() == 4); +} + +void GenericGather::configure(arm_compute::ITensor *input, arm_compute::ITensor *indices, + arm_compute::ITensor *output, int axis) +{ + _input = input; + _indices = indices; + _output = output; + _axis = axis; + + arm_compute::PermutationVector pv; + if (shouldPermute(input->info(), output->info())) + { + // NOTE This vector comes from CLPermuteKernel implementation + // + // This implementation permutes a tensor of shape C / W / H into another tensor of shape W / H / + // C + // + // Original | Permuted + // 0 | C | W (from 1) + // 1 | W | H (from 2) + // 2 | H | C (from 0) + // + pv = arm_compute::PermutationVector{1, 2, 0}; + } + + if (utils::isGpuMode()) + { + if (shouldPermute(input->info(), output->info())) + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), &_cl_permuted, axis); + _cl_permute.configure(&_cl_permuted, 
CAST_CL(output), pv); + + // NOTE _permuted is inaccessible from outside, and thus it is safe to invoke allocate here. + _cl_permuted.allocator()->allocate(); + } + else + { + _cl_gather.configure(CAST_CL(input), CAST_CL(indices), CAST_CL(output), axis); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +void GenericGather::run(void) +{ + if (utils::isGpuMode()) + { + _cl_gather.run(); + if (shouldPermute(_input->info(), _output->info())) + { + _cl_permute.run(); + } + } + else + { + throw std::runtime_error("Not supported, yet"); + } +} + +} // namespace misc +} // namespace arm_compute diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc index 36d9c5e..ba4964e 100644 --- a/runtimes/pure_arm_compute/src/compilation.cc +++ b/runtimes/pure_arm_compute/src/compilation.cc @@ -113,6 +113,7 @@ #include #include #include +#include #include "misc/matrix/IndexIterator.h" #include "misc/kernel/IndexIterator.h" @@ -2975,9 +2976,11 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node) const auto ifm_shape = _ctx.at(ifm_index).shape(); const auto indices_shape = _ctx.at(indices_index).shape(); const auto axis_shape = _ctx.at(axis_index).shape(); + const auto ofm_shape = _ctx.at(ofm_index).shape(); assert(ifm_shape.rank() <= 4); - assert(indices_shape.rank() <= 1); + assert(indices_shape.rank() <= 3); + assert(ofm_shape.rank() <= 4); assert(_ctx.at(axis_index).hasData()); assert(axis_shape.rank() == 0); @@ -3025,7 +3028,7 @@ void Planner::visit(const ::internal::tflite::op::Gather::Node &node) { std::unique_ptr<::arm_compute::IFunction> fn; - auto l = nnfw::cpp14::make_unique<::arm_compute::CLGather>(); + auto l = nnfw::cpp14::make_unique(); l->configure(CAST_CL(ifm_alloc), CAST_CL(indices_alloc), CAST_CL(ofm_alloc), param.axis); fn = std::move(l); builder.append("Gather", std::move(fn));