From 2f47fad111e7158e1aea34d8352ccc6a9880bbbd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Vishal=20Keshav/SNAP=20/SRI-Bangalore/Engineer/=EC=82=BC?= =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?= Date: Thu, 6 Dec 2018 13:57:47 +0530 Subject: [PATCH] Implementation of CUSTOM Unpack op (#3669) ACL implementation Used CLSubtensor and CLReshape CLPermute to overcome backward swizzle Signed-off-by: Vishal Keshav --- runtimes/pure_arm_compute/src/compilation.cc | 72 +++++++++++++++++++++- .../src/internal/layers/SimpleUnpackLayer.cc | 60 ++++++++++++++++++ .../src/internal/layers/SimpleUnpackLayer.h | 29 +++++++++ .../pure_arm_compute/src/internal/op/Unpack.cc | 5 +- 4 files changed, 163 insertions(+), 3 deletions(-) create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc index 7e15d7d..b1092c4 100644 --- a/runtimes/pure_arm_compute/src/compilation.cc +++ b/runtimes/pure_arm_compute/src/compilation.cc @@ -104,6 +104,7 @@ #include "internal/layers/SimplePackLayer.h" #include "internal/layers/SimpleSpaceToBatchND.h" #include "internal/layers/SimpleNeg.h" +#include "internal/layers/SimpleUnpackLayer.h" #include "internal/layers/SimpleSQRT.h" #include "internal/layers/SimpleArgMinMax.h" @@ -5109,8 +5110,77 @@ void Planner::visit(const ::internal::tflite::op::DepthToSpace::Node &node) void Planner::visit(const ::internal::tflite::op::Unpack::Node &node) { VERBOSE(Unpack) << "Configure Unpack operation" << std::endl; + const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index}; + uint32_t input_rank = _ctx.at(ifm_index).shape().rank(); - throw std::runtime_error("Not supported, yet"); + assert(input_rank == 4 || input_rank == 3 || input_rank == 2); + _builder.addShapeConstr(ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()), + 
_ctx.at(ifm_index).type())); + + int32_t axis = + _ctx.at(::internal::tflite::operand::Index{node.param().axis_index}).asScalar(); + // int32_t num_split = + // _ctx.at(::internal::tflite::operand::Index{node.param().num_split_index}).asScalar(); + + for (const auto &index : node.param().ofm_indexes) + { + const ::internal::tflite::operand::Index ofm_index{index}; + _builder.addShapeConstr(ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()), + _ctx.at(ofm_index).type())); + } + + struct Param + { + std::vector ofm_indexes; + int ifm_index; + int axis; + }; + + if (input_rank == 4) + { + Param param; + param.ifm_index = ifm_index.asInt(); + param.axis = axis; + for (const auto &index : node.param().ofm_indexes) + { + param.ofm_indexes.push_back(index); + } + + auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) { + auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index}); + + if (::internal::arm_compute::isGpuMode()) + { + auto fn = nnfw::make_unique(); + std::vector<::arm_compute::ICLTensor *> outputs; + for (const auto &index : param.ofm_indexes) + { + auto output_alloc = ctx.at(::internal::tflite::operand::Index{index}); + outputs.push_back(CAST_CL(output_alloc)); + } + fn->configure(CAST_CL(input_alloc), outputs, param.axis); + + builder.append("Unpack", std::move(fn)); + } + else + throw std::runtime_error("Not supported, yet"); + }; + + _builder.addStage(stage); + } + else if (input_rank == 3) + { + // TODO: generate test case for this and generalize 4D method all cases. 
+ throw std::runtime_error("UNPACK_3D not implemented"); + } + else if (input_rank == 2) + { + throw std::runtime_error("UNPACK_2D not implemented"); + } + else + { + throw std::runtime_error("UNPACK axis is not valid"); + } } void Planner::visit(const ::internal::tflite::op::Pack::Node &node) diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc new file mode 100644 index 0000000..8dde475 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc @@ -0,0 +1,60 @@ +#include "internal/arm_compute.h" +#include "internal/Swizzle.h" +#include "SimpleUnpackLayer.h" + +void SimpleUnpackLayer::configure(::arm_compute::ICLTensor *input, + const std::vector<::arm_compute::ICLTensor *> &output_vector, + int32_t axis) +{ + uint32_t nr_outputs = output_vector.size(); + _cl_permuted_vector.resize(nr_outputs); + _cl_permute_vector.resize(nr_outputs); + uint32_t input_rank = input->info()->num_dimensions(); + const ::arm_compute::PermutationVector pv{2, 0, 1}; + _input = input; + // Negative axis is supported, -1 implies R-1 axis where R is input rank + if (axis < 0) + { + axis += input_rank; + } + _axis = ToARMComputeAxis(input_rank, axis).value(); + _cl_reshape_vector.resize(nr_outputs); + + ::arm_compute::TensorShape subTensor_shape{}; + for (int i = 0; i < input_rank; i++) + { + if (i != _axis) + { + subTensor_shape.set(i, _input->info()->tensor_shape()[i]); + } + else + { + subTensor_shape.set(i, 1); + } + } + + auto subTensor_offset = ::arm_compute::Coordinates{}; + subTensor_offset.set_num_dimensions(input_rank); + + for (int i = 0; i < output_vector.size(); i++) + { + _output_vector.push_back(output_vector[i]); + subTensor_offset[_axis] = i; + auto temp_tensor = std::make_shared<::arm_compute::CLSubTensor>( + CAST_CL(_input), subTensor_shape, subTensor_offset, true); + _sub_tensor_vector.push_back(temp_tensor); + // Reads from the subtensor: permute it, then reshape into the output + 
_cl_permute_vector[i].configure(_sub_tensor_vector[i].get(), &_cl_permuted_vector[i], pv); + _cl_reshape_vector[i].configure(&_cl_permuted_vector[i], CAST_CL(_output_vector[i])); + _cl_permuted_vector[i].allocator()->allocate(); + } +} + +void SimpleUnpackLayer::run(void) +{ + for (int i = 0; i < _output_vector.size(); i++) + { + _cl_permute_vector[i].run(); + _cl_reshape_vector[i].run(); + } +} diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h new file mode 100644 index 0000000..f2a78d2 --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h @@ -0,0 +1,29 @@ +#ifndef __UNPACK_LAYER_H__ +#define __UNPACK_LAYER_H__ + +#include +#include +#include +#include +#include + +class SimpleUnpackLayer : public ::arm_compute::IFunction +{ +public: + void configure(::arm_compute::ICLTensor *input, + const std::vector<::arm_compute::ICLTensor *> &output_vector, int32_t axis); + +public: + void run(void) override; + +private: + std::vector<::arm_compute::CLTensor> _cl_permuted_vector; + std::vector<::arm_compute::ICLTensor *> _output_vector; + std::vector> _sub_tensor_vector; + std::vector<::arm_compute::CLReshapeLayer> _cl_reshape_vector; + std::vector<::arm_compute::CLPermute> _cl_permute_vector; + ::arm_compute::ICLTensor *_input; + int32_t _axis; +}; + +#endif // __UNPACK_LAYER_H__ diff --git a/runtimes/pure_arm_compute/src/internal/op/Unpack.cc b/runtimes/pure_arm_compute/src/internal/op/Unpack.cc index 7f4fb04..a1be028 100644 --- a/runtimes/pure_arm_compute/src/internal/op/Unpack.cc +++ b/runtimes/pure_arm_compute/src/internal/op/Unpack.cc @@ -43,8 +43,9 @@ namespace op { namespace Unpack { -// There are two inputs: tensor which is to be unpacked -// and axis along which tensor needs to be unpacked. +// There are three inputs: tensor which is to be unpacked, +// axis along which tensor needs to be unpacked +// and number of splits along the axis. 
Param::Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount, const uint32_t *outputs) -- 2.7.4