From 3ce3098ded332187b395c17c4eeea5ee391e1095 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EA=B9=80=EC=9A=A9=EC=84=AD/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?=
 =?utf8?q?=EC=9E=90?=
Date: Thu, 30 Aug 2018 14:00:04 +0900
Subject: [PATCH] Introduce GenericFullyConnectedLayer for fc (#2525)

The 4D FC tests in GeneratedTests fail because the runtime cannot yet
reshape a 4D input into the 2D input that FullyConnected expects.
GenericFullyConnectedLayer handles this case.

Signed-off-by: Yongseop Kim
---
 runtimes/pure_arm_compute/src/compilation.cc       | 35 +++++-----
 .../internal/layers/GenericFullyConnectedLayer.cc  | 74 ++++++++++++++++++++++
 .../internal/layers/GenericFullyConnectedLayer.h   | 37 +++++++++++
 3 files changed, 126 insertions(+), 20 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h

diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 5d7b521..6958fd6 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -35,7 +34,6 @@
 #include
 #include
 #include
-#include
 
 #include "internal/arm_compute.h"
 #include "internal/arm_compute/Cast.h"
@@ -51,6 +49,7 @@
 #include "internal/layers/GenericReshapeLayer.h"
 #include "internal/layers/SimpleArithmeticAddition.h"
 #include "internal/layers/SimpleCastLayer.h"
+#include "internal/layers/GenericFullyConnectedLayer.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -1997,7 +1996,9 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
   const auto batch_size = _ctx.at(output_index).shape().dim(0);
   const auto input_size = _ctx.at(weight_index).shape().dim(1);
 
-  // Check for reshaping input's shape into rank-2 and do reshaping
+  // Check for reshaping input's shape into rank-2
+  bool needs_reshape = false;
+  nnfw::util::matrix::Shape reshape;
   if (input_rank == 4)
   {
     nnfw::util::feature::Shape ifm_shape_feature = _ctx.at(input_index).shape().asFeature();
@@ -2005,10 +2006,14 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
         ifm_shape_feature.N * ifm_shape_feature.C * ifm_shape_feature.H * ifm_shape_feature.W;
     assert(feature_size == batch_size * input_size);
 
-    // TODO Add reshaping
     _builder.addShapeConstr(
         input_index, asTensorInfo(ifm_shape_feature, _ctx.at(input_index).type(),
                                   _ctx.at(input_index).scale(), _ctx.at(input_index).zeroPoint()));
+
+    // for reshaping
+    needs_reshape = true;
+    reshape.H = batch_size;
+    reshape.W = input_size;
   }
   else if (input_rank == 2)
   {
@@ -2055,29 +2060,19 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
 
   param.activation = static_cast(_ctx.at(activation_index).asScalar());
 
-  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+  auto stage = [param, needs_reshape, reshape](const IAllocationContext &ctx,
+                                               IExecutionBuilder &builder) {
     auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
     auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index});
     auto weight_alloc = ctx.at(::internal::tflite::operand::Index{param.weight_index});
     auto bias_alloc = ctx.at(::internal::tflite::operand::Index{param.bias_index});
 
-    if (::internal::arm_compute::isGpuMode())
-    {
-      auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>();
-
-      fn->configure(CAST_CL(input_alloc), CAST_CL(weight_alloc), CAST_CL(bias_alloc),
-                    CAST_CL(output_alloc));
+    auto fn = nnfw::make_unique<GenericFullyConnectedLayer>();
 
-      builder.append("FullyConnected", std::move(fn));
-    }
-    else // NEON
-    {
-      auto fn = nnfw::make_unique<::arm_compute::NEFullyConnectedLayer>();
+    fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc, needs_reshape,
+                  asTensorShape(reshape));
 
-      fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc);
-
-      builder.append("FullyConnected", std::move(fn));
-    }
+    builder.append("FullyConnected", std::move(fn));
 
     ActivationBuilder{builder}.append(param.activation, output_alloc);
   };
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
new file mode 100644
index 0000000..33255a9
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
@@ -0,0 +1,74 @@
+#include "GenericFullyConnectedLayer.h"
+#include "internal/arm_compute.h"
+
+#include
+
+void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input,
+                                           ::arm_compute::ITensor *weights,
+                                           ::arm_compute::ITensor *biases,
+                                           ::arm_compute::ITensor *output, bool needs_reshape,
+                                           ::arm_compute::TensorShape reshape)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  // TODO Too much duplicated code. Revise the code below.
+  if (::internal::arm_compute::isGpuMode())
+  {
+    if (_needs_reshape)
+    {
+      // reshape
+      auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_CL(_input), &_cl_buffer);
+
+      _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+
+      // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+      _cl_buffer.allocator()->allocate();
+    }
+    else
+    {
+      _cl_fc.configure(CAST_CL(_input), CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+    }
+  }
+  else
+  {
+    if (_needs_reshape)
+    {
+      // reshape
+      auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_NE(_input), &_neon_buffer);
+
+      _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+
+      // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate
+      // here.
+      _neon_buffer.allocator()->allocate();
+    }
+    else
+    {
+      _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+    }
+  }
+}
+
+void GenericFullyConnectedLayer::run(void)
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
+    if (_needs_reshape)
+      _generic_reshape.run();
+
+    _cl_fc.run();
+  }
+  else
+  {
+    if (_needs_reshape)
+      _generic_reshape.run();
+
+    _neon_fc.run();
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
new file mode 100644
index 0000000..bc4960a
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
@@ -0,0 +1,37 @@
+#ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__
+#define __GENERIC_FULLY_CONNECTED_LAYER_H__
+
+#include
+#include
+#include
+#include
+#include "internal/layers/GenericReshapeLayer.h"
+
+class GenericFullyConnectedLayer : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
+                 ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape,
+                 ::arm_compute::TensorShape reshape);
+
+public:
+  void run(void) override;
+
+private:
+  ::arm_compute::ITensor *_input;
+  ::arm_compute::ITensor *_weights;
+  ::arm_compute::ITensor *_biases;
+  ::arm_compute::ITensor *_output;
+
+  // buffer for reshaping input tensor
+  ::arm_compute::CLTensor _cl_buffer;
+  ::arm_compute::Tensor _neon_buffer;
+
+private:
+  ::arm_compute::CLFullyConnectedLayer _cl_fc;
+  ::arm_compute::NEFullyConnectedLayer _neon_fc;
+  GenericReshapeLayer _generic_reshape;
+  bool _needs_reshape;
+};
+
+#endif // __GENERIC_FULLY_CONNECTED_LAYER_H__
-- 
2.7.4
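
Usage sketch (illustrative, not part of the diff above): the stage lambda in
compilation.cc is the intended caller, but the layer can also be exercised
directly as below. The helper name fc_example, the tensor pointers, and the
batch_size = 1 / input_size = 32 values are assumptions made for this sketch;
only configure() and run() come from the patch.

#include <arm_compute/core/ITensor.h>
#include <arm_compute/core/TensorShape.h>

#include "internal/layers/GenericFullyConnectedLayer.h"

// Drive a fully-connected layer whose input arrives as a 4-D feature map.
// When needs_reshape is true, GenericFullyConnectedLayer first reshapes the
// input to a rank-2 batch_size x input_size matrix through its internal
// buffer, then runs the CL or NEON FC kernel.
void fc_example(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
                ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output)
{
  GenericFullyConnectedLayer fc;

  // Assumed example values: batch_size = 1, input_size = 32.
  // arm_compute::TensorShape takes the innermost (width) dimension first.
  const ::arm_compute::TensorShape reshape(32U, 1U);

  fc.configure(input, weights, biases, output, /*needs_reshape=*/true, reshape);
  fc.run(); // runs the internal GenericReshapeLayer first when needs_reshape is true
}

Because the CL/NEON branch and the intermediate buffer live inside the layer,
the Planner's stage lambda no longer has to switch on
::internal::arm_compute::isGpuMode() itself.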