#include <arm_compute/runtime/CL/functions/CLScale.h>
#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
#include <arm_compute/runtime/CL/functions/CLStridedSlice.h>
-#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
#include <arm_compute/runtime/CL/functions/CLSoftmaxLayer.h>
#include <arm_compute/runtime/CL/functions/CLGather.h>
#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
#include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
#include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
#include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
-#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
#include "internal/arm_compute.h"
#include "internal/arm_compute/Cast.h"
#include "internal/layers/GenericReshapeLayer.h"
#include "internal/layers/SimpleArithmeticAddition.h"
#include "internal/layers/SimpleCastLayer.h"
+#include "internal/layers/GenericFullyConnectedLayer.h"
#include "util/matrix/IndexIterator.h"
#include "util/kernel/IndexIterator.h"
const auto batch_size = _ctx.at(output_index).shape().dim(0);
const auto input_size = _ctx.at(weight_index).shape().dim(1);
- // Check for reshaping input's shape into rank-2 and do reshaping
+ // Check for reshaping input's shape into rank-2
+ bool needs_reshape = false;
+ nnfw::util::matrix::Shape reshape;
if (input_rank == 4)
{
nnfw::util::feature::Shape ifm_shape_feature = _ctx.at(input_index).shape().asFeature();
ifm_shape_feature.N * ifm_shape_feature.C * ifm_shape_feature.H * ifm_shape_feature.W;
assert(feature_size == batch_size * input_size);
- // TODO Add reshaping
_builder.addShapeConstr(
input_index, asTensorInfo(ifm_shape_feature, _ctx.at(input_index).type(),
_ctx.at(input_index).scale(), _ctx.at(input_index).zeroPoint()));
+
+ // for reshaping
+ needs_reshape = true;
+ reshape.H = batch_size;
+ reshape.W = input_size;
}
else if (input_rank == 2)
{
param.activation = static_cast<FuseCode>(_ctx.at(activation_index).asScalar<int32_t>());
- auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+ auto stage = [param, needs_reshape, reshape](const IAllocationContext &ctx,
+ IExecutionBuilder &builder) {
auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index});
auto weight_alloc = ctx.at(::internal::tflite::operand::Index{param.weight_index});
auto bias_alloc = ctx.at(::internal::tflite::operand::Index{param.bias_index});
- if (::internal::arm_compute::isGpuMode())
- {
- auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>();
-
- fn->configure(CAST_CL(input_alloc), CAST_CL(weight_alloc), CAST_CL(bias_alloc),
- CAST_CL(output_alloc));
+ auto fn = nnfw::make_unique<GenericFullyConnectedLayer>();
- builder.append("FullyConnected", std::move(fn));
- }
- else // NEON
- {
- auto fn = nnfw::make_unique<::arm_compute::NEFullyConnectedLayer>();
+ fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc, needs_reshape,
+ asTensorShape(reshape));
- fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc);
-
- builder.append("FullyConnected", std::move(fn));
- }
+ builder.append("FullyConnected", std::move(fn));
ActivationBuilder{builder}.append(param.activation, output_alloc);
};
--- /dev/null
+#include "GenericFullyConnectedLayer.h"
+#include "internal/arm_compute.h"
+
+#include <arm_compute/core/Helpers.h>
+
+// Set up the (optional) input-reshape stage and the back-end fully-connected
+// function, dispatching to the CL (GPU) or NEON (CPU) ARM Compute back end
+// depending on the current runtime mode.
+//
+// @param input         Input tensor
+// @param weights       Weight tensor
+// @param biases        Bias tensor
+// @param output        Output tensor
+// @param needs_reshape When true, 'input' is first reshaped to 'reshape'
+//                      before being fed to the fully-connected layer
+// @param reshape       Target shape for the intermediate reshape buffer
+//                      (only meaningful when needs_reshape is true)
+void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input,
+                                           ::arm_compute::ITensor *weights,
+                                           ::arm_compute::ITensor *biases,
+                                           ::arm_compute::ITensor *output, bool needs_reshape,
+                                           ::arm_compute::TensorShape reshape)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  // TODO: Reduce the duplication between the CL and NEON paths below.
+  if (::internal::arm_compute::isGpuMode())
+  {
+    if (_needs_reshape)
+    {
+      // Initialize the intermediate buffer's metadata from the input's info,
+      // but with the requested rank-2 shape, then wire input -> reshape -> FC.
+      auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_CL(_input), &_cl_buffer);
+
+      _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+
+      // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+      _cl_buffer.allocator()->allocate();
+    }
+    else
+    {
+      // No reshape needed: feed the input straight into the CL FC function.
+      _cl_fc.configure(CAST_CL(_input), CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+    }
+  }
+  else
+  {
+    if (_needs_reshape)
+    {
+      // Same wiring as the CL path above, using the NEON tensor/function types.
+      auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_NE(_input), &_neon_buffer);
+
+      _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+
+      // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate
+      // here.
+      _neon_buffer.allocator()->allocate();
+    }
+    else
+    {
+      // No reshape needed: feed the input straight into the NEON FC function.
+      _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+    }
+  }
+}
+
+// Execute the configured pipeline: the optional reshape stage first, then the
+// fully-connected function of whichever back end (CL or NEON) is active.
+void GenericFullyConnectedLayer::run(void)
+{
+  // The reshape stage is common to both back ends; run it (if configured)
+  // before dispatching to the back-end-specific FC function.
+  if (_needs_reshape)
+  {
+    _generic_reshape.run();
+  }
+
+  if (::internal::arm_compute::isGpuMode())
+  {
+    _cl_fc.run();
+  }
+  else
+  {
+    _neon_fc.run();
+  }
+}
--- /dev/null
+#ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__
+#define __GENERIC_FULLY_CONNECTED_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include "internal/layers/GenericReshapeLayer.h"
+
+// Composite function that runs an optional input reshape followed by a
+// fully-connected layer, dispatching to either the CL (GPU) or NEON (CPU)
+// ARM Compute back end at configure/run time.
+class GenericFullyConnectedLayer : public ::arm_compute::IFunction
+{
+public:
+  // Set up the (optional) reshape stage and the back-end FC function.
+  // @param needs_reshape When true, 'input' is first reshaped to 'reshape'
+  //                      before being fed to the fully-connected layer.
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
+                 ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape,
+                 ::arm_compute::TensorShape reshape);
+
+public:
+  void run(void) override;
+
+private:
+  // Borrowed (non-owning) tensor pointers; nullptr until configure() is called.
+  ::arm_compute::ITensor *_input{nullptr};
+  ::arm_compute::ITensor *_weights{nullptr};
+  ::arm_compute::ITensor *_biases{nullptr};
+  ::arm_compute::ITensor *_output{nullptr};
+
+  // Intermediate buffers that hold the reshaped input (used only when
+  // _needs_reshape is true; only the active back end's buffer is allocated).
+  ::arm_compute::CLTensor _cl_buffer;
+  ::arm_compute::Tensor _neon_buffer;
+
+private:
+  ::arm_compute::CLFullyConnectedLayer _cl_fc;
+  ::arm_compute::NEFullyConnectedLayer _neon_fc;
+  GenericReshapeLayer _generic_reshape;
+  // Initialized to false so that reading the flag before configure() (e.g. a
+  // premature run()) is well-defined instead of reading an indeterminate bool.
+  bool _needs_reshape{false};
+};
+
+#endif // __GENERIC_FULLY_CONNECTED_LAYER_H__