From 3ce3098ded332187b395c17c4eeea5ee391e1095 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EA=B9=80=EC=9A=A9=EC=84=AD/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?=
 =?utf8?q?=EC=9E=90?=
Date: Thu, 30 Aug 2018 14:00:04 +0900
Subject: [PATCH] Introduce GenericFullyConnectedLayer for fc (#2525)

The 4D FC tests in GeneratedTests fail because the runtime cannot yet
reshape a 4D input into the 2D input that FullyConnected expects.
GenericFullyConnectedLayer handles this case.

Signed-off-by: Yongseop Kim
---
 runtimes/pure_arm_compute/src/compilation.cc       | 35 +++++-----
 .../internal/layers/GenericFullyConnectedLayer.cc  | 74 ++++++++++++++++++++++
 .../internal/layers/GenericFullyConnectedLayer.h   | 37 +++++++++++
 3 files changed, 126 insertions(+), 20 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h

diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 5d7b521..6958fd6 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -12,7 +12,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -35,7 +34,6 @@
 #include
 #include
 #include
-#include
 
 #include "internal/arm_compute.h"
 #include "internal/arm_compute/Cast.h"
@@ -51,6 +49,7 @@
 #include "internal/layers/GenericReshapeLayer.h"
 #include "internal/layers/SimpleArithmeticAddition.h"
 #include "internal/layers/SimpleCastLayer.h"
+#include "internal/layers/GenericFullyConnectedLayer.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -1997,7 +1996,9 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
   const auto batch_size = _ctx.at(output_index).shape().dim(0);
   const auto input_size = _ctx.at(weight_index).shape().dim(1);
 
-  // Check for reshaping input's shape into rank-2 and do reshaping
+  // Check for reshaping input's shape into rank-2
+  bool needs_reshape = false;
+  nnfw::util::matrix::Shape reshape;
   if (input_rank == 4)
   {
     nnfw::util::feature::Shape ifm_shape_feature = _ctx.at(input_index).shape().asFeature();
@@ -2005,10 +2006,14 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
         ifm_shape_feature.N * ifm_shape_feature.C * ifm_shape_feature.H * ifm_shape_feature.W;
     assert(feature_size == batch_size * input_size);
 
-    // TODO Add reshaping
     _builder.addShapeConstr(
         input_index, asTensorInfo(ifm_shape_feature, _ctx.at(input_index).type(),
                                   _ctx.at(input_index).scale(), _ctx.at(input_index).zeroPoint()));
+
+    // for reshaping
+    needs_reshape = true;
+    reshape.H = batch_size;
+    reshape.W = input_size;
   }
   else if (input_rank == 2)
   {
@@ -2055,29 +2060,19 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
 
   param.activation = static_cast(_ctx.at(activation_index).asScalar());
 
-  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+  auto stage = [param, needs_reshape, reshape](const IAllocationContext &ctx,
+                                               IExecutionBuilder &builder) {
     auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
     auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index});
     auto weight_alloc = ctx.at(::internal::tflite::operand::Index{param.weight_index});
     auto bias_alloc = ctx.at(::internal::tflite::operand::Index{param.bias_index});
 
-    if (::internal::arm_compute::isGpuMode())
-    {
-      auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>();
-
-      fn->configure(CAST_CL(input_alloc), CAST_CL(weight_alloc), CAST_CL(bias_alloc),
-                    CAST_CL(output_alloc));
+    auto fn = nnfw::make_unique<GenericFullyConnectedLayer>();
 
-      builder.append("FullyConnected", std::move(fn));
-    }
-    else // NEON
-    {
-      auto fn = nnfw::make_unique<::arm_compute::NEFullyConnectedLayer>();
+    fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc, needs_reshape,
+                  asTensorShape(reshape));
 
-      fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc);
-
-      builder.append("FullyConnected", std::move(fn));
-    }
+    builder.append("FullyConnected", std::move(fn));
 
     ActivationBuilder{builder}.append(param.activation, output_alloc);
   };
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
new file mode 100644
index 0000000..33255a9
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
@@ -0,0 +1,74 @@
+#include "GenericFullyConnectedLayer.h"
+#include "internal/arm_compute.h"
+
+#include
+
+void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input,
+                                           ::arm_compute::ITensor *weights,
+                                           ::arm_compute::ITensor *biases,
+                                           ::arm_compute::ITensor *output, bool needs_reshape,
+                                           ::arm_compute::TensorShape reshape)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  // TODO Too much duplicated code. Revise the code below.
+  if (::internal::arm_compute::isGpuMode())
+  {
+    if (_needs_reshape)
+    {
+      // reshape
+      auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_CL(_input), &_cl_buffer);
+
+      _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+
+      // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+      _cl_buffer.allocator()->allocate();
+    }
+    else
+    {
+      _cl_fc.configure(CAST_CL(_input), CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+    }
+  }
+  else
+  {
+    if (_needs_reshape)
+    {
+      // reshape
+      auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_NE(_input), &_neon_buffer);
+
+      _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+
+      // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate
+      // here.
+      _neon_buffer.allocator()->allocate();
+    }
+    else
+    {
+      _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+    }
+  }
+}
+
+void GenericFullyConnectedLayer::run(void)
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
+    if (_needs_reshape)
+      _generic_reshape.run();
+
+    _cl_fc.run();
+  }
+  else
+  {
+    if (_needs_reshape)
+      _generic_reshape.run();
+
+    _neon_fc.run();
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
new file mode 100644
index 0000000..bc4960a
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
@@ -0,0 +1,37 @@
+#ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__
+#define __GENERIC_FULLY_CONNECTED_LAYER_H__
+
+#include
+#include
+#include
+#include
+#include "internal/layers/GenericReshapeLayer.h"
+
+class GenericFullyConnectedLayer : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
+                 ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape,
+                 ::arm_compute::TensorShape reshape);
+
+public:
+  void run(void) override;
+
+private:
+  ::arm_compute::ITensor *_input;
+  ::arm_compute::ITensor *_weights;
+  ::arm_compute::ITensor *_biases;
+  ::arm_compute::ITensor *_output;
+
+  // buffer for reshaping input tensor
+  ::arm_compute::CLTensor _cl_buffer;
+  ::arm_compute::Tensor _neon_buffer;
+
+private:
+  ::arm_compute::CLFullyConnectedLayer _cl_fc;
+  ::arm_compute::NEFullyConnectedLayer _neon_fc;
+  GenericReshapeLayer _generic_reshape;
+  bool _needs_reshape;
+};
+
+#endif // __GENERIC_FULLY_CONNECTED_LAYER_H__
-- 
2.7.4
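
Usage sketch (illustrative, not part of the diff above): the stage lambda in
compilation.cc is the intended caller, but the layer can also be exercised
directly as below. The helper name fc_example, the tensor pointers, and the
batch_size = 1 / input_size = 32 values are assumptions made for this sketch;
only configure() and run() come from the patch.

#include <arm_compute/core/ITensor.h>
#include <arm_compute/core/TensorShape.h>

#include "internal/layers/GenericFullyConnectedLayer.h"

// Drive a fully-connected layer whose input arrives as a 4-D feature map.
// When needs_reshape is true, GenericFullyConnectedLayer first reshapes the
// input to a rank-2 batch_size x input_size matrix through its internal
// buffer, then runs the CL or NEON FC kernel.
void fc_example(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
                ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output)
{
  GenericFullyConnectedLayer fc;

  // Assumed example values: batch_size = 1, input_size = 32.
  // arm_compute::TensorShape takes the innermost (width) dimension first.
  const ::arm_compute::TensorShape reshape(32U, 1U);

  fc.configure(input, weights, biases, output, /*needs_reshape=*/true, reshape);
  fc.run(); // runs the internal GenericReshapeLayer first when needs_reshape is true
}

Because the CL/NEON branch and the intermediate buffer live inside the layer,
the Planner's stage lambda no longer has to switch on
::internal::arm_compute::isGpuMode() itself.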