Introduce GenericFullyConnectedLayer for fc (#2525)
author Yongseop Kim / Motion Control Lab (SR) / Engineer / Samsung Electronics <yons.kim@samsung.com>
Thu, 30 Aug 2018 05:00:04 +0000 (14:00 +0900)
committer Chunseok Lee / Motion Control Lab (SR) / Staff Engineer / Samsung Electronics <chunseok.lee@samsung.com>
Thu, 30 Aug 2018 05:00:04 +0000 (14:00 +0900)
The 4D FC tests in GeneratedTests fail because the runtime cannot yet
infer how to flatten a 4D input down to 2D on its own. The new
GenericFullyConnectedLayer handles this case (see the worked example
after the file list below).

Signed-off-by: Yongseop Kim <yons.kim@samsung.com>
runtimes/pure_arm_compute/src/compilation.cc
runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc [new file with mode: 0644]
runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h [new file with mode: 0644]

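A worked example of the shape bookkeeping this commit adds, with hypothetical shapes (not taken from the actual tests): a rank-4 input [N=1, H=4, W=4, C=8] feeding a weight matrix of [num_units x 128] flattens to a rank-2 input [1, 128].

  #include <cassert>
  #include <cstdint>

  int main()
  {
    // Hypothetical rank-4 input: [N=1, H=4, W=4, C=8] -> 128 elements total.
    const uint32_t N = 1, H = 4, W = 4, C = 8;
    // batch_size comes from the output's dim(0), input_size from the weight's dim(1).
    const uint32_t batch_size = 1, input_size = 128;

    // Flattening is only legal when it preserves the element count;
    // this mirrors the assert the planner performs in the diff below.
    assert(N * C * H * W == batch_size * input_size);

    // GenericFullyConnectedLayer then reshapes the input to
    // rank-2 [batch_size, input_size] = [1, 128] before running FC.
    return 0;
  }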
index 5d7b521..6958fd6 100644 (file)
@@ -12,7 +12,6 @@
 #include <arm_compute/runtime/CL/functions/CLScale.h>
 #include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
 #include <arm_compute/runtime/CL/functions/CLStridedSlice.h>
-#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
 #include <arm_compute/runtime/CL/functions/CLSoftmaxLayer.h>
 #include <arm_compute/runtime/CL/functions/CLGather.h>
 #include <arm_compute/runtime/CL/functions/CLTopKV2.h>
@@ -35,7 +34,6 @@
 #include <arm_compute/runtime/NEON/functions/NEPoolingLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEActivationLayer.h>
 #include <arm_compute/runtime/NEON/functions/NEConvolutionLayer.h>
-#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
 
 #include "internal/arm_compute.h"
 #include "internal/arm_compute/Cast.h"
@@ -51,6 +49,7 @@
 #include "internal/layers/GenericReshapeLayer.h"
 #include "internal/layers/SimpleArithmeticAddition.h"
 #include "internal/layers/SimpleCastLayer.h"
+#include "internal/layers/GenericFullyConnectedLayer.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -1997,7 +1996,9 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
   const auto batch_size = _ctx.at(output_index).shape().dim(0);
   const auto input_size = _ctx.at(weight_index).shape().dim(1);
 
-  // Check for reshaping input's shape into rank-2 and do reshaping
+  // Check whether the input's shape needs to be reshaped into rank-2
+  bool needs_reshape = false;
+  nnfw::util::matrix::Shape reshape;
   if (input_rank == 4)
   {
     nnfw::util::feature::Shape ifm_shape_feature = _ctx.at(input_index).shape().asFeature();
@@ -2005,10 +2006,14 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
         ifm_shape_feature.N * ifm_shape_feature.C * ifm_shape_feature.H * ifm_shape_feature.W;
     assert(feature_size == batch_size * input_size);
 
-    // TODO Add reshaping
     _builder.addShapeConstr(
         input_index, asTensorInfo(ifm_shape_feature, _ctx.at(input_index).type(),
                                   _ctx.at(input_index).scale(), _ctx.at(input_index).zeroPoint()));
+
+    // Record the rank-2 target shape; GenericFullyConnectedLayer performs the actual reshape
+    needs_reshape = true;
+    reshape.H = batch_size;
+    reshape.W = input_size;
   }
   else if (input_rank == 2)
   {
@@ -2055,29 +2060,19 @@ void Planner::visit(const ::internal::tflite::op::FullyConnected::Node &node)
 
   param.activation = static_cast<FuseCode>(_ctx.at(activation_index).asScalar<int32_t>());
 
-  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+  auto stage = [param, needs_reshape, reshape](const IAllocationContext &ctx,
+                                               IExecutionBuilder &builder) {
     auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
     auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.input_index});
     auto weight_alloc = ctx.at(::internal::tflite::operand::Index{param.weight_index});
     auto bias_alloc = ctx.at(::internal::tflite::operand::Index{param.bias_index});
 
-    if (::internal::arm_compute::isGpuMode())
-    {
-      auto fn = nnfw::make_unique<::arm_compute::CLFullyConnectedLayer>();
-
-      fn->configure(CAST_CL(input_alloc), CAST_CL(weight_alloc), CAST_CL(bias_alloc),
-                    CAST_CL(output_alloc));
+    auto fn = nnfw::make_unique<GenericFullyConnectedLayer>();
 
-      builder.append("FullyConnected", std::move(fn));
-    }
-    else // NEON
-    {
-      auto fn = nnfw::make_unique<::arm_compute::NEFullyConnectedLayer>();
+    fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc, needs_reshape,
+                  asTensorShape(reshape));
 
-      fn->configure(input_alloc, weight_alloc, bias_alloc, output_alloc);
-
-      builder.append("FullyConnected", std::move(fn));
-    }
+    builder.append("FullyConnected", std::move(fn));
 
     ActivationBuilder{builder}.append(param.activation, output_alloc);
   };
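The notable design choice in this hunk: the isGpuMode() branch moves out of the visitor and into a single IFunction wrapper, so the stage lambda builds one function regardless of backend. A minimal sketch of that pattern (illustrative only, not part of this commit; CLReshapeLayer/NEReshapeLayer stand in for any CL/NEON function pair, and CAST_CL/CAST_NE come from internal/arm_compute/Cast.h):

  #include <arm_compute/core/ITensor.h>
  #include <arm_compute/runtime/IFunction.h>
  #include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
  #include <arm_compute/runtime/NEON/functions/NEReshapeLayer.h>
  #include "internal/arm_compute.h"
  #include "internal/arm_compute/Cast.h"

  class BackendAgnosticFn : public ::arm_compute::IFunction
  {
  public:
    void configure(::arm_compute::ITensor *in, ::arm_compute::ITensor *out)
    {
      _gpu = ::internal::arm_compute::isGpuMode();
      if (_gpu)
        _cl.configure(CAST_CL(in), CAST_CL(out)); // GPU path
      else
        _ne.configure(CAST_NE(in), CAST_NE(out)); // NEON path
    }

    void run(void) override { _gpu ? _cl.run() : _ne.run(); }

  private:
    ::arm_compute::CLReshapeLayer _cl;
    ::arm_compute::NEReshapeLayer _ne;
    bool _gpu = false;
  };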
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.cc
new file mode 100644 (file)
index 0000000..33255a9
--- /dev/null
@@ -0,0 +1,74 @@
+#include "GenericFullyConnectedLayer.h"
+#include "internal/arm_compute.h"
+
+#include <arm_compute/core/Helpers.h>
+
+void GenericFullyConnectedLayer::configure(::arm_compute::ITensor *input,
+                                           ::arm_compute::ITensor *weights,
+                                           ::arm_compute::ITensor *biases,
+                                           ::arm_compute::ITensor *output, bool needs_reshape,
+                                           ::arm_compute::TensorShape reshape)
+{
+  _input = input;
+  _weights = weights;
+  _biases = biases;
+  _output = output;
+  _needs_reshape = needs_reshape;
+
+  // TODO: Too much code is duplicated between the GPU and NEON paths. Revise the code below.
+  if (::internal::arm_compute::isGpuMode())
+  {
+    if (_needs_reshape)
+    {
+      // reshape
+      auto_init_if_empty(*_cl_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_CL(_input), &_cl_buffer);
+
+      _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+
+      // NOTE _cl_buffer is inaccessible from outside, and thus it is safe to invoke allocate here.
+      _cl_buffer.allocator()->allocate();
+    }
+    else
+    {
+      _cl_fc.configure(CAST_CL(_input), CAST_CL(_weights), CAST_CL(_biases), CAST_CL(_output));
+    }
+  }
+  else
+  {
+    if (_needs_reshape)
+    {
+      // reshape
+      auto_init_if_empty(*_neon_buffer.info(), _input->info()->clone()->set_tensor_shape(reshape));
+      _generic_reshape.configure(CAST_NE(_input), &_neon_buffer);
+
+      _neon_fc.configure(&_neon_buffer, CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+
+      // NOTE _neon_buffer is inaccessible from outside, and thus it is safe to invoke allocate
+      // here.
+      _neon_buffer.allocator()->allocate();
+    }
+    else
+    {
+      _neon_fc.configure(CAST_NE(_input), CAST_NE(_weights), CAST_NE(_biases), CAST_NE(_output));
+    }
+  }
+}
+
+void GenericFullyConnectedLayer::run(void)
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
+    if (_needs_reshape)
+      _generic_reshape.run();
+
+    _cl_fc.run();
+  }
+  else
+  {
+    if (_needs_reshape)
+      _generic_reshape.run();
+
+    _neon_fc.run();
+  }
+}
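One lifecycle detail worth noting in configure() above: the intermediate buffer's backing memory is allocated only after both its producer and consumer are configured, which is the usual ACL pattern (configure establishes tensor info; allocate commits memory). A condensed view of the GPU path, using the same members:

  // Condensed from configure() above (GPU path), with the ordering made explicit.
  auto_init_if_empty(*_cl_buffer.info(),
                     _input->info()->clone()->set_tensor_shape(reshape)); // derive buffer info from the input
  _generic_reshape.configure(CAST_CL(_input), &_cl_buffer);               // producer of _cl_buffer
  _cl_fc.configure(&_cl_buffer, CAST_CL(_weights), CAST_CL(_biases),
                   CAST_CL(_output));                                     // consumer of _cl_buffer
  _cl_buffer.allocator()->allocate();                                     // allocate last, after all users are configured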
diff --git a/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h b/runtimes/pure_arm_compute/src/internal/layers/GenericFullyConnectedLayer.h
new file mode 100644 (file)
index 0000000..bc4960a
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef __GENERIC_FULLY_CONNECTED_LAYER_H__
+#define __GENERIC_FULLY_CONNECTED_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
+#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
+#include "internal/layers/GenericReshapeLayer.h"
+
+class GenericFullyConnectedLayer : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *weights,
+                 ::arm_compute::ITensor *biases, ::arm_compute::ITensor *output, bool needs_reshape,
+                 ::arm_compute::TensorShape reshape);
+
+public:
+  void run(void) override;
+
+private:
+  ::arm_compute::ITensor *_input;
+  ::arm_compute::ITensor *_weights;
+  ::arm_compute::ITensor *_biases;
+  ::arm_compute::ITensor *_output;
+
+  // buffer for reshaping input tensor
+  ::arm_compute::CLTensor _cl_buffer;
+  ::arm_compute::Tensor _neon_buffer;
+
+private:
+  ::arm_compute::CLFullyConnectedLayer _cl_fc;
+  ::arm_compute::NEFullyConnectedLayer _neon_fc;
+  GenericReshapeLayer _generic_reshape;
+  bool _needs_reshape;
+};
+
+#endif // __GENERIC_FULLY_CONNECTED_LAYER_H__
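Typical use, as wired up in compilation.cc above (a hypothetical sketch; in4d, w, b and out are assumed to be already-allocated ITensor pointers, and the shape values match the worked example at the top):

  GenericFullyConnectedLayer fc;

  // Rank-2 target shape for the flattened input. ACL's TensorShape lists
  // dimensions from dim0 (width) upward, so [batch=1, input_size=128]
  // is presumably expressed as {128u, 1u} by asTensorShape.
  fc.configure(in4d, w, b, out, /*needs_reshape=*/true,
               ::arm_compute::TensorShape{128u, 1u});

  fc.run(); // reshape (when requested), then the fully connected function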