[neurun] Use 'GenericFullyConnectedLayer' (#4396)
author Sujin Kim/On-Device Lab(SR)/Engineer/Samsung Electronics <sjsujin.kim@samsung.com>
Wed, 13 Feb 2019 00:58:45 +0000 (09:58 +0900)
committer Hyeongseok Oh/On-Device Lab(SR)/Staff Engineer/Samsung Electronics <hseok82.oh@samsung.com>
Wed, 13 Feb 2019 00:58:45 +0000 (09:58 +0900)
* [neurun] Use `GenericFullyConnectedLayer`

We introduced `GenericFullyConnectedLayer` to support 4D input.
This commit switches to `GenericFullyConnectedLayer` instead of using `CLFullyConnectedLayer` directly.

As a result, `ConstantInitializer` no longer needs to handle 4D input tensors for the `FullyConnected` operation, so that handling has been removed.

Signed-off-by: sjsujinkim <sjsujin.kim@samsung.com>
* Implement calcOffset for the cpu backend and add a workaround
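
For illustration, here is a minimal sketch of the flattening rule that the reshape implements (the struct and function names are hypothetical, not the real neurun types):

    #include <cassert>

    // A 4D NHWC feature map collapses into the rank-2 shape a
    // fully-connected layer expects: (batch_size, input_size).
    struct Shape4D { int N, H, W, C; };
    struct Shape2D { int H, W; };

    Shape2D flattenForFullyConnected(const Shape4D &ifm, int batch_size, int input_size)
    {
      // The reshape must preserve the total element count.
      assert(ifm.N * ifm.H * ifm.W * ifm.C == batch_size * input_size);
      return Shape2D{batch_size /* H */, input_size /* W */};
    }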

runtimes/neurun/src/backend/acl_cl/StageGenerator.cc
runtimes/neurun/src/backend/cpu/operand/Tensor.cc
runtimes/neurun/src/compiler/ConstantInitializer.cc
tests/nnapi/nnapi_gtest.skip.armv7l-linux.neurun

diff --git a/runtimes/neurun/src/backend/acl_cl/StageGenerator.cc b/runtimes/neurun/src/backend/acl_cl/StageGenerator.cc
index 2c71b7e..0c1831d 100644
 #include <arm_compute/runtime/CL/functions/CLConvolutionLayer.h>
 #include <arm_compute/runtime/CL/functions/CLPoolingLayer.h>
 #include <arm_compute/runtime/CL/functions/CLActivationLayer.h>
-#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
 #include <arm_compute/runtime/CL/functions/CLSoftmaxLayer.h>
 #include <arm_compute/runtime/CL/functions/CLArithmeticAddition.h>
 #include <arm_compute/runtime/misc/functions/GenericReshapeLayer.h>
 #include <arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h>
+#include <arm_compute/runtime/misc/functions/GenericFullyConnectedLayer.h>
 
 #include "kernel/ConcatLayer.h"
 
@@ -39,6 +39,8 @@
 
 #include "util/Utils.h"
 
+#include "Convert.h"
+
 template <typename T> std::unique_ptr<T> make_layer(void) { return std::unique_ptr<T>{new T}; }
 
 std::unique_ptr<::neurun::backend::acl_cl::kernel::CLFunction>
@@ -682,6 +684,40 @@ void StageGenerator::visit(const model::operation::FullyConnectedNode &node)
   const auto bias_index{node.getInputs().at(FullyConnectedNode::Input::BIAS)};
   const auto activation_index{node.param().activation_index};
 
+  auto tensors = _tensor_builder;
+
+  const auto input_rank = _ctx.at(input_index).shape().rank();
+  // TODO We do not yet handle the case where the input's rank is 3.
+  //      Support for it should be added in the future.
+  assert(input_rank != 3);
+
+  const auto output_size = _ctx.at(output_index).shape().dim(1);
+  UNUSED_RELEASE(output_size);
+  assert(_ctx.at(bias_index).shape().dim(0) == output_size);
+  assert(_ctx.at(weight_index).shape().dim(0) == output_size);
+  const auto batch_size = _ctx.at(output_index).shape().dim(0);
+  const auto input_size = _ctx.at(weight_index).shape().dim(1);
+
+  // Check for reshaping input's shape into rank-2
+  bool needs_reshape = false;
+  neurun::model::operand::Shape reshape(2);
+  if (input_rank == 4)
+  {
+    nnfw::misc::feature::Shape ifm_shape_feature = _ctx.at(input_index).shape().asFeature();
+    auto feature_size =
+        ifm_shape_feature.N * ifm_shape_feature.C * ifm_shape_feature.H * ifm_shape_feature.W;
+
+    UNUSED_RELEASE(feature_size);
+    assert(feature_size == batch_size * input_size);
+
+    tensors->dimCorrection(input_index, false);
+
+    // for reshaping
+    needs_reshape = true;
+    reshape.dim(0) = batch_size; /* H */
+    reshape.dim(1) = input_size; /* W */
+  }
+
   // Construct operation parameters
   struct Param
   {
@@ -692,6 +728,9 @@ void StageGenerator::visit(const model::operation::FullyConnectedNode &node)
     model::operand::Index bias_index;
 
     FuseCode activation;
+
+    bool needs_reshape;
+    neurun::model::operand::Shape reshape;
   };
 
   Param param;
@@ -703,7 +742,8 @@ void StageGenerator::visit(const model::operation::FullyConnectedNode &node)
 
   param.activation = static_cast<FuseCode>(_ctx.at(activation_index).asScalar<int32_t>());
 
-  auto tensors = _tensor_builder;
+  param.needs_reshape = needs_reshape;
+  param.reshape = reshape;
 
   returnStage([tensors, param](IExecutionBuilder &builder) {
     auto output_alloc = tensors->at(param.output_index).get();
@@ -711,10 +751,10 @@ void StageGenerator::visit(const model::operation::FullyConnectedNode &node)
     auto weight_alloc = tensors->at(param.weight_index).get();
     auto bias_alloc = tensors->at(param.bias_index).get();
 
-    auto fn = make_layer<::arm_compute::CLFullyConnectedLayer>();
+    auto fn = make_layer<arm_compute::misc::GenericFullyConnectedLayer>();
 
     fn->configure(input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(),
-                  output_alloc->handle());
+                  output_alloc->handle(), param.needs_reshape, asTensorShape(param.reshape));
 
     auto acl_fn = make_cl_function(std::move(fn));
 
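The new `configure` arguments are the interesting part: a flag saying whether the input must be reshaped to rank-2 first, and the target shape. An annotated version of the call above (the parameter roles are inferred from this diff, not from the ACL-misc headers):

    // Roles inferred from this commit; treat them as an assumption, not verified docs.
    fn->configure(input_alloc->handle(),           // input tensor (possibly 4D)
                  weight_alloc->handle(),          // weights
                  bias_alloc->handle(),            // bias
                  output_alloc->handle(),          // output tensor
                  param.needs_reshape,             // reshape input to rank-2 first?
                  asTensorShape(param.reshape));   // target (batch_size, input_size) shape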
diff --git a/runtimes/neurun/src/backend/cpu/operand/Tensor.cc b/runtimes/neurun/src/backend/cpu/operand/Tensor.cc
index 5df5c9e..081e5ef 100644
@@ -25,9 +25,35 @@ namespace cpu
 namespace operand
 {
 
-size_t Tensor::calcOffset(const neurun::util::feature::Coordinate4D &)
+size_t Tensor::calcOffset(const neurun::util::feature::Coordinate4D &coords)
 {
-  throw std::runtime_error("offset_element_in_bytes is not supported for cpu::Tensor now.");
+  size_t rank = num_dimensions();
+  assert(rank == 2 || rank == 3);
+
+  size_t offset = 0;
+
+  switch (rank)
+  {
+    case 2:
+    {
+      size_t W = dimension(1);
+
+      offset += coords.h() * W;
+      break;
+    }
+    case 3:
+    {
+      size_t H = dimension(1);
+      size_t W = dimension(2);
+
+      offset += coords.c() * H * W;
+      offset += coords.h() * W;
+      break;
+    }
+    default:
+      throw std::runtime_error("Not supported rank in Tensor::calcOffset");
+  }
+  return offset;
 }
 
 } // namespace operand
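
To make the offset arithmetic concrete, here is a worked example for the rank-3 case (the numbers are illustrative):

    // Rank-3 tensor with dimension(0) = C = 2, H = dimension(1) = 3,
    // W = dimension(2) = 4. For a coordinate with c() == 1 and h() == 2:
    //   offset = c * H * W + h * W = 1 * 12 + 2 * 4 = 20
    // Two things to note: the value counts elements, not bytes, and
    // coords.w() is never added, so the result is the offset of the start
    // of row (c, h), which is exactly what the row-wise memcpy in
    // ConstantInitializer below relies on.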
diff --git a/runtimes/neurun/src/compiler/ConstantInitializer.cc b/runtimes/neurun/src/compiler/ConstantInitializer.cc
index 3c298ad..57f62d3 100644
@@ -68,75 +68,23 @@ void ConstantInitializer::run(const model::operand::Index &ind,
       }
       case 2:
       {
-        // Find corresponding FullyConnected IFM
-        auto operation_index = _graph.operands().at(index).getUses().list().front();
-        auto operation = &_graph.operations().at(operation_index);
-        auto fc_operation =
-            dynamic_cast<const neurun::model::operation::FullyConnectedNode *>(operation);
+        auto matrix_shape = shape.asMatrix();
 
-        if (fc_operation != nullptr)
+        for (auto h = 0; h < matrix_shape.H; ++h)
         {
-          // NOTE We must know the IFM shape to deduce 2D weight shape from 4D IFM.
-          //      This is because of NHWC/NCHW layout, the order of mapping will be different.
-          auto ifm_index = fc_operation->getInputs().at(
-              neurun::model::operation::FullyConnectedNode::Input::INPUT);
-          const auto &ifm = _graph.operands().at(ifm_index);
-          const auto ifm_shape = ifm.shape().asFeature();
-          const auto num_output = shape.dim(0);
-
-          const ::nnfw::misc::feature::Shape ker_shape{num_output, ifm_shape.C, ifm_shape.H,
-                                                       ifm_shape.W};
-          const util::feature::nhwc::Reader<T> from{ker_shape, base, size};
-
+          neurun::util::feature::Coordinate4D coord{0, h, 0, 0};
+          // TODO: Change this WORKAROUND
           if (layout == neurun::graph::operand::Layout::NHWC)
           {
-            ::nnfw::misc::feature::iterate(ker_shape)
-                << [&](uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) {
-                     const auto value = from.at(nth, ch, row, col);
-
-                     uint32_t offset = 0;
-
-                     // NNAPI uses NHWC ordering
-                     offset += nth * ifm_shape.H * ifm_shape.W * ifm_shape.C;
-                     offset += row * ifm_shape.W * ifm_shape.C;
-                     offset += col * ifm_shape.C;
-                     offset += ch;
-
-                     T *into = reinterpret_cast<T *>(tensor.buffer()) + offset;
-
-                     *into = value;
-                   };
+            memcpy(reinterpret_cast<T *>(tensor.buffer()) + tensor.calcOffset(coord),
+                   reinterpret_cast<const T *>(base) + h * matrix_shape.W,
+                   matrix_shape.W * sizeof(T));
           }
           else
           {
             assert(layout == neurun::graph::operand::Layout::NCHW);
-
-            ::nnfw::misc::feature::iterate(ker_shape)
-                << [&](uint32_t nth, uint32_t ch, uint32_t row, uint32_t col) {
-                     const auto value = from.at(nth, ch, row, col);
-
-                     uint32_t offset = 0;
-
-                     // 'NCHW' ordering
-                     offset += nth * ifm_shape.C * ifm_shape.H * ifm_shape.W;
-                     offset += ch * ifm_shape.H * ifm_shape.W;
-                     offset += row * ifm_shape.W;
-                     offset += col;
-
-                     T *into = reinterpret_cast<T *>(tensor.buffer()) + offset;
-
-                     *into = value;
-                   };
-          }
-        }
-        else // operation != fc operation
-        {
-          auto matrix_shape = shape.asMatrix();
-
-          for (auto h = 0; h < matrix_shape.H; ++h)
-          {
-            neurun::util::feature::Coordinate4D coord{0, h, 0, 0};
-            memcpy(tensor.buffer() + tensor.calcOffset(coord), base + h * matrix_shape.W,
+            memcpy(tensor.buffer() + tensor.calcOffset(coord),
+                   reinterpret_cast<const T *>(base) + h * matrix_shape.W,
                    matrix_shape.W * sizeof(T));
           }
         }
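
With the FullyConnected-specific 4D weight remapping gone, the 2D path reduces to a plain row-by-row copy. A standalone sketch of that pattern, assuming row_offset(h) returns an element offset like calcOffset above (a hypothetical helper, not part of this commit):

    #include <cstddef>
    #include <cstring>

    // Copy an H x W matrix of T from a packed source into a destination
    // whose rows may not be contiguous; row_offset(h) yields the element
    // offset of row h in the destination buffer.
    template <typename T, typename RowOffsetFn>
    void copyMatrixRows(void *dst, const void *src, int H, int W, RowOffsetFn row_offset)
    {
      for (int h = 0; h < H; ++h)
      {
        std::memcpy(static_cast<T *>(dst) + row_offset(h),
                    static_cast<const T *>(src) + static_cast<std::size_t>(h) * W,
                    W * sizeof(T));
      }
    }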
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.neurun b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.neurun
index 19eda7e..42a6c64 100644
@@ -62,8 +62,6 @@ GeneratedTests.strided_slice_ex*
 GeneratedTests.tensorflowmax_ex*
 GeneratedTests.reduce_sum_ex*
 GeneratedTests.topk_v2*
-# Unhandled exception
-GeneratedTests.fully_connected*
 # Unexpected result
 GeneratedTests.split*
 GeneratedTests.transpose_conv*