[neurun] Remove permutation layer for model's input/output and permute in-place ...
authorДилшоджон Умронхонович Пошшоев/AI Tools Lab /SRR/Engineer/삼성전자 <d.poshshoev@samsung.com>
Tue, 27 Nov 2018 10:34:31 +0000 (13:34 +0300)
committer오형석/동작제어Lab(SR)/Staff Engineer/삼성전자 <hseok82.oh@samsung.com>
Tue, 27 Nov 2018 10:34:31 +0000 (19:34 +0900)
Permute while copying from/to the interpreter: this eliminates the extra
copy performed by the permutation layer. Currently this handles only
NHWC_TO_NCHW on input and NCHW_TO_NHWC on output.

Signed-off-by: Poshshoev Dilshodzhon <d.poshshoev@samsung.com>
runtimes/neurun/src/exec/Sink.h
runtimes/neurun/src/exec/Source.h
runtimes/neurun/src/frontend/execution.cc
runtimes/neurun/src/graph/Graph.cc
runtimes/neurun/src/graph/pass/PermutationEliminationPass.h [changed mode: 0755->0644]

index 3cc7a05..c9747d7 100644 (file)
 #include <cassert>
 
 #include <arm_compute/core/ITensor.h>
+#include "nnfw/std/memory.h"
+#include "kernel/cpu/PermuteLayer.h"
+#include "backend/cpu/operand/Tensor.h"
+#include "util/feature/nhwc/View.h"
+#include "util/feature/nchw/View.h"
+#include <util/feature/IndexIterator.h>
 
 namespace neurun
 {
@@ -49,6 +55,102 @@ private:
   const size_t _size;
 };
 
+class PermutateSink final : public ISink
+{
+public:
+  PermutateSink(neurun::backend::cpu::operand::Tensor output, const graph::operand::Shape &shape)
+      : _output{output}, _shape{shape}
+  {
+  }
+
+public:
+  void pull(::arm_compute::ITensor &tensor) const override
+  {
+    // do NCHW_TO_NHWC permutation
+    auto input_buffer = tensor.buffer();
+
+    auto output_buffer = _output.buffer();
+    auto output_size = _output.info()->total_size();
+    auto rank = _shape.rank();
+    switch (rank)
+    {
+      case 0:
+      case 1:
+      {
+        memcpy(output_buffer, input_buffer, output_size);
+        break;
+      }
+      case 2:
+      {
+        using ::arm_compute::Window;
+        using ::arm_compute::Iterator;
+
+        Window window;
+        window.use_tensor_dimensions(tensor.info()->tensor_shape(), Window::DimY);
+
+        Iterator it(&tensor, window);
+
+        int output_width = _shape.asMatrix().W;
+
+        const auto &y = window[Window::DimY];
+        for (auto h = y.start(); h < y.end(); h += y.step(), it.increment(Window::DimY))
+        {
+          memcpy(output_buffer + h * output_width, it.ptr(), output_width * sizeof(output_buffer));
+        }
+        break;
+      }
+      case 3:
+      {
+        using ::arm_compute::Window;
+        using ::arm_compute::Iterator;
+
+        const int32_t height_width = _shape.dim(1) * _shape.dim(2);
+        const int32_t width = _shape.dim(2);
+
+        Window window;
+        window.use_tensor_dimensions(tensor.info()->tensor_shape(), Window::DimY);
+
+        Iterator it(&tensor, window);
+
+        const auto &z = window[Window::DimZ];
+        const auto &y = window[Window::DimY];
+        for (auto c = z.start(); c < z.end(); c += z.step(), it.increment(Window::DimZ))
+        {
+          for (auto h = y.start(); h < y.end(); h += y.step(), it.increment(Window::DimY))
+          {
+            memcpy(output_buffer + c * height_width + h * width, it.ptr(),
+                   width * sizeof(output_buffer));
+          }
+        }
+        break;
+      }
+      case 4:
+      {
+        auto feature = _shape.asFeature();
+
+        // TODO Fix this workaround (We may need codegen::operand::Object instead of ITensor)
+        const util::feature::nchw::View<float> from{&tensor};
+        util::feature::nhwc::View<float> into{feature, reinterpret_cast<float *>(output_buffer),
+                                              output_size};
+
+        ::nnfw::util::feature::iterate(feature)
+            << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
+                 const auto value = from.at(batch, ch, row, col);
+                 into.at(batch, ch, row, col) = value;
+               };
+        break;
+      }
+      default:
+        throw "NYI";
+        break;
+    }
+  }
+
+private:
+  const neurun::backend::cpu::operand::Tensor _output;
+  const graph::operand::Shape _shape;
+};
+
 } // namespace exec
 } // namespace neurun
 
index 47d1c73..86c7ec5 100644 (file)
 #include <cassert>
 
 #include <arm_compute/core/ITensor.h>
+#include "kernel/cpu/PermuteLayer.h"
+#include "nnfw/std/memory.h"
+#include "backend/cpu/operand/Tensor.h"
+#include "util/feature/nchw/View.h"
+#include "util/feature/nhwc/Reader.h"
+#include <util/feature/IndexIterator.h>
 
 namespace neurun
 {
@@ -49,6 +55,103 @@ private:
   const size_t _size;
 };
 
+class PermutateSource final : public ISource
+{
+public:
+  PermutateSource(neurun::backend::cpu::operand::Tensor input, const graph::operand::Shape &shape)
+      : _input{input}, _shape{shape}
+  {
+  }
+
+public:
+  void push(::arm_compute::ITensor &tensor) const override
+  {
+    // do NHWC_TO_NCHW permutation
+    auto input_buffer = _input.buffer();
+    auto input_size = _input.info()->total_size();
+
+    auto output_buffer = tensor.buffer();
+    auto rank = _shape.rank();
+    switch (rank)
+    {
+      case 0:
+      case 1:
+      {
+        memcpy(output_buffer, input_buffer, input_size);
+        break;
+      }
+      case 2:
+      {
+        using ::arm_compute::Window;
+        using ::arm_compute::Iterator;
+
+        auto matrix_shape = _shape.asMatrix();
+
+        Window window;
+        window.use_tensor_dimensions(tensor.info()->tensor_shape(), Window::DimY);
+
+        Iterator it(&tensor, window);
+
+        const auto &y = window[Window::DimY];
+        for (auto h = y.start(); h < y.end(); h += y.step(), it.increment(Window::DimY))
+        {
+          memcpy(it.ptr(), input_buffer + h * matrix_shape.W,
+                 matrix_shape.W * sizeof(input_buffer));
+        }
+        break;
+      }
+      case 3:
+      {
+        using ::arm_compute::Window;
+        using ::arm_compute::Iterator;
+
+        const int32_t height_width = _shape.dim(1) * _shape.dim(2);
+        const int32_t width = _shape.dim(2);
+
+        Window window;
+        window.use_tensor_dimensions(tensor.info()->tensor_shape(), Window::DimY);
+
+        Iterator it(&tensor, window);
+
+        const auto &z = window[Window::DimZ];
+        const auto &y = window[Window::DimY];
+        for (auto c = z.start(); c < z.end(); c += z.step(), it.increment(Window::DimZ))
+        {
+          for (auto h = y.start(); h < y.end(); h += y.step(), it.increment(Window::DimY))
+          {
+            memcpy(it.ptr(), input_buffer + c * height_width + h * width,
+                   width * sizeof(input_buffer));
+          }
+        }
+        break;
+      }
+      case 4:
+      {
+        auto feature = _shape.asFeature();
+
+        const util::feature::nhwc::Reader<float> from{
+            feature, reinterpret_cast<const float *>(input_buffer), input_size};
+        util::feature::nchw::View<float> into{&tensor};
+
+        // TODO Fix this workaround (We may need codegen::operand::Object instead of ITensor)
+        ::nnfw::util::feature::iterate(feature)
+            << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
+                 const auto value = from.at(batch, ch, row, col);
+                 into.at(batch, ch, row, col) = value;
+               };
+        break;
+      }
+      default:
+        throw "NYI";
+        break;
+    }
+  }
+
+private:
+  const neurun::backend::cpu::operand::Tensor _input;
+  const graph::operand::Shape _shape;
+};
+
 } // namespace exec
 } // namespace neurun
 
index bba792e..64ed46a 100644 (file)
 
 #include "graph/operand/DataType.h"
 #include "graph/operand/Index.h"
+#include "kernel/cpu/PermuteLayer.h"
+#include "backend/cpu/operand/Tensor.h"
+#include "internal/Convert.h"
+#include "graph/operand/Layout.h"
+#include "backend/BackendManager.h"
+#include "backend/interface/IConfig.h"
+#include "compiler/BackendResolver.h"
 
 inline void source(ANeuralNetworksExecution *execution,
                    const ::neurun::graph::operand::DataType &type, int32_t index,
                    const void *buffer, size_t length)
 {
+  const auto &operands = execution->plan().model().operands();
+  neurun::graph::operand::IO::Index input_index{index};
+
+  const auto operand_index = execution->plan().model().getInputs().at(input_index);
+  auto operand = &operands.at(operand_index);
+  auto operand_li = operand->lower_info();
+  const auto output_backend = operand_li->def_backends().getOnlyElement();
+  const auto output_layout = output_backend->config()->getOperandLayout();
+  auto input_layout = execution->plan()
+                          .model()
+                          .backend_resolver()
+                          ->getDefaultBackend()
+                          ->config()
+                          ->getOperandLayout();
+  if (input_layout == neurun::graph::operand::Layout::NHWC &&
+      output_layout == neurun::graph::operand::Layout::NCHW)
+  {
+    const auto tensor_info = ::internal::asTensorInfo(operand->shape(), operand->typeInfo());
+    auto tensor_from_interp = neurun::backend::cpu::operand::Tensor(tensor_info);
+    tensor_from_interp.setBuffer((uint8_t *)buffer);
+
+    execution->source<::neurun::exec::PermutateSource>(index, tensor_from_interp, operand->shape());
+    return;
+  }
   using ::neurun::graph::operand::DataType;
   switch (type)
   {
@@ -60,6 +91,30 @@ inline void sink(ANeuralNetworksExecution *execution,
                  const ::neurun::graph::operand::DataType &type, int32_t index, void *buffer,
                  size_t length)
 {
+  const auto &operands = execution->plan().model().operands();
+  neurun::graph::operand::IO::Index input_index{index};
+
+  const auto operand_index = execution->plan().model().getOutputs().at(input_index);
+  auto operand = &operands.at(operand_index);
+  auto operand_li = operand->lower_info();
+  const auto input_backend = operand_li->def_backends().getOnlyElement();
+  const auto input_layout = input_backend->config()->getOperandLayout();
+  auto output_layout = execution->plan()
+                           .model()
+                           .backend_resolver()
+                           ->getDefaultBackend()
+                           ->config()
+                           ->getOperandLayout();
+  if (input_layout == neurun::graph::operand::Layout::NCHW &&
+      output_layout == neurun::graph::operand::Layout::NHWC)
+  {
+    const auto tensor_info = ::internal::asTensorInfo(operand->shape(), operand->typeInfo());
+    auto tensor_from_interp = neurun::backend::cpu::operand::Tensor(tensor_info);
+    tensor_from_interp.setBuffer((uint8_t *)buffer);
+
+    execution->sink<::neurun::exec::PermutateSink>(index, tensor_from_interp, operand->shape());
+    return;
+  }
   using ::neurun::graph::operand::DataType;
   switch (type)
   {
index 33aa25c..f581831 100644 (file)
@@ -30,6 +30,7 @@
 #include "backend/interface/IConfig.h"
 #include "operation/PermuteNode.h"
 #include "pass/PermutationInsertionPass.h"
+#include "pass/PermutationEliminationPass.h"
 
 namespace neurun
 {
@@ -207,6 +208,8 @@ void Graph::lower(void)
   {
     pass::PermutationInsertionPass pi_pass(*this);
     pi_pass.run();
+    pass::PermutationEliminationPass pe_pass(*this);
+    pe_pass.run();
   }
 
   // Graph verifications for the LOWERED phase