From d6971cbfaf3f5898970141badbb024177a42fbaa Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EC=9C=A4=ED=98=84=EC=8B=9D/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Principal=20Engineer/=EC=82=BC=EC=84=B1?=
 =?utf8?q?=EC=A0=84=EC=9E=90?= <hyunsik.yoon@samsung.com>
Date: Thu, 14 Jun 2018 20:09:58 +0900
Subject: [PATCH] Optimized TensorSink (#1665)

Parent issue : #1658

memcpy is used just like #1647.
Tested with `tools/nnapi_unittests/test/mul_1` and `mul_2`.

Signed-off-by: Hyun Sik Yoon <hyunsik.yoon@samsung.com>
---
 runtimes/pure_arm_compute/src/execution.cc         | 32 ++++++++++---
 .../pure_arm_compute/src/internal/Tensor3DSink.h   | 54 ++++++++++++++++++++++
 2 files changed, 79 insertions(+), 7 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/Tensor3DSink.h
diff --git a/runtimes/pure_arm_compute/src/execution.cc b/runtimes/pure_arm_compute/src/execution.cc
index 12c3ec2..194464a 100644
--- a/runtimes/pure_arm_compute/src/execution.cc
+++ b/runtimes/pure_arm_compute/src/execution.cc
@@ -12,6 +12,7 @@
 #include "internal/arm_compute/feature/View.h"
 #include "internal/Sinks.h"
 #include "internal/MatrixSink.h"
+#include "internal/Tensor3DSink.h"
 
 #include "util/feature/IndexIterator.h"
 
@@ -285,20 +286,36 @@ static void asTensorSink(ANeuralNetworksExecution *execution, int32_t type, int3
   {
     case ANEURALNETWORKS_FLOAT32:
     case ANEURALNETWORKS_TENSOR_FLOAT32:
-      execution->sink<TensorSink<float>>(index, shape, reinterpret_cast<float *>(buffer), length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<float>>(index, shape, reinterpret_cast<float *>(buffer),
+                                             length);
+      else
+        execution->sink<TensorSink<float>>(index, shape, reinterpret_cast<float *>(buffer), length);
       break;
     case ANEURALNETWORKS_INT32:
     case ANEURALNETWORKS_TENSOR_INT32:
-      execution->sink<TensorSink<int32_t>>(index, shape, reinterpret_cast<int32_t *>(buffer),
-                                           length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<int32_t>>(index, shape, reinterpret_cast<int32_t *>(buffer),
+                                               length);
+      else
+        execution->sink<TensorSink<int32_t>>(index, shape, reinterpret_cast<int32_t *>(buffer),
+                                             length);
       break;
     case ANEURALNETWORKS_UINT32:
-      execution->sink<TensorSink<uint32_t>>(index, shape, reinterpret_cast<uint32_t *>(buffer),
-                                            length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<uint32_t>>(index, shape, reinterpret_cast<uint32_t *>(buffer),
+                                                length);
+      else
+        execution->sink<TensorSink<uint32_t>>(index, shape, reinterpret_cast<uint32_t *>(buffer),
+                                              length);
       break;
     case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
-      execution->sink<TensorSink<uint8_t>>(index, shape, reinterpret_cast<uint8_t *>(buffer),
-                                           length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<uint8_t>>(index, shape, reinterpret_cast<uint8_t *>(buffer),
+                                               length);
+      else
+        execution->sink<TensorSink<uint8_t>>(index, shape, reinterpret_cast<uint8_t *>(buffer),
+                                             length);
       break;
     default:
       throw std::runtime_error("Not supported, yet");
@@ -423,6 +440,7 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution *execution, int3
   else
   {
     // NOTE TensorSink is much slower than VectorSink and FeatureSink
+    //      in case of 3D tensor, optimized Tensor3DSink is called inside asTensorSink
     const auto &shape = operands.at(operand_index).shape();
     asTensorSink(execution, output_type, index, shape, buffer, length);
   }
diff --git a/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h b/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h
new file mode 100644
index 0000000..c5fad3f
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h
@@ -0,0 +1,54 @@
+#ifndef __TENSOR3D_SINK_H__
+#define __TENSOR3D_SINK_H__
+
+#include "internal/Sink.h"
+
+//
+// This is a memcpy() version of the generic TensorSink, specialized for 3D tensors
+//
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/core/Window.h>
+#include <arm_compute/core/Helpers.h>
+
+template <typename T> class Tensor3DSink final : public Sink
+{
+public:
+  Tensor3DSink(const nnfw::util::tensor::Shape &shape, T *base, const size_t size)
+      : _shape{shape}, _base{base}, _size{size}
+  {
+    // Only records the shape, destination pointer and size; nothing else to set up
+  }
+
+public:
+  void pull(::arm_compute::ITensor &tensor) const override
+  {
+    using ::arm_compute::Window;
+    using ::arm_compute::Iterator;
+    using ::arm_compute::Coordinates;
+    using ::arm_compute::execute_window_loop;
+    // Copy 'tensor' into the buffer at _base: one memcpy of _shape.dim(2) elements per loop step
+    Window window;
+    // NOTE(review): starting at DimY presumably leaves X un-iterated (one step per row) - confirm
+    window.use_tensor_dimensions(tensor.info()->tensor_shape(), ::arm_compute::Window::DimY);
+    int32_t height_width = _shape.dim(1) * _shape.dim(2);
+    int32_t width = _shape.dim(2);
+    // NOTE(review): memcpy below needs <cstring>, which this header does not include directly
+    Iterator it(&tensor, window);
+    execute_window_loop(window,
+                        [&](const ::arm_compute::Coordinates &id) {
+                          const auto z = id.z();
+                          const auto y = id.y();
+                          memcpy(_base + z * height_width + y * width, it.ptr(), width * sizeof(T));
+                        },
+                        it);
+  }
+
+private:
+  const nnfw::util::tensor::Shape _shape;
+
+private:
+  T *const _base;
+  const size_t _size; // NOTE(review): stored but never read by pull(); the copy is not bounds-checked
+};
+
+#endif // __TENSOR3D_SINK_H__
-- 
2.7.4