From 340ef41b77622bb52d887296f5d7f63384ca085e Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EB=B0=95=EC=A2=85=ED=98=84/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Staff=20Engineer/=EC=82=BC=EC=84=B1?=
 =?utf8?q?=EC=A0=84=EC=9E=90?= <jh1302.park@samsung.com>
Date: Mon, 11 Jun 2018 20:56:35 +0900
Subject: [PATCH] [Pure CL] Add 'MatrixSink' (#1647)

* [Pure CL] Add 'MatrixSink'

This commit adds 'MatrixSink', which efficiently copies rank-2
(matrix-shaped) outputs.

Signed-off-by: Jonghyun Park <jh1302.park@samsung.com>
---
 runtimes/pure_arm_compute/src/execution.cc         | 44 +++++++++++++++--
 .../pure_arm_compute/src/internal/MatrixSink.h     | 55 ++++++++++++++++++++++
 2 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/MatrixSink.h
diff --git a/runtimes/pure_arm_compute/src/execution.cc b/runtimes/pure_arm_compute/src/execution.cc
index 9810382..faf680e 100644
--- a/runtimes/pure_arm_compute/src/execution.cc
+++ b/runtimes/pure_arm_compute/src/execution.cc
@@ -10,6 +10,7 @@
 
 #include "internal/arm_compute/feature/View.h"
 #include "internal/Sinks.h"
+#include "internal/MatrixSink.h"
 
 #include "util/feature/IndexIterator.h"
 
@@ -190,6 +191,34 @@ static void asVectorSink(ANeuralNetworksExecution *execution, int32_t type, int3
   }
 }
 
+// Install a MatrixSink on output 'index', with its element type selected by the operand 'type';
+// any 'type' without a matching branch below raises std::runtime_error.
+static void asMatrixSink(ANeuralNetworksExecution *execution, int32_t type, int32_t index,
+                         int32_t H, int32_t W, void *buffer, size_t length)
+{
+  if (type == ANEURALNETWORKS_FLOAT32 || type == ANEURALNETWORKS_TENSOR_FLOAT32)
+  {
+    execution->sink<MatrixSink<float>>(index, H, W, reinterpret_cast<float *>(buffer), length);
+  }
+  else if (type == ANEURALNETWORKS_INT32 || type == ANEURALNETWORKS_TENSOR_INT32)
+  {
+    execution->sink<MatrixSink<int32_t>>(index, H, W, reinterpret_cast<int32_t *>(buffer), length);
+  }
+  else if (type == ANEURALNETWORKS_UINT32)
+  {
+    execution->sink<MatrixSink<uint32_t>>(index, H, W, reinterpret_cast<uint32_t *>(buffer),
+                                          length);
+  }
+  else if (type == ANEURALNETWORKS_TENSOR_QUANT8_ASYMM)
+  {
+    execution->sink<MatrixSink<uint8_t>>(index, H, W, reinterpret_cast<uint8_t *>(buffer), length);
+  }
+  else
+  {
+    throw std::runtime_error("Not supported, yet");
+  }
+}
+
 static void asFeatureSink(ANeuralNetworksExecution *execution, int32_t type, int32_t index,
                           const nnfw::util::feature::Shape &shape, void *buffer, size_t length)
 {
@@ -340,15 +369,22 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution *execution, int3
 
   const auto operand_index = execution->plan().model().outputs.at(index);
   int32_t output_type = operands.at(operand_index).type();
-  const auto squeezed_shape = squeeze(operands.at(operand_index).shape());
+  const auto &output_shape = operands.at(operand_index).shape();
 
-  if (squeezed_shape.rank() == 1)
+  if (output_shape.rank() == 1)
   {
-    const auto len = squeezed_shape.dim(0);
+    const auto len = output_shape.dim(0);
 
     asVectorSink(execution, output_type, index, len, buffer, length);
   }
-  else if (squeezed_shape.rank() == 3)
+  else if (output_shape.rank() == 2)
+  {
+    const auto H = output_shape.dim(0);
+    const auto W = output_shape.dim(1);
+
+    asMatrixSink(execution, output_type, index, H, W, buffer, length);
+  }
+  else if ((output_shape.rank() == 4) && (output_shape.dim(0) == 1))
   {
     const auto &operand_shape = operands.at(operand_index).shape().asFeature();
 
diff --git a/runtimes/pure_arm_compute/src/internal/MatrixSink.h b/runtimes/pure_arm_compute/src/internal/MatrixSink.h
new file mode 100644
index 0000000..f37b540
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/MatrixSink.h
@@ -0,0 +1,55 @@
+#ifndef __INTERNAL_MATRIX_SINK_H__
+#define __INTERNAL_MATRIX_SINK_H__
+
+#include "internal/Sink.h"
+
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/core/Window.h>
+#include <arm_compute/core/Helpers.h>
+
+#include <cstdint>
+#include <cstring>
+#include <cassert>
+
+// Sink which copies a rank-2 tensor, one row at a time, into a caller-provided H x W buffer
+template <typename T> class MatrixSink final : public Sink
+{
+public:
+  // 'size' is the capacity of 'base' in bytes; it is validated only through assert
+  MatrixSink(const int32_t H, const int32_t W, T *base, const size_t size)
+      : _height{H}, _width{W}, _base{base}
+  {
+    assert(H >= 0 && W >= 0);
+    // Widen to size_t first: H * W evaluated in int32_t may overflow for large outputs
+    assert(size >= static_cast<size_t>(H) * static_cast<size_t>(W) * sizeof(T));
+  }
+
+  // Copy every row of 'tensor' (dimension 0 is width, dimension 1 is height) into the buffer
+  void pull(::arm_compute::ITensor &tensor) const override
+  {
+    using ::arm_compute::Window;
+    using ::arm_compute::Iterator;
+
+    const auto width = static_cast<size_t>(_width);
+    assert(tensor.info()->dimension(0) == width);
+    assert(tensor.info()->dimension(1) == static_cast<size_t>(_height));
+
+    // Only dimensions from DimY upward come from the tensor shape, so the loop runs per row
+    Window window;
+    window.use_tensor_dimensions(tensor.info()->tensor_shape(), Window::DimY);
+
+    // NOTE(review): assumes the tensor element size equals sizeof(T) - confirm at call sites
+    Iterator it(&tensor, window);
+    const auto copy_row = [&](const ::arm_compute::Coordinates &id) {
+      memcpy(_base + id.y() * width, it.ptr(), width * sizeof(T));
+    };
+    ::arm_compute::execute_window_loop(window, copy_row, it);
+  }
+
+private:
+  const int32_t _height;
+  const int32_t _width;
+  T *const _base;
+};
+
+#endif // __INTERNAL_MATRIX_SINK_H__
-- 
2.7.4