From 3a7ed54157eaccd346974964e8c8c88248c6cb89 Mon Sep 17 00:00:00 2001
From: Jonghyun Park/Motion Control Lab(SR)/Staff Engineer/Samsung Electronics
Date: Thu, 31 May 2018 09:56:31 +0900
Subject: [PATCH] [Pure CL] Support Tensor Sinks (#1440)

This commit revises the pure CL runtime to support generic tensor outputs.
Outputs that previously failed with "Not supported, yet" are now handled by
the new TensorSink, which copies a tensor of arbitrary rank from the ARM
Compute layout back into the NNAPI (row-major) layout.

Signed-off-by: Jonghyun Park
---
 runtimes/pure_arm_compute/src/execution.cc         | 20 ++++--
 runtimes/pure_arm_compute/src/internal/Sinks.h     | 62 +++++++++++++++++
 .../src/internal/arm_compute/tensor/View.h         | 75 +++++++++++++++++++++
 .../src/internal/nnapi/tensor/View.h               | 77 ++++++++++++++++++++++
 4 files changed, 228 insertions(+), 6 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/Sinks.h
 create mode 100644 runtimes/pure_arm_compute/src/internal/arm_compute/tensor/View.h
 create mode 100644 runtimes/pure_arm_compute/src/internal/nnapi/tensor/View.h
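
(Reviewer note; commentary placed between the diffstat and the diff is not
part of the commit and is ignored by git am.) The new
internal::nnapi::tensor::View addresses elements with the usual row-major
rule: the stride of an axis is the product of all dimensions that follow it,
so the last axis varies fastest. The standalone sketch below restates that
rule outside the runtime; the free function offset_of and the main() driver
are hypothetical illustrations, not code from this patch.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Row-major element offset: start from the first coordinate, then for each
// later axis multiply by that axis' extent and add its coordinate. This
// mirrors internal::nnapi::tensor::View::offset_of below.
uint32_t offset_of(const std::vector<uint32_t> &shape, const std::vector<uint32_t> &index)
{
  assert(shape.size() == index.size());

  if (shape.empty())
  {
    return 0; // A rank-0 (scalar) tensor holds exactly one element
  }

  uint32_t offset = index[0];

  for (std::size_t axis = 1; axis < shape.size(); ++axis)
  {
    offset *= shape[axis];
    offset += index[axis];
  }

  return offset;
}

int main()
{
  // In a 2x3x4 tensor, element (1, 2, 3) lives at ((1 * 3) + 2) * 4 + 3 = 23,
  // the last slot of a 24-element buffer.
  std::cout << offset_of({2, 3, 4}, {1, 2, 3}) << std::endl; // prints 23
  return 0;
}
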
diff --git a/runtimes/pure_arm_compute/src/execution.cc b/runtimes/pure_arm_compute/src/execution.cc
index f56a6bd..0214b6a 100644
--- a/runtimes/pure_arm_compute/src/execution.cc
+++ b/runtimes/pure_arm_compute/src/execution.cc
@@ -7,6 +7,7 @@
 
 #include "internal/nnapi/feature/View.h"
 #include "internal/arm_compute/feature/View.h"
+#include "internal/Sinks.h"
 
 #include "util/feature/IndexIterator.h"
 
@@ -191,14 +192,11 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution *execution, int32_t index,
 
   // TODO Check type conflicts
 
-  // NOTE The current implemenation assumes that every output is a feature map.
-  // TODO Remove this assumption
   const auto operand_index = execution->plan().model().outputs.at(index);
 
-  if (operands.at(operand_index).shape().rank() == 2)
+  if ((operands.at(operand_index).shape().rank() == 2) &&
+      (operands.at(operand_index).shape().dim(0) == 1))
   {
-    assert(operands.at(operand_index).shape().dim(0) == 1);
-
     const auto len = operands.at(operand_index).shape().dim(1);
 
     execution->sink<VectorSink>(index, len, reinterpret_cast<uint8_t *>(buffer), length);
@@ -211,7 +209,17 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution *execution, int32_t index,
   }
   else
   {
-    throw std::runtime_error{"Not supported, yet"};
+    // NOTE TensorSink is much slower than VectorSink and FeatureSink
+    const uint32_t rank = operands.at(operand_index).shape().rank();
+    // TODO Remove conversion
+    nnfw::util::tensor::Shape shape(rank);
+
+    for (uint32_t axis = 0; axis < rank; ++axis)
+    {
+      shape.dim(axis) = operands.at(operand_index).shape().dim(axis);
+    }
+
+    execution->sink<TensorSink>(index, shape, reinterpret_cast<uint8_t *>(buffer), length);
   }
 
   return ANEURALNETWORKS_NO_ERROR;
diff --git a/runtimes/pure_arm_compute/src/internal/Sinks.h b/runtimes/pure_arm_compute/src/internal/Sinks.h
new file mode 100644
index 0000000..ecbffe8
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/Sinks.h
@@ -0,0 +1,62 @@
+#ifndef __INTERNAL_SINKS_H__
+#define __INTERNAL_SINKS_H__
+
+// TODO Move Sink base class to here
+#include "execution.h"
+
+#include <arm_compute/core/ITensor.h>
+
+// TODO Move VectorSink and FeatureSink into here
+
+//
+// TensorSink
+//
+#include "internal/Swizzle.h"
+
+#include "internal/nnapi/tensor/View.h"
+#include "internal/arm_compute/tensor/View.h"
+
+#include "util/tensor/IndexIterator.h"
+
+class TensorSink final : public Sink
+{
+public:
+  TensorSink(const nnfw::util::tensor::Shape &shape, uint8_t *base, const size_t size)
+      : _shape{shape}, _base{base}, _size{size}
+  {
+    // DO NOTHING
+  }
+
+public:
+  void pull(::arm_compute::ITensor &tensor) const override
+  {
+    const ::internal::arm_compute::tensor::View<float> from{_shape, &tensor};
+    ::internal::nnapi::tensor::View<float> into{_shape, _base, _size};
+
+    using ::nnfw::util::tensor::iterate;
+    using ::nnfw::util::tensor::Index;
+
+    const uint32_t rank = _shape.rank();
+
+    ::nnfw::util::tensor::iterate(_shape) << [&](const Index &raw) {
+      Index permuted(raw.rank());
+
+      for (uint32_t axis = 0; axis < rank; ++axis)
+      {
+        permuted.at(ToARMComputeAxis(rank, axis).value()) = raw.at(axis);
+      }
+
+      const auto value = from.at(permuted);
+      into.at(raw) = value;
+    };
+  }
+
+private:
+  const nnfw::util::tensor::Shape _shape;
+
+private:
+  uint8_t *const _base;
+  const size_t _size;
+};
+
+#endif // __INTERNAL_SINKS_H__
diff --git a/runtimes/pure_arm_compute/src/internal/arm_compute/tensor/View.h b/runtimes/pure_arm_compute/src/internal/arm_compute/tensor/View.h
new file mode 100644
index 0000000..a5af6ed
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/arm_compute/tensor/View.h
@@ -0,0 +1,75 @@
+#ifndef __INTERNAL_ARM_COMPUTE_TENSOR_VIEW_H__
+#define __INTERNAL_ARM_COMPUTE_TENSOR_VIEW_H__
+
+#include "util/tensor/Shape.h"
+#include "util/tensor/Index.h"
+
+#include <arm_compute/core/ITensor.h>
+
+namespace internal
+{
+namespace arm_compute
+{
+namespace tensor
+{
+
+template <typename T> class View
+{
+public:
+  View(const nnfw::util::tensor::Shape &shape, ::arm_compute::ITensor *tensor)
+      : _shape{shape}, _tensor{tensor}
+  {
+    // DO NOTHING
+  }
+
+public:
+  const nnfw::util::tensor::Shape &shape(void) const { return _shape; }
+
+private:
+  uint32_t byte_offset_of(const nnfw::util::tensor::Index &index) const
+  {
+    const uint32_t rank = _shape.rank();
+
+    ::arm_compute::Coordinates coordinates;
+
+    coordinates.set_num_dimensions(rank);
+
+    for (uint32_t axis = 0; axis < rank; ++axis)
+    {
+      coordinates[axis] = index.at(axis);
+    }
+
+    return _tensor->info()->offset_element_in_bytes(coordinates);
+  }
+
+public:
+  T at(const nnfw::util::tensor::Index &index) const
+  {
+    const auto offset = byte_offset_of(index);
+
+    T *ptr = reinterpret_cast<T *>(_tensor->buffer() + offset);
+
+    return *ptr;
+  }
+
+  T &at(const nnfw::util::tensor::Index &index)
+  {
+    const auto offset = byte_offset_of(index);
+
+    T *ptr = reinterpret_cast<T *>(_tensor->buffer() + offset);
+
+    return *ptr;
+  }
+
+private:
+  const nnfw::util::tensor::Shape _shape;
+
+private:
+  ::arm_compute::ITensor *_tensor;
+};
+
+} // namespace tensor
+} // namespace arm_compute
+} // namespace internal
+
+#endif // __INTERNAL_ARM_COMPUTE_TENSOR_VIEW_H__
diff --git a/runtimes/pure_arm_compute/src/internal/nnapi/tensor/View.h b/runtimes/pure_arm_compute/src/internal/nnapi/tensor/View.h
new file mode 100644
index 0000000..e521088
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/nnapi/tensor/View.h
@@ -0,0 +1,77 @@
+#ifndef __INTERNAL_NNAPI_TENSOR_VIEW_H__
+#define __INTERNAL_NNAPI_TENSOR_VIEW_H__
+
+#include "util/tensor/Shape.h"
+#include "util/tensor/Index.h"
+
+namespace internal
+{
+namespace nnapi
+{
+namespace tensor
+{
+
+template <typename T> class View
+{
+public:
+  View(const ::nnfw::util::tensor::Shape &shape, uint8_t *ptr, size_t len)
+      : _shape{shape}, _ptr{ptr}, _len{len}
+  {
+    // DO NOTHING
+  }
+
+public:
+  const nnfw::util::tensor::Shape &shape(void) const { return _shape; }
+
+private:
+  uint32_t offset_of(const nnfw::util::tensor::Index &index) const
+  {
+    if (_shape.rank() == 0)
+    {
+      return 0;
+    }
+
+    uint32_t offset = index.at(0);
+
+    // Stride decreases as axis increases in NNAPI
+    for (uint32_t axis = 1; axis < _shape.rank(); ++axis)
+    {
+      offset *= _shape.dim(axis);
+      offset += index.at(axis);
+    }
+
+    return offset;
+  }
+
+public:
+  T at(const nnfw::util::tensor::Index &index) const
+  {
+    const auto offset = offset_of(index);
+
+    T *arr = reinterpret_cast<T *>(_ptr);
+
+    return arr[offset];
+  }
+
+  T &at(const nnfw::util::tensor::Index &index)
+  {
+    const auto offset = offset_of(index);
+
+    T *arr = reinterpret_cast<T *>(_ptr);
+
+    return arr[offset];
+  }
+
+private:
+  nnfw::util::tensor::Shape _shape;
+
+private:
+  uint8_t *_ptr;
+  const size_t _len;
+};
+
+} // namespace tensor
+} // namespace nnapi
+} // namespace internal
+
+#endif // __INTERNAL_NNAPI_TENSOR_VIEW_H__
-- 
2.7.4
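
P.S. (reviewer illustration, not part of the patch): TensorSink::pull visits
every NNAPI index, permutes it with ToARMComputeAxis, reads that element
through the ARM Compute view, and writes it at the unpermuted index through
the NNAPI view. The standalone sketch below replays that permute-and-copy on
plain buffers. It assumes the permutation is a plain axis reversal; the real
Swizzle.h helper may treat some ranks specially, and iterate/offset_of here
are hypothetical stand-ins for the runtime's utilities.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

using Index = std::vector<uint32_t>;
using Shape = std::vector<uint32_t>;

// Visit every index of `shape` in row-major order (last axis fastest),
// playing the role of nnfw::util::tensor::iterate.
void iterate(const Shape &shape, const std::function<void(const Index &)> &fn)
{
  Index index(shape.size(), 0);

  std::function<void(std::size_t)> recurse = [&](std::size_t axis) {
    if (axis == shape.size())
    {
      fn(index);
      return;
    }

    for (index[axis] = 0; index[axis] < shape[axis]; ++index[axis])
    {
      recurse(axis + 1);
    }
  };

  recurse(0);
}

// Row-major element offset (the same rule as internal::nnapi::tensor::View)
uint32_t offset_of(const Shape &shape, const Index &index)
{
  uint32_t offset = 0;

  for (std::size_t axis = 0; axis < shape.size(); ++axis)
  {
    offset = offset * shape[axis] + index[axis];
  }

  return offset;
}

int main()
{
  // A 2x3 NNAPI output whose backing buffer on the ARM Compute side is
  // stored with the axes reversed (3x2).
  const Shape nnapi_shape{2, 3};
  const Shape acl_shape{3, 2};

  std::vector<float> acl_buffer(6);
  std::iota(acl_buffer.begin(), acl_buffer.end(), 0.0f); // 0, 1, 2, 3, 4, 5

  std::vector<float> nnapi_buffer(6, 0.0f);

  // For each NNAPI index, read the element at the axis-reversed index on the
  // ARM Compute side: the same permute-and-copy loop as TensorSink::pull.
  iterate(nnapi_shape, [&](const Index &raw) {
    const Index permuted(raw.rbegin(), raw.rend());

    nnapi_buffer[offset_of(nnapi_shape, raw)] = acl_buffer[offset_of(acl_shape, permuted)];
  });

  for (float v : nnapi_buffer)
  {
    std::cout << v << ' ';
  }
  std::cout << std::endl; // prints: 0 2 4 1 3 5

  return 0;
}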