Optimized TensorSink (#1665)
author윤현식/동작제어Lab(SR)/Principal Engineer/삼성전자 <hyunsik.yoon@samsung.com>
Thu, 14 Jun 2018 11:09:58 +0000 (20:09 +0900)
committer이춘석/동작제어Lab(SR)/Staff Engineer/삼성전자 <chunseok.lee@samsung.com>
Thu, 14 Jun 2018 11:09:58 +0000 (20:09 +0900)
Parent issue : #1658

memcpy is used, just as in #1647.
Tested with `tools/nnapi_unittests/test/mul_1` and `mul_2`.

Signed-off-by: Hyun Sik Yoon <hyunsik.yoon@samsung.com>
runtimes/pure_arm_compute/src/execution.cc
runtimes/pure_arm_compute/src/internal/Tensor3DSink.h [new file with mode: 0644]

index 12c3ec2..194464a 100644 (file)
@@ -12,6 +12,7 @@
 #include "internal/arm_compute/feature/View.h"
 #include "internal/Sinks.h"
 #include "internal/MatrixSink.h"
+#include "internal/Tensor3DSink.h"
 
 #include "util/feature/IndexIterator.h"
 
@@ -285,20 +286,36 @@ static void asTensorSink(ANeuralNetworksExecution *execution, int32_t type, int3
   {
     case ANEURALNETWORKS_FLOAT32:
     case ANEURALNETWORKS_TENSOR_FLOAT32:
-      execution->sink<TensorSink<float>>(index, shape, reinterpret_cast<float *>(buffer), length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<float>>(index, shape, reinterpret_cast<float *>(buffer),
+                                             length);
+      else
+        execution->sink<TensorSink<float>>(index, shape, reinterpret_cast<float *>(buffer), length);
       break;
     case ANEURALNETWORKS_INT32:
     case ANEURALNETWORKS_TENSOR_INT32:
-      execution->sink<TensorSink<int32_t>>(index, shape, reinterpret_cast<int32_t *>(buffer),
-                                           length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<int32_t>>(index, shape, reinterpret_cast<int32_t *>(buffer),
+                                               length);
+      else
+        execution->sink<TensorSink<int32_t>>(index, shape, reinterpret_cast<int32_t *>(buffer),
+                                             length);
       break;
     case ANEURALNETWORKS_UINT32:
-      execution->sink<TensorSink<uint32_t>>(index, shape, reinterpret_cast<uint32_t *>(buffer),
-                                            length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<uint32_t>>(index, shape, reinterpret_cast<uint32_t *>(buffer),
+                                                length);
+      else
+        execution->sink<TensorSink<uint32_t>>(index, shape, reinterpret_cast<uint32_t *>(buffer),
+                                              length);
       break;
     case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
-      execution->sink<TensorSink<uint8_t>>(index, shape, reinterpret_cast<uint8_t *>(buffer),
-                                           length);
+      if (shape.rank() == 3)
+        execution->sink<Tensor3DSink<uint8_t>>(index, shape, reinterpret_cast<uint8_t *>(buffer),
+                                               length);
+      else
+        execution->sink<TensorSink<uint8_t>>(index, shape, reinterpret_cast<uint8_t *>(buffer),
+                                             length);
       break;
     default:
       throw std::runtime_error("Not supported, yet");
@@ -423,6 +440,7 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution *execution, int3
   else
   {
     // NOTE TensorSink is much slower than VectorSink and FeatureSink
+    //      in case of 3D tensor, optimized Tensor3DSink is called inside asTensorSink
     const auto &shape = operands.at(operand_index).shape();
     asTensorSink(execution, output_type, index, shape, buffer, length);
   }
diff --git a/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h b/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h
new file mode 100644 (file)
index 0000000..c5fad3f
--- /dev/null
@@ -0,0 +1,60 @@
+#ifndef __TENSOR3D_SINK_H__
+#define __TENSOR3D_SINK_H__
+
+#include "internal/Sink.h"
+
+//
+// This is memcpy() version of generic TensorSink for 3D tensor
+//
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/core/Window.h>
+#include <arm_compute/core/Helpers.h>
+
+#include <cstring> // memcpy()
+
+// Pulls a rank-3 tensor out of an ACL ITensor into a caller-provided
+// flat buffer, copying one contiguous X-row per memcpy() call.
+template <typename T> class Tensor3DSink final : public Sink
+{
+public:
+  Tensor3DSink(const nnfw::util::tensor::Shape &shape, T *base, const size_t size)
+      : _shape{shape}, _base{base}, _size{size}
+  {
+    // DO NOTHING
+  }
+
+public:
+  void pull(::arm_compute::ITensor &tensor) const override
+  {
+    using ::arm_compute::Window;
+    using ::arm_compute::Iterator;
+    using ::arm_compute::Coordinates;
+    using ::arm_compute::execute_window_loop;
+
+    Window window;
+
+    // Collapse X: the window iterates over Y and Z only, so each step
+    // copies one whole row of `width` elements at once.
+    window.use_tensor_dimensions(tensor.info()->tensor_shape(), ::arm_compute::Window::DimY);
+    int32_t height_width = _shape.dim(1) * _shape.dim(2);
+    int32_t width = _shape.dim(2);
+
+    Iterator it(&tensor, window);
+    execute_window_loop(window,
+                        [&](const ::arm_compute::Coordinates &id) {
+                          const auto z = id.z();
+                          const auto y = id.y();
+                          memcpy(_base + z * height_width + y * width, it.ptr(), width * sizeof(T));
+                        },
+                        it);
+  }
+
+private:
+  const nnfw::util::tensor::Shape _shape;
+
+private:
+  T *const _base;
+  const size_t _size; // NOTE(review): stored but never checked against the copy extent
+};
+
+#endif // __TENSOR3D_SINK_H__