From d6971cbfaf3f5898970141badbb024177a42fbaa Mon Sep 17 00:00:00 2001 From: =?utf8?q?=EC=9C=A4=ED=98=84=EC=8B=9D/=EB=8F=99=EC=9E=91=EC=A0=9C?= =?utf8?q?=EC=96=B4Lab=28SR=29/Principal=20Engineer/=EC=82=BC=EC=84=B1?= =?utf8?q?=EC=A0=84=EC=9E=90?= Date: Thu, 14 Jun 2018 20:09:58 +0900 Subject: [PATCH] Optimized TensorSink (#1665) Parent issue : #1658 memcpy is used just like #1647. Tested with `tools/nnapi_unittests/test/mul_1` and `mul_2`. Signed-off-by: Hyun Sik Yoon hyunsik.yoon@samsung.com --- runtimes/pure_arm_compute/src/execution.cc | 32 ++++++++++--- .../pure_arm_compute/src/internal/Tensor3DSink.h | 54 ++++++++++++++++++++++ 2 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 runtimes/pure_arm_compute/src/internal/Tensor3DSink.h diff --git a/runtimes/pure_arm_compute/src/execution.cc b/runtimes/pure_arm_compute/src/execution.cc index 12c3ec2..194464a 100644 --- a/runtimes/pure_arm_compute/src/execution.cc +++ b/runtimes/pure_arm_compute/src/execution.cc @@ -12,6 +12,7 @@ #include "internal/arm_compute/feature/View.h" #include "internal/Sinks.h" #include "internal/MatrixSink.h" +#include "internal/Tensor3DSink.h" #include "util/feature/IndexIterator.h" @@ -285,20 +286,36 @@ static void asTensorSink(ANeuralNetworksExecution *execution, int32_t type, int3 { case ANEURALNETWORKS_FLOAT32: case ANEURALNETWORKS_TENSOR_FLOAT32: - execution->sink>(index, shape, reinterpret_cast(buffer), length); + if (shape.rank() == 3) + execution->sink>(index, shape, reinterpret_cast(buffer), + length); + else + execution->sink>(index, shape, reinterpret_cast(buffer), length); break; case ANEURALNETWORKS_INT32: case ANEURALNETWORKS_TENSOR_INT32: - execution->sink>(index, shape, reinterpret_cast(buffer), - length); + if (shape.rank() == 3) + execution->sink>(index, shape, reinterpret_cast(buffer), + length); + else + execution->sink>(index, shape, reinterpret_cast(buffer), + length); break; case ANEURALNETWORKS_UINT32: - execution->sink>(index, shape, 
reinterpret_cast(buffer), - length); + if (shape.rank() == 3) + execution->sink>(index, shape, reinterpret_cast(buffer), + length); + else + execution->sink>(index, shape, reinterpret_cast(buffer), + length); break; case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM: - execution->sink>(index, shape, reinterpret_cast(buffer), - length); + if (shape.rank() == 3) + execution->sink>(index, shape, reinterpret_cast(buffer), + length); + else + execution->sink>(index, shape, reinterpret_cast(buffer), + length); break; default: throw std::runtime_error("Not supported, yet"); @@ -423,6 +440,7 @@ int ANeuralNetworksExecution_setOutput(ANeuralNetworksExecution *execution, int3 else { // NOTE TensorSink is much slower than VectorSink and FeatureSink + // in case of 3D tensor, optimized Tensor3DSink is called inside asTensorSink const auto &shape = operands.at(operand_index).shape(); asTensorSink(execution, output_type, index, shape, buffer, length); } diff --git a/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h b/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h new file mode 100644 index 0000000..c5fad3f --- /dev/null +++ b/runtimes/pure_arm_compute/src/internal/Tensor3DSink.h @@ -0,0 +1,54 @@ +#ifndef __TENSOR3D_SINK_H__ +#define __TENSOR3D_SINK_H__ + +#include "internal/Sink.h" + +// +// This is memcpy() version of generic TensorSink for 3D tensor +// +#include +#include +#include + +template class Tensor3DSink final : public Sink +{ +public: + Tensor3DSink(const nnfw::util::tensor::Shape &shape, T *base, const size_t size) + : _shape{shape}, _base{base}, _size{size} + { + // DO NOTHING + } + +public: + void pull(::arm_compute::ITensor &tensor) const override + { + using ::arm_compute::Window; + using ::arm_compute::Iterator; + using ::arm_compute::Coordinates; + using ::arm_compute::execute_window_loop; + + Window window; + + window.use_tensor_dimensions(tensor.info()->tensor_shape(), ::arm_compute::Window::DimY); + int32_t height_width = _shape.dim(1) * _shape.dim(2); + 
int32_t width = _shape.dim(2); + + Iterator it(&tensor, window); + execute_window_loop(window, + [&](const ::arm_compute::Coordinates &id) { + const auto z = id.z(); + const auto y = id.y(); + memcpy(_base + z * height_width + y * width, it.ptr(), width * sizeof(T)); + }, + it); + } + +private: + const nnfw::util::tensor::Shape _shape; + +private: + T *const _base; + const size_t _size; +}; + +#endif // __TENSOR3D_SINK_H__ -- 2.7.4