From ce36b15c70c6f0add7b47ad8bc03202a843479cb Mon Sep 17 00:00:00 2001
From: Jiseob Jang/Motion Control Lab(SR)/Engineer/Samsung Electronics
Date: Fri, 17 Aug 2018 15:03:08 +0900
Subject: [PATCH] Unify the order of inputs in STRIDED_SLICE operation &
 Update ACL (#2193)

* Unify the order of inputs in STRIDED_SLICE operation

This commit unifies the order of inputs in the STRIDED_SLICE operation:
- Reorder a scalar's mask bits to match the order of the input data's shape.
- Reorder a vector's data to match the order of the input data's shape.
- Unify the order of inputs in STRIDED_SLICE.

Signed-off-by: jiseob.jang

* Update ACL

This commit updates ACL:
- Extend the range of StridedSlice cases supported by CL.

Signed-off-by: jiseob.jang
---
 externals/acl                                    |  2 +-
 runtimes/pure_arm_compute/src/compilation.cc     | 62 +++++++++++++++++++-----
 runtimes/pure_arm_compute/src/internal/Model.h   | 11 +++++
 runtimes/pure_arm_compute/src/internal/Swizzle.h | 15 ++++++
 4 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/externals/acl b/externals/acl
index b29bc9e..77e9913 160000
--- a/externals/acl
+++ b/externals/acl
@@ -1 +1 @@
-Subproject commit b29bc9ed09561c93f5cffb61e2e74222cd4a4542
+Subproject commit 77e9913a2bab598033c7d2c5179a9bcd80b959e6
diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 0ce957f..0588af8 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -207,6 +207,23 @@ static void initMatrixTensor(::arm_compute::ITensor &tensor,
 }
 
 template <typename T>
+static void initReorderVectorTensor(::arm_compute::ITensor &tensor, const uint8_t *vec_base,
+                                    const size_t vec_size)
+{
+  for (uint32_t n = 0; n < vec_size; ++n)
+  {
+    const ::arm_compute::Coordinates coordinate{ToARMComputeAxis(vec_size, n).value()};
+
+    T *into = reinterpret_cast<T *>(tensor.ptr_to_element(coordinate));
+
+    const T *from = reinterpret_cast<const T *>(vec_base) + n;
+    const auto value = *from;
+
+    *into = value;
+  }
+}
+
+template <typename T>
 static void initKernelTensor(::arm_compute::ITensor &tensor,
                              const nnfw::util::kernel::Shape &kernel_shape,
                              const uint8_t *kernel_base, const size_t kernel_size)
@@ -2126,7 +2143,6 @@ void Planner::visit(const ::internal::tflite::op::StridedSlice::Node &node)
   const ::internal::tflite::operand::Index endMask_index{node.param().endMask_index};
   const ::internal::tflite::operand::Index shrinkAxisMask_index{node.param().shrinkAxisMask_index};
 
-  // TODO Should move to the place where the operand is handled, if it is possible.
   // Set Shape Constraints
   _builder.addShapeConstr(outputData_index,
                           asTensorInfo(_ctx.at(outputData_index).shape(), _ctx.at(outputData_index).type(),
@@ -2137,13 +2153,36 @@ void Planner::visit(const ::internal::tflite::op::StridedSlice::Node &node)
                                        _ctx.at(inputData_index).scale(),
                                        _ctx.at(inputData_index).zeroPoint()));
 
-  _builder.addShapeConstr(startData_index, asTensorInfo(_ctx.at(startData_index).shape().asVector(),
-                                                        _ctx.at(startData_index).type()));
-  _builder.addShapeConstr(endData_index, asTensorInfo(_ctx.at(endData_index).shape().asVector(),
-                                                      _ctx.at(endData_index).type()));
+  const auto startData_size = _ctx.at(startData_index).shape().asVector();
+  const auto endData_size = _ctx.at(endData_index).shape().asVector();
+  const auto stridesData_size = _ctx.at(stridesData_index).shape().asVector();
+  _builder.addShapeConstr(startData_index,
+                          asTensorInfo(startData_size, _ctx.at(startData_index).type()));
+  _builder.addShapeConstr(endData_index, asTensorInfo(endData_size, _ctx.at(endData_index).type()));
   _builder.addShapeConstr(stridesData_index,
-                          asTensorInfo(_ctx.at(stridesData_index).shape().asVector(),
-                                       _ctx.at(stridesData_index).type()));
+                          asTensorInfo(stridesData_size, _ctx.at(stridesData_index).type()));
+
+  // Set initializers for indices data such as order of inputData
+  {
+    auto startData_base = _ctx.at(startData_index).data().base();
+    auto endData_base = _ctx.at(endData_index).data().base();
+    auto stridesData_base = _ctx.at(stridesData_index).data().base();
+
+    assert(_ctx.at(startData_index).type() == ANEURALNETWORKS_TENSOR_INT32);
+    auto startData_initializer =
+        std::bind(initReorderVectorTensor<int32_t>, _1, startData_base, startData_size);
+    _builder.addInitializer(startData_index, startData_initializer);
+
+    assert(_ctx.at(endData_index).type() == ANEURALNETWORKS_TENSOR_INT32);
+    auto endData_initializer =
+        std::bind(initReorderVectorTensor<int32_t>, _1, endData_base, endData_size);
+    _builder.addInitializer(endData_index, endData_initializer);
+
+    assert(_ctx.at(stridesData_index).type() == ANEURALNETWORKS_TENSOR_INT32);
+    auto stridesData_initializer =
+        std::bind(initReorderVectorTensor<int32_t>, _1, stridesData_base, stridesData_size);
+    _builder.addInitializer(stridesData_index, stridesData_initializer);
+  }
 
   struct Param
   {
@@ -2167,15 +2206,16 @@ void Planner::visit(const ::internal::tflite::op::StridedSlice::Node &node)
   param.endData_index = endData_index.asInt();
   param.stridesData_index = stridesData_index.asInt();
 
-  param.beginMask = _ctx.at(beginMask_index).asScalar<int32_t>();
-  param.endMask = _ctx.at(endMask_index).asScalar<int32_t>();
-  param.shrinkAxisMask = _ctx.at(shrinkAxisMask_index).asScalar<int32_t>();
+  // Set mask bits such as order of inputData
+  const auto inputData_rank = _ctx.at(inputData_index).shape().rank();
+  param.beginMask = _ctx.at(beginMask_index).asReorderBits<int32_t>(inputData_rank);
+  param.endMask = _ctx.at(endMask_index).asReorderBits<int32_t>(inputData_rank);
+  param.shrinkAxisMask = _ctx.at(shrinkAxisMask_index).asReorderBits<int32_t>(inputData_rank);
 
   auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
     auto outputData_alloc = ctx.at(::internal::tflite::operand::Index{param.outputData_index});
     auto inputData_alloc = ctx.at(::internal::tflite::operand::Index{param.inputData_index});
-    // TODO: Let's alloc 1-D array for startData, endData and stridesData from operand
     auto startData_alloc = ctx.at(::internal::tflite::operand::Index{param.startData_index});
     auto endData_alloc = ctx.at(::internal::tflite::operand::Index{param.endData_index});
     auto stridesData_alloc =
         ctx.at(::internal::tflite::operand::Index{param.stridesData_index});
diff --git a/runtimes/pure_arm_compute/src/internal/Model.h b/runtimes/pure_arm_compute/src/internal/Model.h
index 6a332e9..86bbe66 100644
--- a/runtimes/pure_arm_compute/src/internal/Model.h
+++ b/runtimes/pure_arm_compute/src/internal/Model.h
@@ -120,6 +120,7 @@ private:
 #include
 #include
 #include
+#include "internal/Swizzle.h"
 
 namespace internal
 {
@@ -167,6 +168,16 @@ public:
     return *(reinterpret_cast<const T *>(_data->base()));
   }
 
+public:
+  template <typename T> T asReorderBits(size_t numOfBits) const
+  {
+    assert((_shape.rank() == 0) || ((_shape.rank() == 1) && (_shape.dim(0) == 1)));
+    assert(_data != nullptr);
+    assert((_data->base() != nullptr) && (_data->size() == sizeof(T)));
+
+    return ReorderBits<T>(asScalar<T>(), numOfBits);
+  }
+
 private:
   const Shape _shape;
   const int32_t _type;
diff --git a/runtimes/pure_arm_compute/src/internal/Swizzle.h b/runtimes/pure_arm_compute/src/internal/Swizzle.h
index 73c0d10..8c3d1ef 100644
--- a/runtimes/pure_arm_compute/src/internal/Swizzle.h
+++ b/runtimes/pure_arm_compute/src/internal/Swizzle.h
@@ -49,4 +49,19 @@ inline ARMComputeAxis ToARMComputeAxis(uint32_t rank, uint32_t axis)
   return reversed;
 }
 
+#include <cassert>
+
+template <typename T> inline T ReorderBits(T in, size_t numOfBits)
+{
+  assert(numOfBits > 0);
+  T out = 0;
+  for (int32_t i = numOfBits - 1; i >= 0; --i)
+  {
+    const uint32_t toShift = numOfBits - ToARMComputeAxis(numOfBits, i).value() - 1;
+    out += ((in & 1) << toShift);
+    in >>= 1;
+  }
+  return out;
+}
+
 #endif // __SWIZZLE_H__
-- 
2.7.4
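
Note on the bit reordering: an NNAPI STRIDED_SLICE mask uses bit i to refer to axis i of the input shape, while ARM Compute addresses dimensions in the opposite order (the `return reversed` path of ToARMComputeAxis above), so asReorderBits/ReorderBits move every mask bit to the position of its swizzled axis. The sketch below is a minimal, self-contained illustration assuming the mapping is a plain axis reversal; ToReversedAxis and ReorderBitsSketch are hypothetical stand-ins for this note, not names from the patch.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for ToARMComputeAxis(rank, axis).value(),
// assuming a plain axis reversal. The real Swizzle.h mapping may also
// permute rank-4 (NHWC) axes for ACL's layout.
inline uint32_t ToReversedAxis(uint32_t rank, uint32_t axis)
{
  assert(axis < rank);
  return rank - axis - 1;
}

// Same loop shape as ReorderBits in the patch: consume 'in' from the
// least-significant bit and deposit each bit at its reordered position.
// Under a plain reversal this reverses the low numOfBits bits; the
// patched version delegates the axis mapping to ToARMComputeAxis.
template <typename T> inline T ReorderBitsSketch(T in, size_t numOfBits)
{
  assert(numOfBits > 0);
  T out = 0;
  for (int32_t i = numOfBits - 1; i >= 0; --i)
  {
    const uint32_t toShift = numOfBits - ToReversedAxis(numOfBits, i) - 1;
    out += ((in & 1) << toShift);
    in >>= 1;
  }
  return out;
}

int main()
{
  // For a rank-4 input, a beginMask of 0b0011 (axes 0 and 1) becomes
  // 0b1100 (axes 2 and 3) after reordering.
  const uint32_t mask = 0b0011;
  std::printf("0x%x -> 0x%x\n", mask, ReorderBitsSketch<uint32_t>(mask, 4));
  return 0;
}

Expected output under this assumption: 0x3 -> 0xc, which is what handing an NNAPI-ordered mask to ACL's reversed coordinate system requires.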