From 2f47fad111e7158e1aea34d8352ccc6a9880bbbd Mon Sep 17 00:00:00 2001
From: =?utf8?q?Vishal=20Keshav/SNAP=20/SRI-Bangalore/Engineer/=EC=82=BC?=
 =?utf8?q?=EC=84=B1=EC=A0=84=EC=9E=90?= <vishal.k1@samsung.com>
Date: Thu, 6 Dec 2018 13:57:47 +0530
Subject: [PATCH] Implementation of CUSTOM Unpack op (#3669)

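Implement Unpack on top of ACL (GPU only; only rank-4 input is supported).
Each output is taken from the input through a CLSubTensor and a CLReshapeLayer.
A CLPermute step is added to overcome the backward swizzle.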

Signed-off-by: Vishal Keshav <vishal.k1@samsung.com>
---
 runtimes/pure_arm_compute/src/compilation.cc       | 72 +++++++++++++++++++++-
 .../src/internal/layers/SimpleUnpackLayer.cc       | 60 ++++++++++++++++++
 .../src/internal/layers/SimpleUnpackLayer.h        | 29 +++++++++
 .../pure_arm_compute/src/internal/op/Unpack.cc     |  5 +-
 4 files changed, 163 insertions(+), 3 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h

diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index 7e15d7d..b1092c4 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -104,6 +104,7 @@
 #include "internal/layers/SimplePackLayer.h"
 #include "internal/layers/SimpleSpaceToBatchND.h"
 #include "internal/layers/SimpleNeg.h"
+#include "internal/layers/SimpleUnpackLayer.h"
 #include "internal/layers/SimpleSQRT.h"
 #include "internal/layers/SimpleArgMinMax.h"
 
@@ -5109,8 +5110,77 @@ void Planner::visit(const ::internal::tflite::op::DepthToSpace::Node &node)
 void Planner::visit(const ::internal::tflite::op::Unpack::Node &node)
 {
   VERBOSE(Unpack) << "Configure Unpack operation" << std::endl;
+  const ::internal::tflite::operand::Index ifm_index{node.param().ifm_index};
+  const uint32_t input_rank = _ctx.at(ifm_index).shape().rank();
 
-  throw std::runtime_error("Not supported, yet");
+  // Only rank-4 inputs on the GPU backend are implemented so far; every other case throws.
+  assert(input_rank == 4 || input_rank == 3 || input_rank == 2);
+  _builder.addShapeConstr(ifm_index, asTensorInfo(asTensorShape(_ctx.at(ifm_index).shape()),
+                                                  _ctx.at(ifm_index).type()));
+
+  // The axis is forwarded as read; SimpleUnpackLayer::configure() resolves negative values.
+  const int32_t axis =
+      _ctx.at(::internal::tflite::operand::Index{node.param().axis_index}).asScalar<int32_t>();
+
+  for (const auto &index : node.param().ofm_indexes)
+  {
+    const ::internal::tflite::operand::Index ofm_index{index};
+    _builder.addShapeConstr(ofm_index, asTensorInfo(asTensorShape(_ctx.at(ofm_index).shape()),
+                                                    _ctx.at(ofm_index).type()));
+  }
+
+  // Reject the unimplemented ranks up front so the rank-4 path below stays un-nested.
+  if (input_rank == 3)
+  {
+    // TODO: generate test case for this and generalize the 4D method to all cases.
+    throw std::runtime_error("UNPACK_3D not implemented");
+  }
+  if (input_rank == 2)
+  {
+    throw std::runtime_error("UNPACK_2D not implemented");
+  }
+  if (input_rank != 4)
+  {
+    // Reachable only when assert() is compiled out; the rank (not the axis) is at fault.
+    throw std::runtime_error("UNPACK input rank is not supported");
+  }
+
+  // Plain values that the stage lambda below captures by copy.
+  struct Param
+  {
+    std::vector<int32_t> ofm_indexes;
+    int ifm_index;
+    int axis;
+  };
+
+  Param param;
+  param.ifm_index = ifm_index.asInt();
+  param.axis = axis;
+  for (const auto &index : node.param().ofm_indexes)
+  {
+    param.ofm_indexes.push_back(index);
+  }
+
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto input_alloc = ctx.at(::internal::tflite::operand::Index{param.ifm_index});
+    if (!::internal::arm_compute::isGpuMode())
+    {
+      throw std::runtime_error("Not supported, yet");
+    }
+
+    std::vector<::arm_compute::ICLTensor *> outputs;
+    outputs.reserve(param.ofm_indexes.size());
+    for (const auto &index : param.ofm_indexes)
+    {
+      auto output_alloc = ctx.at(::internal::tflite::operand::Index{index});
+      outputs.push_back(CAST_CL(output_alloc));
+    }
+    auto fn = nnfw::make_unique<SimpleUnpackLayer>();
+    fn->configure(CAST_CL(input_alloc), outputs, param.axis);
+    builder.append("Unpack", std::move(fn));
+  };
+
+  _builder.addStage(stage);
 }
 
 void Planner::visit(const ::internal::tflite::op::Pack::Node &node)
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc
new file mode 100644
index 0000000..8dde475
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.cc
@@ -0,0 +1,60 @@
+#include "internal/arm_compute.h"
+#include "internal/Swizzle.h"
+#include "SimpleUnpackLayer.h"
+
+// Configures one sub-tensor -> permute -> reshape chain per output, slicing input along axis.
+void SimpleUnpackLayer::configure(::arm_compute::ICLTensor *input,
+                                  const std::vector<::arm_compute::ICLTensor *> &output_vector,
+                                  int32_t axis)
+{
+  const size_t nr_outputs = output_vector.size();
+  const uint32_t input_rank = input->info()->num_dimensions();
+  // NOTE(review): fixed permutation, presumably compensating the backward swizzle - confirm.
+  const ::arm_compute::PermutationVector pv{2, 0, 1};
+  _input = input;
+  // Negative axis is supported, -1 implies R-1 axis where R is input rank
+  if (axis < 0)
+  {
+    axis += input_rank;
+  }
+  _axis = ToARMComputeAxis(input_rank, axis).value();
+
+  // The loop below addresses all five vectors by index, so none may keep stale entries.
+  _output_vector = output_vector;
+  _sub_tensor_vector.clear();
+  _sub_tensor_vector.reserve(nr_outputs);
+  _cl_permuted_vector.resize(nr_outputs);
+  _cl_permute_vector.resize(nr_outputs);
+  _cl_reshape_vector.resize(nr_outputs);
+
+  // Each slice keeps the input extents, except along the unpack axis where it is 1 wide.
+  ::arm_compute::TensorShape subTensor_shape{};
+  for (int32_t i = 0; i < static_cast<int32_t>(input_rank); i++)
+  {
+    subTensor_shape.set(i, (i == _axis) ? 1 : _input->info()->tensor_shape()[i]);
+  }
+
+  auto subTensor_offset = ::arm_compute::Coordinates{};
+  subTensor_offset.set_num_dimensions(input_rank);
+
+  for (size_t i = 0; i < nr_outputs; i++)
+  {
+    subTensor_offset[_axis] = static_cast<int>(i);
+    _sub_tensor_vector.push_back(std::make_shared<::arm_compute::CLSubTensor>(
+        CAST_CL(_input), subTensor_shape, subTensor_offset, true));
+    // Slice i is read through the sub-tensor view, permuted, then reshaped into output i
+    _cl_permute_vector[i].configure(_sub_tensor_vector[i].get(), &_cl_permuted_vector[i], pv);
+    _cl_reshape_vector[i].configure(&_cl_permuted_vector[i], CAST_CL(_output_vector[i]));
+    _cl_permuted_vector[i].allocator()->allocate();
+  }
+}
+
+// Runs, for each output in order, the permute function and then the reshape function.
+void SimpleUnpackLayer::run(void)
+{
+  for (size_t i = 0; i < _output_vector.size(); ++i)
+  {
+    _cl_permute_vector[i].run();
+    _cl_reshape_vector[i].run();
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h
new file mode 100644
index 0000000..f2a78d2
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleUnpackLayer.h
@@ -0,0 +1,29 @@
+#ifndef __UNPACK_LAYER_H__
+#define __UNPACK_LAYER_H__
+
+#include <arm_compute/runtime/Tensor.h>
+#include <arm_compute/runtime/CL/CLTensor.h>
+#include <arm_compute/runtime/CL/CLSubTensor.h>
+#include <arm_compute/runtime/CL/functions/CLReshapeLayer.h>
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+
+// Unpacks a CL tensor along one axis into several output tensors (sub-tensor, permute, reshape).
+class SimpleUnpackLayer : public ::arm_compute::IFunction
+{
+public:
+  // Sets up one permute/reshape pair per entry of output_vector; a negative axis is accepted.
+  void configure(::arm_compute::ICLTensor *input,
+                 const std::vector<::arm_compute::ICLTensor *> &output_vector, int32_t axis);
+  void run(void) override;
+
+private:
+  std::vector<::arm_compute::CLTensor> _cl_permuted_vector;
+  std::vector<::arm_compute::ICLTensor *> _output_vector;
+  std::vector<std::shared_ptr<::arm_compute::CLSubTensor>> _sub_tensor_vector;
+  std::vector<::arm_compute::CLReshapeLayer> _cl_reshape_vector;
+  std::vector<::arm_compute::CLPermute> _cl_permute_vector;
+  ::arm_compute::ICLTensor *_input{nullptr};
+  int32_t _axis{0};
+};
+
+#endif // __UNPACK_LAYER_H__
diff --git a/runtimes/pure_arm_compute/src/internal/op/Unpack.cc b/runtimes/pure_arm_compute/src/internal/op/Unpack.cc
index 7f4fb04..a1be028 100644
--- a/runtimes/pure_arm_compute/src/internal/op/Unpack.cc
+++ b/runtimes/pure_arm_compute/src/internal/op/Unpack.cc
@@ -43,8 +43,9 @@ namespace op
 {
 namespace Unpack
 {
-// There are two inputs: tensor which is to be unpacked
-// and axis along which tensor needs to be unpacked.
+// There are three inputs: tensor which is to be unpacked,
+// axis along which tensor needs to be unpacked
+// and number of splits along the axis.
 
 Param::Param(uint32_t inputCount, const uint32_t *inputs, uint32_t outputCount,
              const uint32_t *outputs)
-- 
2.7.4