From a04439ab3aae69dae74cf1c03caf225dd8c13471 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EC=9D=B4=ED=95=9C=EC=A2=85/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?=
 =?utf8?q?=EC=9E=90?= <hanjoung.lee@samsung.com>
Date: Tue, 18 Sep 2018 19:57:04 +0900
Subject: [PATCH] [neurun] Implement PermuteLayer kernel for cpu (#2749)

PermuteLayer basically copies data from the input tensor to the output tensor,
but if the tensor must be permuted, it also permutes the data while copying.
Currently NHWC to NCHW and NCHW to NHWC permutations are supported.

Signed-off-by: Hanjoung Lee <hanjoung.lee@samsung.com>
---
 runtimes/neurun/src/kernel/cpu/PermuteLayer.cc | 107 +++++++++++++++++++++++++
 runtimes/neurun/src/kernel/cpu/PermuteLayer.h  |  64 +++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 runtimes/neurun/src/kernel/cpu/PermuteLayer.cc
 create mode 100644 runtimes/neurun/src/kernel/cpu/PermuteLayer.h
diff --git a/runtimes/neurun/src/kernel/cpu/PermuteLayer.cc b/runtimes/neurun/src/kernel/cpu/PermuteLayer.cc
new file mode 100644
index 0000000..b6f327e
--- /dev/null
+++ b/runtimes/neurun/src/kernel/cpu/PermuteLayer.cc
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PermuteLayer.h"
+
+#include "internal/nnapi/feature/Reader.h"
+#include "internal/nnapi/feature/View.h"
+#include "backend/acl_cl/feature/View.h"
+#include <util/feature/IndexIterator.h>
+#include <stdexcept>
+
+// TODO Remove these dependencies to arm_compute lib
+#include <arm_compute/runtime/CL/CLScheduler.h>
+#include <arm_compute/core/CL/ICLTensor.h>
+
+namespace neurun
+{
+namespace kernel
+{
+namespace cpu
+{
+
+void PermuteLayer::configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+                             const graph::operand::Shape &shape, Type type)
+{
+  _output = output; // destination tensor written by run(); only the pointer is kept
+  _input = input;   // source tensor read by run(); only the pointer is kept
+  _shape = shape;   // copied by value; run() asserts that its rank is 4
+  _type = type;     // selects the branch taken in run()
+}
+
+// Copies every element of _input into _output in the direction selected by _type.
+// The OpenCL-side tensor is mapped through ICLTensor::map() only for the duration of the copy;
+// std::runtime_error is thrown if that tensor is not an ICLTensor. COPY is not implemented yet.
+void PermuteLayer::run()
+{
+  assert(_shape.rank() == 4);
+  const auto feature = _shape.asFeature();
+
+  switch (_type)
+  {
+    case Type::NHWC_TO_NCHW:
+    {
+      // TODO Fix this workaround (We may need codegen::operand::Object instead of ITensor)
+      auto output_cl = dynamic_cast<::arm_compute::ICLTensor *>(_output);
+      if (output_cl == nullptr) // a failed cast would otherwise be dereferenced by map()
+        throw std::runtime_error("PermuteLayer: output tensor is not an ICLTensor");
+      auto &queue = ::arm_compute::CLScheduler::get().queue();
+      output_cl->map(queue); // mapped before the view is built, same order as NCHW_TO_NHWC
+
+      const ::internal::nnapi::feature::Reader<float> from{feature, _input->buffer(),
+                                                           _input->info()->total_size()};
+      ::internal::arm_compute::feature::View<float> into{_output};
+
+      ::nnfw::util::feature::iterate(feature)
+          << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
+               const auto value = from.at(batch, ch, row, col);
+               into.at(batch, ch, row, col) = value;
+             };
+
+      output_cl->unmap(queue); // TODO Likewise above
+      break;
+    }
+    case Type::NCHW_TO_NHWC:
+    {
+      // TODO Fix this workaround (We may need codegen::operand::Object instead of ITensor)
+      auto input_cl = dynamic_cast<::arm_compute::ICLTensor *>(_input);
+      if (input_cl == nullptr) // a failed cast would otherwise be dereferenced by map()
+        throw std::runtime_error("PermuteLayer: input tensor is not an ICLTensor");
+      auto &queue = ::arm_compute::CLScheduler::get().queue();
+      input_cl->map(queue);
+
+      const ::internal::arm_compute::feature::View<float> from{_input};
+      ::internal::nnapi::feature::View<float> into{feature, _output->buffer(),
+                                                   _output->info()->total_size()};
+
+      ::nnfw::util::feature::iterate(feature)
+          << [&](uint32_t batch, uint32_t ch, uint32_t row, uint32_t col) {
+               const auto value = from.at(batch, ch, row, col);
+               into.at(batch, ch, row, col) = value;
+             };
+
+      input_cl->unmap(queue); // TODO Likewise above
+      break;
+    }
+    case Type::COPY:
+      // If two different backends using same tensor layout, we need this.
+      throw "NYI";
+  }
+}
+
+} // namespace cpu
+} // namespace kernel
+} // namespace neurun
diff --git a/runtimes/neurun/src/kernel/cpu/PermuteLayer.h b/runtimes/neurun/src/kernel/cpu/PermuteLayer.h
new file mode 100644
index 0000000..5960871
--- /dev/null
+++ b/runtimes/neurun/src/kernel/cpu/PermuteLayer.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NEURUN_KERNEL_CPU_PERMUTE_LAYER_H__
+#define __NEURUN_KERNEL_CPU_PERMUTE_LAYER_H__
+
+#include <NeuralNetworks.h>
+
+#include <arm_compute/runtime/IFunction.h>
+#include <arm_compute/core/ITensor.h>
+
+#include "internal/nnapi/feature/View.h"
+#include "OperationUtils.h"
+
+namespace neurun
+{
+namespace kernel
+{
+namespace cpu
+{
+
+// CPU-side kernel that moves a rank-4 tensor from _input to _output; see Type for the modes.
+class PermuteLayer : public ::arm_compute::IFunction
+{
+public:
+  // Direction performed by run(). COPY is declared but run() does not implement it yet.
+  enum class Type
+  {
+    NHWC_TO_NCHW,
+    NCHW_TO_NHWC,
+    COPY
+  };
+
+  PermuteLayer() = default;
+  // Records the arguments for a later run(); the tensor pointers are stored, never deleted.
+  void configure(::arm_compute::ITensor *input, ::arm_compute::ITensor *output,
+                 const graph::operand::Shape &shape, Type type);
+  void run() override;
+
+private:
+  ::arm_compute::ITensor *_input{nullptr};  // set by configure(); was indeterminate before
+  ::arm_compute::ITensor *_output{nullptr}; // set by configure(); was indeterminate before
+  graph::operand::Shape _shape{1};
+  Type _type{Type::COPY}; // defined state until configure() picks the real direction
+};
+
+} // namespace cpu
+} // namespace kernel
+} // namespace neurun
+
+#endif // __NEURUN_KERNEL_CPU_PERMUTE_LAYER_H__
-- 
2.7.4