From 2bf1eb667e301576c5c37db05ad5f74202a9bd18 Mon Sep 17 00:00:00 2001
From: 이한종/동작제어Lab(SR)/Engineer/삼성전자
Date: Thu, 19 Jul 2018 09:52:45 +0900
Subject: [PATCH] [neurun] Implement Concat acl_cl kernel (#1986)

* [neurun] Implement Concat acl_cl kernel

Implement the Concat acl_cl kernel, which does not exist in the arm_compute library.

Part of #1861

Signed-off-by: Hanjoung Lee

* Reflect comments

Signed-off-by: Hanjoung Lee

* Reflect comment 2

Signed-off-by: Hanjoung Lee
---
 .../src/internal/kernel/acl_cl/ConcatLayer.cc | 152 +++++++++++++++++++++
 .../src/internal/kernel/acl_cl/ConcatLayer.h  |  68 +++++++++
 2 files changed, 220 insertions(+)
 create mode 100644 runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.cc
 create mode 100644 runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.h

diff --git a/runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.cc b/runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.cc
new file mode 100644
index 0000000..2b98059
--- /dev/null
+++ b/runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.cc
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConcatLayer.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include "internal/arm_compute/kernel/View.h"
+#include "logging.h"
+
+namespace
+{
+
+bool matchSizeExceptAxis(const ::arm_compute::ICLTensor *t1, const ::arm_compute::ICLTensor *t2,
+                         uint32_t axis)
+{
+  assert(t1->info()->num_dimensions() <= 4);
+  assert(t2->info()->num_dimensions() <= 4);
+
+  for (uint32_t i = 0; i < 4; i++)
+  {
+    if (axis == i)
+      continue;
+    if (t1->info()->dimension(i) != t2->info()->dimension(i))
+      return false;
+  }
+  return true;
+}
+
+} // namespace {anonymous}
+
+namespace internal
+{
+namespace kernel
+{
+namespace acl_cl
+{
+
+bool ConcatLayer::concatenationFloat32()
+{
+  // Input and output size check
+  {
+    // NOTE Support only tensor with dimension 4 or less
+
+    uint32_t axis_sum = 0;
+
+    for (auto input : _input_allocs)
+    {
+      assert(matchSizeExceptAxis(_output_alloc, input, _axis));
+      axis_sum += input->info()->dimension(_axis);
+    }
+
+    assert(_output_alloc->info()->dimension(_axis) == axis_sum);
+  }
+
+  VERBOSE(Concat_RUN) << "START Concat" << std::endl;
+
+  // Perform operation
+  {
+    uint32_t axis_offset = 0;
+
+    auto &queue = ::arm_compute::CLScheduler::get().queue();
+
+    _output_alloc->map(queue);
+    ::internal::arm_compute::kernel::View<float> output_view{_output_alloc};
+
+    for (auto input : _input_allocs)
+    {
+      input->map(queue);
+      const ::internal::arm_compute::kernel::View<float> input_reader{input};
+
+      for (uint32_t n = 0; n < input_reader.shape().N; n++)
+      {
+        for (uint32_t c = 0; c < input_reader.shape().C; c++)
+        {
+          for (uint32_t h = 0; h < input_reader.shape().H; h++)
+          {
+            for (uint32_t w = 0; w < input_reader.shape().W; w++)
+            {
+              uint32_t no = (_axis == 3) ? axis_offset : 0;
+              uint32_t co = (_axis == 2) ? axis_offset : 0;
+              uint32_t ho = (_axis == 1) ? axis_offset : 0;
+              uint32_t wo = (_axis == 0) ? axis_offset : 0;
+              output_view.at(n + no, c + co, h + ho, w + wo) = input_reader.at(n, c, h, w);
+            }
+          }
+        }
+      }
+      if (_axis == 3)
+        axis_offset += input_reader.shape().N;
+      if (_axis == 2)
+        axis_offset += input_reader.shape().C;
+      if (_axis == 1)
+        axis_offset += input_reader.shape().H;
+      if (_axis == 0)
+        axis_offset += input_reader.shape().W;
+
+      input->unmap(queue);
+    }
+    _output_alloc->unmap(queue);
+  }
+
+  VERBOSE(Concat_RUN) << "End Concat" << std::endl;
+
+  return true;
+}
+
+void ConcatLayer::configure(const std::vector<::arm_compute::ICLTensor *> &input_allocs,
+                            int32_t axis, ::arm_compute::ICLTensor *output_alloc)
+{
+  _input_allocs = input_allocs;
+  _output_alloc = output_alloc;
+
+  assert(axis < 4);
+
+  // This map converts NHWC to NCHW(reversed)
+  // NHWC -> WHCN
+  static const uint32_t axis_map[] = {3, 1, 0, 2};
+  _axis = axis_map[axis];
+
+  // TODO Support Quant8
+  _input_type = OperandType::TENSOR_FLOAT32;
+}
+
+void ConcatLayer::run()
+{
+  if (_input_type == OperandType::TENSOR_FLOAT32)
+  {
+    concatenationFloat32();
+  }
+  else if (_input_type == OperandType::TENSOR_QUANT8_ASYMM)
+  {
+    throw std::runtime_error("NYI - concatenationQuant8()");
+  }
+}
+
+} // namespace acl_cl
+} // namespace kernel
+} // namespace internal

diff --git a/runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.h b/runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.h
new file mode 100644
index 0000000..a8aa675
--- /dev/null
+++ b/runtimes/neurun/src/internal/kernel/acl_cl/ConcatLayer.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __INTERNAL_KERNEL_ACL_CL_CONCAT_LAYER_H__
+#define __INTERNAL_KERNEL_ACL_CL_CONCAT_LAYER_H__
+
+#include <vector>
+
+#include <arm_compute/core/CL/ICLTensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+#include "internal/Model.h"
+#include "internal/kernel/cpufallback/OperationUtils.h"
+
+using namespace internal::kernel::cpu;
+
+namespace internal
+{
+namespace kernel
+{
+namespace acl_cl
+{
+
+//
+// internal::kernel::acl_cl::ConcatLayer
+// A naive implementation of ConcatLayer for ACL
+//
+
+class ConcatLayer : public ::arm_compute::IFunction
+{
+public:
+  ConcatLayer() {}
+
+public:
+  void configure(const std::vector<::arm_compute::ICLTensor *> &input_allocs,
+                 int32_t axis /* NNAPI tensor axis from NHWC order */,
+                 ::arm_compute::ICLTensor *output_alloc);
+
+  void run();
+
+private:
+  bool concatenationFloat32();
+
+private:
+  std::vector<::arm_compute::ICLTensor *> _input_allocs;
+  ::arm_compute::ICLTensor *_output_alloc;
+  int32_t _axis;
+  OperandType _input_type;
+};
+
+} // namespace acl_cl
+} // namespace kernel
+} // namespace internal
+
+#endif // __INTERNAL_KERNEL_ACL_CL_CONCAT_LAYER_H__
--
2.7.4
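
For context, a minimal usage sketch of the new kernel (not part of the patch itself; the tensor names ifm0, ifm1, and ofm are hypothetical, already-allocated CL tensors whose shapes differ only along the concat axis):

// Hypothetical usage sketch of ConcatLayer as introduced by this patch.
#include <vector>

#include <arm_compute/core/CL/ICLTensor.h>

#include "internal/kernel/acl_cl/ConcatLayer.h"

void concatChannels(::arm_compute::ICLTensor *ifm0, ::arm_compute::ICLTensor *ifm1,
                    ::arm_compute::ICLTensor *ofm)
{
  std::vector<::arm_compute::ICLTensor *> inputs{ifm0, ifm1};

  ::internal::kernel::acl_cl::ConcatLayer concat;
  // The axis is given in NNAPI (NHWC) order; configure() remaps it internally
  // via axis_map to the reversed-NCHW order used by the copy loop.
  concat.configure(inputs, 3 /* channel axis in NHWC */, ofm);
  // run() maps the tensors to host memory, copies the data element by element,
  // and unmaps them again.
  concat.run();
}

Since the copy runs on the host after mapping, this is a correctness-first fallback ("a naive implementation of ConcatLayer for ACL") rather than a tuned CL kernel.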