From c01c19ae392ea9cb630de3dcfd8c2421dfac4102 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EB=B0=95=EC=84=B8=ED=9D=AC/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Principal=20Engineer/=EC=82=BC=EC=84=B1?=
 =?utf8?q?=EC=A0=84=EC=9E=90?= <saehie.park@samsung.com>
Date: Tue, 17 Apr 2018 15:48:52 +0900
Subject: [PATCH] Initial code of Concatenation on ACL (#746)

This will add initial implementation of Concatenation on ACL CL and NEON

Signed-off-by: SaeHie Park <saehie.park@samsung.com>
---
 include/kernel/acl/Concatenation.h             | 26 ++++++++
 src/kernel/acl/CMakeLists.txt                  |  4 ++
 src/kernel/acl/src/cl/Concatenation.cpp        | 88 ++++++++++++++++++++++++
 src/kernel/acl/src/cl/Concatenation.test.cpp   | 46 +++++++++++++
 src/kernel/acl/src/neon/Concatenation.cpp      | 89 ++++++++++++++++++++++++
 src/kernel/acl/src/neon/Concatenation.test.cpp | 46 +++++++++++++
 6 files changed, 299 insertions(+)
 create mode 100644 include/kernel/acl/Concatenation.h
 create mode 100644 src/kernel/acl/src/cl/Concatenation.cpp
 create mode 100644 src/kernel/acl/src/cl/Concatenation.test.cpp
 create mode 100644 src/kernel/acl/src/neon/Concatenation.cpp
 create mode 100644 src/kernel/acl/src/neon/Concatenation.test.cpp

diff --git a/include/kernel/acl/Concatenation.h b/include/kernel/acl/Concatenation.h
new file mode 100644
index 0000000..1d4db48
--- /dev/null
+++ b/include/kernel/acl/Concatenation.h
@@ -0,0 +1,26 @@
+#ifndef __NNFW_KERNEL_ACL_CONCATENATION_H__
+#define __NNFW_KERNEL_ACL_CONCATENATION_H__
+
+#include <OperationsUtils.h>
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
+                          const std::vector<android::nn::Shape>& inputShapes, int32_t axis,
+                          float* outputData, const android::nn::Shape& outputShape);
+
+namespace neon {
+
+bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
+                          const std::vector<android::nn::Shape>& inputShapes, int32_t axis,
+                          float* outputData, const android::nn::Shape& outputShape);
+
+} // namespace neon
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
+
+#endif // __NNFW_KERNEL_ACL_CONCATENATION_H__
diff --git a/src/kernel/acl/CMakeLists.txt b/src/kernel/acl/CMakeLists.txt
index 048bddb..4790f0d 100644
--- a/src/kernel/acl/CMakeLists.txt
+++ b/src/kernel/acl/CMakeLists.txt
@@ -33,11 +33,13 @@ set(KERNELACL_SRCS "src/Init_acl.cpp"
                    "src/cl/Pooling.cpp"
                    "src/cl/Reshape.cpp"
                    "src/cl/Softmax.cpp"
+                   "src/cl/Concatenation.cpp"
                    "src/neon/Conv2D.cpp"
                    "src/neon/FullyConnected.cpp"
                    "src/neon/Pooling.cpp"
                    "src/neon/Softmax.cpp"
                    "src/neon/Reshape.cpp"
+                   "src/neon/Concatenation.cpp"
                    )
 
 add_library(${LIB_KERNELACL} SHARED ${KERNELACL_SRCS})
@@ -63,11 +65,13 @@ set(KERNELACL_TEST_SRCS "src/util.cpp"
                         "src/cl/Pooling.test.cpp"
                         "src/cl/Reshape.test.cpp"
                         "src/cl/Softmax.test.cpp"
+                        "src/cl/Concatenation.test.cpp"
                         "src/neon/Conv2D.test.cpp"
                         "src/neon/FullyConnected.test.cpp"
                         "src/neon/Pooling.test.cpp"
                         "src/neon/Softmax.test.cpp"
                         "src/neon/Reshape.test.cpp"
+                        "src/neon/Concatenation.test.cpp"
                         )
 
 add_executable(${LIB_KERNELACL_TEST} ${KERNELACL_TEST_SRCS})
diff --git a/src/kernel/acl/src/cl/Concatenation.cpp b/src/kernel/acl/src/cl/Concatenation.cpp
new file mode 100644
index 0000000..d0bd005
--- /dev/null
+++ b/src/kernel/acl/src/cl/Concatenation.cpp
@@ -0,0 +1,88 @@
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+
+bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
+                          const std::vector<android::nn::Shape>& inputShapes, int32_t axis,
+                          float* outputData, const android::nn::Shape& outputShape)
+{
+  if (axis != 3)
+  {
+    assert("Only support axis=3 for ACL" && 0);
+    return false;
+  }
+  assert(inputDataPtrs.size() == inputShapes.size());
+
+  std::vector<arm_compute::CLTensor*> inputPtrs;
+  std::vector<arm_compute::ICLTensor*> inputIptrs;
+  arm_compute::CLTensor output;
+
+  // init Tensors
+  std::vector<android::nn::Shape>::const_iterator it_inputShape = inputShapes.begin();
+  for (auto inputData : inputDataPtrs)
+  {
+    const android::nn::Shape& inputShape = *it_inputShape;
+    arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+    arm_compute::CLTensor* inputPtr = new arm_compute::CLTensor();
+
+    inputPtr->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+    inputPtrs.push_back(inputPtr);
+    inputIptrs.push_back(inputPtr);
+
+    it_inputShape++;
+  }
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+  output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  // prepare ACL Concatenate and configure tensors
+  auto concat = std::make_shared<arm_compute::CLDepthConcatenate>();
+  concat->configure(inputIptrs, &output);
+
+  // allocate Tensors
+  it_inputShape = inputShapes.begin();
+  std::vector<const float*>::const_iterator it_inputData = inputDataPtrs.begin();
+  for (auto inputPtr : inputPtrs)
+  {
+    inputPtr->allocator()->allocate();
+
+    const float* inputData = *it_inputData;
+    const android::nn::Shape& inputShape = *it_inputShape;
+
+    TensorAccess<InputAccessor>(*inputPtr, inputData, inputShape);
+
+    it_inputShape++;
+    it_inputData++;
+  }
+  output.allocator()->allocate();
+
+  // run
+  concat->run();
+  arm_compute::CLScheduler::get().sync();
+
+  // get output
+  TensorAccess<OutputAccessor>(output, outputData, outputShape);
+
+  // cleanup
+  for (auto inputPtr : inputPtrs)
+  {
+    inputPtr->allocator()->free();
+    delete inputPtr;
+  }
+  output.allocator()->free();
+
+  return true;
+}
+
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/src/kernel/acl/src/cl/Concatenation.test.cpp b/src/kernel/acl/src/cl/Concatenation.test.cpp
new file mode 100644
index 0000000..7ac53e8
--- /dev/null
+++ b/src/kernel/acl/src/cl/Concatenation.test.cpp
@@ -0,0 +1,46 @@
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/Concatenation.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, concatFloat32_1)
+{
+  float inputData_1[6] = {
+    1, 2, 3, 4, 5, 6 // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ]
+  };
+  float inputData_2[6] = {
+    7, 8, 9, 10, 11, 12 // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ]
+  };
+  const android::nn::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  const android::nn::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  std::vector<const float*> inputDataPtrs;
+  std::vector<android::nn::Shape> inputShapes;
+  float outputData[12];
+  const android::nn::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 };
+  bool bret;
+
+  inputDataPtrs.push_back(inputData_1);
+  inputDataPtrs.push_back(inputData_2);
+  inputShapes.push_back(inputShape_1);
+  inputShapes.push_back(inputShape_2);
+
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  bret = concatenationFloat32(inputDataPtrs, inputShapes, 3,
+                              outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectNCHW[] = {
+    1, 2, 3, 4, 5, 6,
+    7, 8, 9, 10, 11, 12
+  };
+  float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ]
+  util::NCHW2NHWC(expectNCHW, expectData, outputShape);
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
diff --git a/src/kernel/acl/src/neon/Concatenation.cpp b/src/kernel/acl/src/neon/Concatenation.cpp
new file mode 100644
index 0000000..521b3d4
--- /dev/null
+++ b/src/kernel/acl/src/neon/Concatenation.cpp
@@ -0,0 +1,89 @@
+#include <OperationsUtils.h>
+#include <arm_compute/core/TensorShape.h>
+#include <arm_compute/runtime/NEON/NEFunctions.h>
+
+#include <cassert>
+
+// TODO: fix include path in CMakeFiles
+#include "../IO_accessor.h"
+#include "../shape.h"
+
+namespace nnfw {
+namespace kernel {
+namespace acl {
+namespace neon {
+
+bool concatenationFloat32(const std::vector<const float*>& inputDataPtrs,
+                          const std::vector<android::nn::Shape>& inputShapes, int32_t axis,
+                          float* outputData, const android::nn::Shape& outputShape)
+{
+  if (axis != 3)
+  {
+    assert("Only support axis=3 for ACL" && 0);
+    return false;
+  }
+  assert(inputDataPtrs.size() == inputShapes.size());
+
+  std::vector<arm_compute::Tensor*> inputPtrs;
+  std::vector<arm_compute::ITensor*> inputIptrs;
+  arm_compute::Tensor output;
+
+  // init Tensors
+  std::vector<android::nn::Shape>::const_iterator it_inputShape = inputShapes.begin();
+  for (auto inputData : inputDataPtrs)
+  {
+    const android::nn::Shape& inputShape = *it_inputShape;
+    arm_compute::TensorShape input_shape = util::fromNNShape(inputShape);
+    arm_compute::Tensor* inputPtr = new arm_compute::Tensor();
+
+    inputPtr->allocator()->init(arm_compute::TensorInfo(input_shape, arm_compute::Format::F32));
+    inputPtrs.push_back(inputPtr);
+    inputIptrs.push_back(inputPtr);
+
+    it_inputShape++;
+  }
+  arm_compute::TensorShape output_shape = util::fromNNShape(outputShape);
+  output.allocator()->init(arm_compute::TensorInfo(output_shape, arm_compute::Format::F32));
+
+  // prepare ACL Concatenate and configure tensors
+  auto concat = std::make_shared<arm_compute::NEDepthConcatenate>();
+  concat->configure(inputIptrs, &output);
+
+  // allocate Tensors
+  it_inputShape = inputShapes.begin();
+  std::vector<const float*>::const_iterator it_inputData = inputDataPtrs.begin();
+  for (auto inputPtr : inputPtrs)
+  {
+    inputPtr->allocator()->allocate();
+
+    const float* inputData = *it_inputData;
+    const android::nn::Shape& inputShape = *it_inputShape;
+
+    TensorAccess<InputAccessor>(*inputPtr, inputData, inputShape);
+
+    it_inputShape++;
+    it_inputData++;
+  }
+  output.allocator()->allocate();
+
+  // run
+  concat->run();
+
+  // get output
+  TensorAccess<OutputAccessor>(output, outputData, outputShape);
+
+  // cleanup
+  for (auto inputPtr : inputPtrs)
+  {
+    inputPtr->allocator()->free();
+    delete inputPtr;
+  }
+  output.allocator()->free();
+
+  return true;
+}
+
+} // namespace neon
+} // namespace acl
+} // namespace kernel
+} // namespace nnfw
diff --git a/src/kernel/acl/src/neon/Concatenation.test.cpp b/src/kernel/acl/src/neon/Concatenation.test.cpp
new file mode 100644
index 0000000..6a5f00d
--- /dev/null
+++ b/src/kernel/acl/src/neon/Concatenation.test.cpp
@@ -0,0 +1,46 @@
+#include <gtest/gtest.h>
+#include <OperationsUtils.h>
+#include <kernel/acl/nnfw_kernel_acl.h>
+#include <kernel/acl/Concatenation.h>
+
+// TODO: fix include path in CMakeFiles
+#include "../util.h"
+
+using namespace nnfw::kernel::acl;
+
+TEST(KernelACL_TC, neon_concatFloat32_1)
+{
+  float inputData_1[6] = {
+    1, 2, 3, 4, 5, 6 // [ [ [1],[2],[3] ], [ [4],[5],[6] ] ]
+  };
+  float inputData_2[6] = {
+    7, 8, 9, 10, 11, 12 // [ [ [7],[8],[9] ], [ [10],[11],[12] ] ]
+  };
+  const android::nn::Shape inputShape_1 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  const android::nn::Shape inputShape_2 = { OperandType::FLOAT32, {1,2,3,1}, 1.0, 0 };
+  std::vector<const float*> inputDataPtrs;
+  std::vector<android::nn::Shape> inputShapes;
+  float outputData[12];
+  const android::nn::Shape outputShape = { OperandType::FLOAT32, {1,2,3,2}, 1.0, 0 };
+  bool bret;
+
+  inputDataPtrs.push_back(inputData_1);
+  inputDataPtrs.push_back(inputData_2);
+  inputShapes.push_back(inputShape_1);
+  inputShapes.push_back(inputShape_2);
+
+  util::initData(outputData, sizeof(outputData) / sizeof(outputData[0]), 0.0);
+
+  bret = neon::concatenationFloat32(inputDataPtrs, inputShapes, 3,
+                                    outputData, outputShape);
+  EXPECT_EQ(bret, true);
+
+  float expectNCHW[] = {
+    1, 2, 3, 4, 5, 6,
+    7, 8, 9, 10, 11, 12
+  };
+  float expectData[12]; // [ [ [1,7],[2,8],[3,9] ], [ [4,10],[5,11],[6,12] ] ]
+  util::NCHW2NHWC(expectNCHW, expectData, outputShape);
+  bret = util::compareData(outputData, expectData, outputShape);
+  EXPECT_EQ(bret, true);
+}
-- 
2.7.4