From 0aa282a82bae7006387d9703b51de5830e3e0ed4 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EA=B9=80=EC=9A=A9=EC=84=AD/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?=
 =?utf8?q?=EC=9E=90?= <yons.kim@samsung.com>
Date: Tue, 11 Sep 2018 13:28:31 +0900
Subject: [PATCH] Implement EMBEDDING_LOOKUP op in pure_acl on cpu (#2647)

* Implement EMBEDDING_LOOKUP op in pure_acl on cpu

Implement EMBEDDING_LOOKUP op in pure_acl on cpu

Signed-off-by: Yongseop Kim <yons.kim@samsung.com>

* Apply comments

- For 1d tensor, asTensorShape(shape, true)
- Use output_info for offset

* Change output to values
---
 runtimes/pure_arm_compute/src/compilation.cc       |  71 ++++++++++++-
 .../src/internal/layers/SimpleEmbeddingLookup.cc   | 115 +++++++++++++++++++++
 .../src/internal/layers/SimpleEmbeddingLookup.h    |  22 ++++
 3 files changed, 206 insertions(+), 2 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h

diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index a153e2c..c806785 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -69,6 +69,7 @@
 #include "internal/layers/GenericFullyConnectedLayer.h"
 #include "internal/layers/PadLayer.h"
 #include "internal/layers/SimpleSpaceToDepth.h"
+#include "internal/layers/SimpleEmbeddingLookup.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -3626,8 +3627,74 @@ void Planner::visit(const ::internal::tflite::op::L2Pool2D::Explicit::Node &node
 
 void Planner::visit(const ::internal::tflite::op::EmbeddingLookup::Node &node)
 {
-  // TODO Implement EMBEDDING_LOOKUP
-  throw std::runtime_error("Not supported");
+  const ::internal::tflite::operand::Index output_index{node.param().output_index};
+  const ::internal::tflite::operand::Index lookups_index{node.param().lookups_index};
+  const ::internal::tflite::operand::Index values_index{node.param().values_index};
+
+  const auto &output_obj = _ctx.at(output_index);
+  const auto &lookups_obj = _ctx.at(lookups_index);
+  const auto &values_obj = _ctx.at(values_index);
+
+  // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying
+  // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729)
+  {
+    assert(lookups_obj.type() == ANEURALNETWORKS_TENSOR_INT32);
+
+    const auto &output_shape = output_obj.shape();
+    const auto &lookups_shape = lookups_obj.shape();
+    const auto &values_shape = values_obj.shape();
+
+    assert(lookups_shape.rank() == 1);
+    assert(values_shape.rank() >= 2);
+
+    // output should be a n-D tensor with the same rank and shape as the values tensor, except for
+    // the first dimension which has the same size as lookups' only dimension.
+    assert(output_shape.rank() == values_shape.rank());
+    assert(output_shape.dim(0) == lookups_shape.dim(0));
+    for (size_t n = 1; n < output_shape.rank(); ++n)
+    {
+      assert(output_shape.dim(n) == values_shape.dim(n));
+    }
+  }
+
+  // Set Shape Constraints and TensorInfo
+  _builder.addShapeConstr(output_index,
+                          asTensorInfo(asTensorShape(output_obj.shape(), false), output_obj.type(),
+                                       output_obj.scale(), output_obj.zeroPoint()));
+  _builder.addShapeConstr(lookups_index,
+                          asTensorInfo(asTensorShape(lookups_obj.shape()), lookups_obj.type(),
+                                       lookups_obj.scale(), lookups_obj.zeroPoint()));
+  _builder.addShapeConstr(values_index,
+                          asTensorInfo(asTensorShape(values_obj.shape(), false), values_obj.type(),
+                                       values_obj.scale(), values_obj.zeroPoint()));
+
+  // Construct operation parameters
+  struct Param
+  {
+    int32_t output_index;
+    int32_t lookups_index;
+    int32_t values_index;
+  };
+
+  Param param;
+
+  param.output_index = output_index.asInt();
+  param.lookups_index = lookups_index.asInt();
+  param.values_index = values_index.asInt();
+
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
+    auto lookups_alloc = ctx.at(::internal::tflite::operand::Index{param.lookups_index});
+    auto values_alloc = ctx.at(::internal::tflite::operand::Index{param.values_index});
+
+    auto fn = nnfw::make_unique<SimpleEmbeddingLookup>();
+
+    fn->configure(lookups_alloc, values_alloc, output_alloc);
+
+    builder.append("EmbeddingLookup", std::move(fn));
+  };
+
+  _builder.addStage(stage);
 }
 
 class AllocationContext final : public IAllocationContext
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
new file mode 100644
index 0000000..45eb207
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
@@ -0,0 +1,115 @@
+#include "internal/layers/SimpleEmbeddingLookup.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups,
+                                      ::arm_compute::ITensor *values,
+                                      ::arm_compute::ITensor *output)
+{
+  // Assume that verification of operands are already done at Planner::visit()
+  _lookups = lookups;
+  _values = values;
+  _output = output;
+}
+
+void SimpleEmbeddingLookup::run()
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_lookups)->map(q);
+    CAST_CL(_values)->map(q);
+    CAST_CL(_output)->map(q);
+  }
+
+  // type of elements of lookups is always integer
+  const int32_t *lookups_buf = reinterpret_cast<const int32_t *>(_lookups->buffer());
+  const auto values_buf = _values->buffer();
+  auto output_buf = _output->buffer();
+
+  const auto lookups_info = _lookups->info();
+  const auto values_info = _values->info();
+  const auto output_info = _output->info();
+
+  // TODO Refactor below duplicated code!
+  const auto values_rank = values_info->num_dimensions();
+  switch (values_rank)
+  {
+    case 2:
+      // (H,W) in nnapi -> (W,H) in acl
+      {
+        const size_t row_size = values_info->dimension(1);
+        const size_t row_bytes = values_info->total_size() / row_size;
+        for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+        {
+          const int32_t idx = lookups_buf[i];
+          if (idx < 0 || static_cast<size_t>(idx) >= row_size)
+            throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+          size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx});
+          size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i});
+
+          unsigned char *sink_addr = output_buf + row_offset_by_i;
+          unsigned char *source_addr = values_buf + row_offset_by_idx;
+          memcpy(sink_addr, source_addr, row_bytes);
+        }
+      }
+      break;
+    case 3:
+      // (B,H,W) in nnapi -> (W,H,B) in acl
+      {
+        const size_t row_size = values_info->dimension(2);
+        const size_t row_bytes = values_info->total_size() / row_size;
+        for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+        {
+          const int32_t idx = lookups_buf[i];
+          if (idx < 0 || static_cast<size_t>(idx) >= row_size)
+            throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+          size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx});
+          size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i});
+
+          unsigned char *sink_addr = output_buf + row_offset_by_i;
+          unsigned char *source_addr = values_buf + row_offset_by_idx;
+          memcpy(sink_addr, source_addr, row_bytes);
+        }
+      }
+      break;
+    case 4:
+      // (N,H,W,C) in nnapi -> (N,C,H,W) in acl
+      {
+        const size_t row_size = values_info->dimension(3);
+        const size_t row_bytes = values_info->total_size() / row_size;
+        for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+        {
+          const int32_t idx = lookups_buf[i];
+          if (idx < 0 || static_cast<size_t>(idx) >= row_size)
+            throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+          size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, 0, idx});
+          size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i});
+
+          unsigned char *sink_addr = output_buf + row_offset_by_i;
+          unsigned char *source_addr = values_buf + row_offset_by_idx;
+          memcpy(sink_addr, source_addr, row_bytes);
+        }
+      }
+      break;
+    case 1:
+      // In this case, shape of values actually is matrix but the height(row size) is 1 in acl. If
+      // row size is 1, this op is not needed and it means this situtation could be wrong.
+      throw std::runtime_error("Wrong usage of EmbeddingLookup op!");
+    default:
+      throw std::runtime_error("Not supported rank!");
+  }
+
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_lookups)->unmap(q);
+    CAST_CL(_values)->unmap(q);
+    CAST_CL(_output)->unmap(q);
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
new file mode 100644
index 0000000..9f2cd97
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
@@ -0,0 +1,22 @@
+#ifndef __SIMPLE_EMBEDDING_LOOKUP_H__
+#define __SIMPLE_EMBEDDING_LOOKUP_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleEmbeddingLookup : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values,
+                 ::arm_compute::ITensor *output);
+
+  void run() override;
+
+private:
+  ::arm_compute::ITensor *_lookups;
+  ::arm_compute::ITensor *_values;
+  ::arm_compute::ITensor *_output;
+};
+
+#endif /*__SIMPLE_EMBEDDING_LOOKUP_H__ */
-- 
2.7.4