From 0aa282a82bae7006387d9703b51de5830e3e0ed4 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=EA=B9=80=EC=9A=A9=EC=84=AD/=EB=8F=99=EC=9E=91=EC=A0=9C?=
 =?utf8?q?=EC=96=B4Lab=28SR=29/Engineer/=EC=82=BC=EC=84=B1=EC=A0=84?=
 =?utf8?q?=EC=9E=90?= <yons.kim@samsung.com>
Date: Tue, 11 Sep 2018 13:28:31 +0900
Subject: [PATCH] Implement EMBEDDING_LOOKUP op in pure_acl on cpu (#2647)

* Implement EMBEDDING_LOOKUP op in pure_acl on cpu

Implement EMBEDDING_LOOKUP op in pure_acl on cpu

Signed-off-by: Yongseop Kim <yons.kim@samsung.com>

* Apply comments

- For 1d tensor, asTensorShape(shape, true)
- Use output_info for offset

* Change output to values
---
 runtimes/pure_arm_compute/src/compilation.cc       |  71 ++++++++++++-
 .../src/internal/layers/SimpleEmbeddingLookup.cc   | 115 +++++++++++++++++++++
 .../src/internal/layers/SimpleEmbeddingLookup.h    |  22 ++++
 3 files changed, 206 insertions(+), 2 deletions(-)
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
 create mode 100644 runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h

diff --git a/runtimes/pure_arm_compute/src/compilation.cc b/runtimes/pure_arm_compute/src/compilation.cc
index a153e2c..c806785 100644
--- a/runtimes/pure_arm_compute/src/compilation.cc
+++ b/runtimes/pure_arm_compute/src/compilation.cc
@@ -69,6 +69,7 @@
 #include "internal/layers/GenericFullyConnectedLayer.h"
 #include "internal/layers/PadLayer.h"
 #include "internal/layers/SimpleSpaceToDepth.h"
+#include "internal/layers/SimpleEmbeddingLookup.h"
 
 #include "util/matrix/IndexIterator.h"
 #include "util/kernel/IndexIterator.h"
@@ -3626,8 +3627,74 @@ void Planner::visit(const ::internal::tflite::op::L2Pool2D::Explicit::Node &node
 
 void Planner::visit(const ::internal::tflite::op::EmbeddingLookup::Node &node)
 {
-  // TODO Implement EMBEDDING_LOOKUP
-  throw std::runtime_error("Not supported");
+  const ::internal::tflite::operand::Index output_index{node.param().output_index};
+  const ::internal::tflite::operand::Index lookups_index{node.param().lookups_index};
+  const ::internal::tflite::operand::Index values_index{node.param().values_index};
+
+  const auto &output_obj = _ctx.at(output_index);
+  const auto &lookups_obj = _ctx.at(lookups_index);
+  const auto &values_obj = _ctx.at(values_index);
+
+  // Verify operand here, not at SimpleEmbeddingLookup::configure() to avoid acl's modifying
+  // TensorShape sometimes(Issue: https://github.sec.samsung.net/STAR/nnfw/issues/729)
+  {
+    assert(lookups_obj.type() == ANEURALNETWORKS_TENSOR_INT32);
+
+    const auto &output_shape = output_obj.shape();
+    const auto &lookups_shape = lookups_obj.shape();
+    const auto &values_shape = values_obj.shape();
+
+    assert(lookups_shape.rank() == 1);
+    assert(values_shape.rank() >= 2);
+
+    // output should be a n-D tensor with the same rank and shape as the values tensor, except for
+    // the first dimension which has the same size as lookups' only dimension.
+    assert(output_shape.rank() == values_shape.rank());
+    assert(output_shape.dim(0) == lookups_shape.dim(0));
+    for (size_t n = 1; n < output_shape.rank(); ++n)
+    {
+      assert(output_shape.dim(n) == values_shape.dim(n));
+    }
+  }
+
+  // Set Shape Constraints and TensorInfo
+  _builder.addShapeConstr(output_index,
+                          asTensorInfo(asTensorShape(output_obj.shape(), false), output_obj.type(),
+                                       output_obj.scale(), output_obj.zeroPoint()));
+  _builder.addShapeConstr(lookups_index,
+                          asTensorInfo(asTensorShape(lookups_obj.shape()), lookups_obj.type(),
+                                       lookups_obj.scale(), lookups_obj.zeroPoint()));
+  _builder.addShapeConstr(values_index,
+                          asTensorInfo(asTensorShape(values_obj.shape(), false), values_obj.type(),
+                                       values_obj.scale(), values_obj.zeroPoint()));
+
+  // Construct operation parameters
+  struct Param
+  {
+    int32_t output_index;
+    int32_t lookups_index;
+    int32_t values_index;
+  };
+
+  Param param;
+
+  param.output_index = output_index.asInt();
+  param.lookups_index = lookups_index.asInt();
+  param.values_index = values_index.asInt();
+
+  auto stage = [param](const IAllocationContext &ctx, IExecutionBuilder &builder) {
+    auto output_alloc = ctx.at(::internal::tflite::operand::Index{param.output_index});
+    auto lookups_alloc = ctx.at(::internal::tflite::operand::Index{param.lookups_index});
+    auto values_alloc = ctx.at(::internal::tflite::operand::Index{param.values_index});
+
+    auto fn = nnfw::make_unique<SimpleEmbeddingLookup>();
+
+    fn->configure(lookups_alloc, values_alloc, output_alloc);
+
+    builder.append("EmbeddingLookup", std::move(fn));
+  };
+
+  _builder.addStage(stage);
 }
 
 class AllocationContext final : public IAllocationContext
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
new file mode 100644
index 0000000..45eb207
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.cc
@@ -0,0 +1,115 @@
+#include "internal/layers/SimpleEmbeddingLookup.h"
+
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+void SimpleEmbeddingLookup::configure(::arm_compute::ITensor *lookups,
+                                      ::arm_compute::ITensor *values,
+                                      ::arm_compute::ITensor *output)
+{
+  // Assume that verification of operands are already done at Planner::visit()
+  _lookups = lookups;
+  _values = values;
+  _output = output;
+}
+
+void SimpleEmbeddingLookup::run()
+{
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_lookups)->map(q);
+    CAST_CL(_values)->map(q);
+    CAST_CL(_output)->map(q);
+  }
+
+  // type of elements of lookups is always integer
+  const int32_t *lookups_buf = reinterpret_cast<const int32_t *>(_lookups->buffer());
+  const auto values_buf = _values->buffer();
+  auto output_buf = _output->buffer();
+
+  const auto lookups_info = _lookups->info();
+  const auto values_info = _values->info();
+  const auto output_info = _output->info();
+
+  // TODO Refactor below duplicated code!
+  const auto values_rank = values_info->num_dimensions();
+  switch (values_rank)
+  {
+    case 2:
+      // (H,W) in nnapi -> (W,H) in acl
+      {
+        const size_t row_size = values_info->dimension(1);
+        const size_t row_bytes = values_info->total_size() / row_size;
+        for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+        {
+          const int32_t idx = lookups_buf[i];
+          if (idx < 0 || static_cast<size_t>(idx) >= row_size)
+            throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+          size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, idx});
+          size_t row_offset_by_i = output_info->offset_element_in_bytes({0, i});
+
+          unsigned char *sink_addr = output_buf + row_offset_by_i;
+          unsigned char *source_addr = values_buf + row_offset_by_idx;
+          memcpy(sink_addr, source_addr, row_bytes);
+        }
+      }
+      break;
+    case 3:
+      // (B,H,W) in nnapi -> (W,H,B) in acl
+      {
+        const size_t row_size = values_info->dimension(2);
+        const size_t row_bytes = values_info->total_size() / row_size;
+        for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+        {
+          const int32_t idx = lookups_buf[i];
+          if (idx < 0 || static_cast<size_t>(idx) >= row_size)
+            throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+          size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, idx});
+          size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, i});
+
+          unsigned char *sink_addr = output_buf + row_offset_by_i;
+          unsigned char *source_addr = values_buf + row_offset_by_idx;
+          memcpy(sink_addr, source_addr, row_bytes);
+        }
+      }
+      break;
+    case 4:
+      // (N,H,W,C) in nnapi -> (N,C,H,W) in acl
+      {
+        const size_t row_size = values_info->dimension(3);
+        const size_t row_bytes = values_info->total_size() / row_size;
+        for (size_t i = 0; i < lookups_info->dimension(0); ++i)
+        {
+          const int32_t idx = lookups_buf[i];
+          if (idx < 0 || static_cast<size_t>(idx) >= row_size)
+            throw std::runtime_error("Embedding Lookup: index out of bounds.");
+
+          size_t row_offset_by_idx = values_info->offset_element_in_bytes({0, 0, 0, idx});
+          size_t row_offset_by_i = output_info->offset_element_in_bytes({0, 0, 0, i});
+
+          unsigned char *sink_addr = output_buf + row_offset_by_i;
+          unsigned char *source_addr = values_buf + row_offset_by_idx;
+          memcpy(sink_addr, source_addr, row_bytes);
+        }
+      }
+      break;
+    case 1:
+      // In this case, shape of values actually is matrix but the height(row size) is 1 in acl. If
+      // row size is 1, this op is not needed and it means this situtation could be wrong.
+      throw std::runtime_error("Wrong usage of EmbeddingLookup op!");
+    default:
+      throw std::runtime_error("Not supported rank!");
+  }
+
+  if (::internal::arm_compute::isGpuMode())
+  {
+    auto &q = ::arm_compute::CLScheduler::get().queue();
+
+    CAST_CL(_lookups)->unmap(q);
+    CAST_CL(_values)->unmap(q);
+    CAST_CL(_output)->unmap(q);
+  }
+}
diff --git a/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
new file mode 100644
index 0000000..9f2cd97
--- /dev/null
+++ b/runtimes/pure_arm_compute/src/internal/layers/SimpleEmbeddingLookup.h
@@ -0,0 +1,22 @@
+#ifndef __SIMPLE_EMBEDDING_LOOKUP_H__
+#define __SIMPLE_EMBEDDING_LOOKUP_H__
+
+#include "internal/arm_compute.h"
+#include <arm_compute/core/ITensor.h>
+#include <arm_compute/runtime/IFunction.h>
+
+class SimpleEmbeddingLookup : public ::arm_compute::IFunction
+{
+public:
+  void configure(::arm_compute::ITensor *lookups, ::arm_compute::ITensor *values,
+                 ::arm_compute::ITensor *output);
+
+  void run() override;
+
+private:
+  ::arm_compute::ITensor *_lookups;
+  ::arm_compute::ITensor *_values;
+  ::arm_compute::ITensor *_output;
+};
+
+#endif /*__SIMPLE_EMBEDDING_LOOKUP_H__ */
-- 
2.7.4