1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
20 #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "util/logging.h"
32 #include "util/Utils.h"
33 #include "AclKernelGen.h"
34
35 namespace onert
36 {
37 namespace backend
38 {
39 namespace acl_neon
40 {
41
42 using ::onert::backend::acl_common::asAclFunction;
43 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
44     ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
45
46 KernelGenerator::KernelGenerator(
47     const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
48     const std::shared_ptr<TensorBuilder> &tensor_builder,
49     const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
50     : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
51       _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN)
52 {
53   // DO NOTHING
54 }
55
56 void KernelGenerator::visit(const ir::OpSequence &op_seq)
57 {
58   // TODO Move this to IKernelGenerator
59   //      (all derivatives have the same implementation for this)
60   assert(!_return_fn_seq);
61   _return_fn_seq = std::make_unique<exec::FunctionSequence>();
62   _return_fn_seq->enableDynamicShapeInferer(false);
63
64   _current_layout = op_seq.getLayout();
65   for (const auto &operation_idx : op_seq.operations())
66   {
67     const auto &node = _operations_ctx.at(operation_idx);
68     node.accept(*this);
69     _return_fn_seq->append(releaseFunction());
70   }
71 }
72
73 void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
74 {
75   const auto ofm_index{node.getOutputs().at(0)};
76   const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
77   const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};
78
79   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
80
81   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
82   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
83   auto frontend_layout = _current_layout;
84   auto backend_layout = ifm_tensor->layout();
85
86   int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
87   if (axis_value < 0)
88   {
89     axis_value += ifm_rank;
90   }
91   assert(axis_value >= 0 && axis_value < ifm_rank);
92   const auto fixed_axis =
93       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
94   auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
95                                              : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
96
97   auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
98       ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);
99
100   _return_fn = asAclFunction(std::move(fn));
101 }
102
103 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
104 {
105   const auto ofm_index{node.getOutputs().at(0)};
106   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
107   const auto block_size_index{
108       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
109
110   const auto NNApiInputs = 2;
111   if (node.getInputs().size() != NNApiInputs)
112   {
113     const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
114     if (!_ctx.at(crops_index).isConstant())
115     {
116       throw std::runtime_error("Non-constant crops NYI for acl_neon backend BatchToSpaceND");
117     }
118
119     auto crops = _ctx.at(crops_index).asVector<int32_t>();
120     for (auto crop : crops)
121     {
122       if (crop != 0)
123       {
124         throw std::runtime_error("Non-zero crops NYI for acl_neon backend BatchToSpaceND");
125       }
126     }
127   }
128
129   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
130   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
131   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
132
133   assert(_ctx.at(block_size_index).data());
134
135   auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
136       ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
137
138   _return_fn = asAclFunction(std::move(fn));
139 }
140
141 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
142 {
143   const auto ofm_index{node.getOutputs().at(0)};
144   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
145   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
146
147   const auto activation = node.param().activation;
148
149   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
150   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
151   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
152
153   std::unique_ptr<arm_compute::IFunction> fn;
154   switch (node.param().arithmetic_type)
155   {
156     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
157     {
158       fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
159           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
160           arm_compute::ConvertPolicy::SATURATE);
161       break;
162     }
163     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
164     {
165       fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
166           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
167           arm_compute::ConvertPolicy::SATURATE);
168       break;
169     }
170     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
171     {
172       // Only RoundingPolicy::TO_ZERO is allowed when the scale is 1.0
173       fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
174           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
175           arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
176       break;
177     }
178     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
179     {
180       fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
181           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
182       break;
183     }
184     default:
185       assert(false && "The BinaryArithmetic operation supports only ADD, SUB, MUL and DIV");
186       break;
187   }
188   _return_fn = std::make_unique<exec::FunctionSequence>(
189       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
190 }
191
192 void KernelGenerator::visit(const ir::operation::Conv2D &node)
193 {
194   using ir::operation::Conv2D;
195
196   const auto ofm_index{node.getOutputs().at(0)};
197   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
198   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
199   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
200
201   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
202   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
203   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
204   const auto &ker_shape = _ctx.at(ker_index).shape();
205   const auto ker_height = ker_shape.dim(1);
206   const auto ker_width = ker_shape.dim(2);
207
208   const auto stride = node.param().stride;
209   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
210                                             ker_width, ker_height);
211   const auto activation = node.param().activation;
212
213   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
214   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
215   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
216   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
217
218   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
219   const auto act_info = acl_common::asActivationLayerInfo(activation);
220
221   auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
222       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
223       ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
224       ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
225
226   _return_fn = asAclFunction(std::move(fn));
227 }
228
229 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
230 {
231   const auto output_index{node.getOutputs().at(0)};
232   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
233
234   auto block_size = node.param().block_size;
235   assert(block_size > 0);
236
237   auto output_tensor = _tensor_reg->getAclTensor(output_index);
238   auto input_tensor = _tensor_reg->getAclTensor(input_index);
239
240   auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
241       input_tensor->handle(), output_tensor->handle(), block_size);
242
243   _return_fn = asAclFunction(std::move(fn));
244 }
245
246 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
247 {
248   using ir::operation::DepthwiseConv2D;
249
250   const auto ofm_index{node.getOutputs().at(0)};
251   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
252   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
253   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
254
255   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
256   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
257   // Kernel format is [1, kernel_height, kernel_width, depth_out].
258   const auto &ker_shape = _ctx.at(ker_index).shape();
259   const auto ker_height = ker_shape.dim(1);
260   const auto ker_width = ker_shape.dim(2);
261
262   const auto stride = node.param().stride;
263   const auto dilation = node.param().dilation;
264   const auto padding =
265       ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
266                            ker_height, dilation.width_factor, dilation.height_factor);
267   const auto multiplier = node.param().multiplier;
268   const auto activation = node.param().activation;
269
270   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
271   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
272   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
273   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
274
275   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
276   const auto act_info = acl_common::asActivationLayerInfo(activation);
277   const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
278
279   auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
280       ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
281       conv_info, multiplier, act_info, dilation_info);
282
283   _return_fn = asAclFunction(std::move(fn));
284 }
285
286 void KernelGenerator::visit(const ir::operation::Concat &node)
287 {
288   const auto ofm_index{node.getOutputs().at(0)};
289
290   std::vector<ir::OperandIndex> input_indexes;
291   for (const auto &input : node.getInputs())
292     input_indexes.emplace_back(input);
293
294   const auto axis = node.param().axis;
295
296   // Concat elimination check
297   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
298   if (eliminated)
299   {
300     // If concat eliminated, return a NOP IFunction
301     VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
302     _return_fn = std::make_unique<exec::NopFunction>();
303     return;
304   }
305
306   auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
307   std::vector<::arm_compute::ITensor *> input_tensors;
308   for (const auto &ifm_ind : input_indexes)
309     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
310
311   std::unique_ptr<::arm_compute::IFunction> fn;
312   if (input_indexes.size() < 2)
313   {
314     fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensors.at(0),
315                                                         output_tensor->handle());
316   }
317   else
318   {
319     const auto rank = _ctx.at(ofm_index).shape().rank();
320     const auto frontend_layout = _current_layout;
321     const auto backend_layout = output_tensor->layout();
322     const auto fixed_axis =
323         acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
324     fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
325         input_tensors, output_tensor->handle(), fixed_axis);
326   }
327
328   _return_fn = asAclFunction(std::move(fn));
329 }
330
331 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
332 {
333   const auto ofm_index{node.getOutputs().at(0)};
334   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
335
336   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
337   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
338
339   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
340       node.param().op_type, node.param().alpha, node.param().beta);
341
342   std::unique_ptr<arm_compute::IFunction> fn =
343       acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
344                                                                 ofm_tensor->handle(), act_info);
345
346   _return_fn = asAclFunction(std::move(fn));
347 }
348
349 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
350 {
351   const auto output_index{node.getOutputs().at(0)};
352   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
353   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
354
355   auto output_tensor = _tensor_reg->getAclTensor(output_index);
356   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
357   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
358
359   std::unique_ptr<arm_compute::IFunction> fn;
360   switch (node.param().op_type)
361   {
362     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
363     {
364       fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
365           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
366       break;
367     }
368     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
369     {
370       fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
371           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
372       break;
373     }
374     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
375     {
376       fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
377           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
378       break;
379     }
380     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
381     {
382       fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
383           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
384       break;
385     }
386     default:
387     {
388       std::string err_msg("acl_neon KernelGenerator : " + node.name() +
389                           " is not an elementwise-binary operation");
390       assert(false && err_msg.c_str());
391       break;
392     }
393   }
394   _return_fn = asAclFunction(std::move(fn));
395 }
396
397 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
398 {
399   const auto output_index{node.getOutputs().at(0)};
400   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
401
402   auto output_tensor = _tensor_reg->getAclTensor(output_index);
403   auto input_tensor = _tensor_reg->getAclTensor(input_index);
404
405   std::unique_ptr<arm_compute::IFunction> fn;
406   switch (node.param().op_type)
407   {
408     case ir::operation::ElementwiseUnary::Type::ABS:
409     {
410       const ::arm_compute::ActivationLayerInfo act_info{
411           ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
412
413       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
414           input_tensor->handle(), output_tensor->handle(), act_info);
415       break;
416     }
417     case ir::operation::ElementwiseUnary::Type::CAST:
418     {
419       if (input_tensor->data_type() == output_tensor->data_type())
420       {
421         fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
422                                                             output_tensor->handle());
423       }
424       else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
425       {
426         fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
427                                                                 output_tensor->handle());
428       }
429       else
430       {
431         fn = acl_common::generateLayer<arm_compute::NECast>(
432             input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
433       }
434       break;
435     }
436     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
437     {
438       fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
439                                                                          output_tensor->handle());
440       break;
441     }
442     case ir::operation::ElementwiseUnary::Type::EXP:
443     {
444       fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
445                                                               output_tensor->handle());
446       break;
447     }
448     case ir::operation::ElementwiseUnary::Type::FLOOR:
449     {
450       fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
451                                                            output_tensor->handle());
452       break;
453     }
454     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
455     {
456       fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
457                                                                 output_tensor->handle());
458       break;
459     }
460     case ir::operation::ElementwiseUnary::Type::NEG:
461     {
462       fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
463                                                               output_tensor->handle());
464       break;
465     }
466     case ir::operation::ElementwiseUnary::Type::RSQRT:
467     {
468       fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
469                                                                 output_tensor->handle());
470       break;
471     }
472     case ir::operation::ElementwiseUnary::Type::SQRT:
473     {
474       const ::arm_compute::ActivationLayerInfo act_info{
475           ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
476
477       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
478           input_tensor->handle(), output_tensor->handle(), act_info);
479       break;
480     }
481     default:
482     {
483       throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
484                                " is not supported yet");
485       break;
486     }
487   }
488   _return_fn = asAclFunction(std::move(fn));
489 }
490
491 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
492 {
493   const auto output_index{node.getOutputs().at(0)};
494   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
495   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
496
497   auto output_tensor = _tensor_reg->getAclTensor(output_index);
498   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
499   auto values_tensor = _tensor_reg->getAclTensor(values_index);
500
501   auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
502       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
503
504   _return_fn = asAclFunction(std::move(fn));
505 }
506
507 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
508 {
509   const auto output_index{node.getOutputs().at(0)};
510   auto output_tensor = _tensor_reg->getAclTensor(output_index);
511   const auto activation = node.param().activation;
512   if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
513     throw std::runtime_error(
514         "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights is not supported.");
515
516   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
517                                                 ::arm_compute::NEFullyConnectedReshapingLayer>(
518       node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
519   _return_fn = std::make_unique<exec::FunctionSequence>(
520       std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
521 }
522
523 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
524 {
525   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
526   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
527
528   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
529   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
530   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
531
532   auto output_tensor = _tensor_reg->getAclTensor(output_index);
533   auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
534
535   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
536   auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
537   auto values_tensor = _tensor_reg->getAclTensor(values_index);
538
539   auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
540       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
541       output_tensor->handle(), hits_tensor->handle());
542
543   _return_fn = asAclFunction(std::move(fn));
544 }
545
546 void KernelGenerator::visit(const ir::operation::Gather &node)
547 {
548   const auto ofm_index{node.getOutputs().at(0)};
549
550   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
551   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
552
553   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
554   const auto axis_raw = node.param().axis;
555   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
556   // Converting in reverse order
557   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
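  // For illustration (assuming the usual Swizzle.h reversal and matching frontend/backend
  // layouts, not guaranteed by this file): the ARM Compute axis is rank - 1 - axis, so a
  // rank-4 input with axis_value == 1 maps to ARM Compute axis 2.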
558
559   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
560   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
561   auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
562   const auto backend_layout = ofm_tensor->layout();
563   UNUSED_RELEASE(backend_layout);
564
565   // NOTE The frontend layout and backend layout must be the same for this operation.
566   //      If not the same, we have to add a stage(?) to perform permutation of output tensor. It
567   //      is not efficient even if it works well. If so, it would be better to set the
568   //      layout of these backend tensors to the same layout.
569   //      There is also one thing we have to think about. This operation depends on the layout of
570   //      a model. For example, if a model in NHWC has this operation as output rank == 4, indices
571   //      rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
572   //      and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
573   assert(backend_layout == ifm_tensor->layout());
574   assert(backend_layout == indices_tensor->layout());
575   assert(ifm_rank < 4 || _current_layout == backend_layout);
576
577   // input is n-D, indices k-D, output is (n + k - 1)-D
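  //   e.g. a rank-3 input gathered with rank-2 indices produces a rank-4 output (3 + 2 - 1)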
578   size_t n = ifm_rank;
579   assert(n == ifm_tensor->num_dimensions());
580   size_t k = _ctx.at(indices_index).shape().rank();
581   assert(k == indices_tensor->num_dimensions());
582
583   // Disable applied dim_correction
584   if (n != ifm_tensor->info()->num_dimensions())
585   {
586     // This means that the higher dimensions are 1 and dim_correction has been applied to the ifm tensor
587     acl_common::disableDimCorrection(ifm_tensor);
588   }
589   if (k != indices_tensor->info()->num_dimensions())
590   {
591     // This means that the higher dimensions are 1 and dim_correction has been applied to the indices tensor
592     acl_common::disableDimCorrection(indices_tensor);
593   }
594
595   auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
596       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
597
598   // Revert disabling applied dim_correction
599   if (ifm_tensor->dimension(0) == 1)
600   {
601     acl_common::enableDimCorrection(ifm_tensor);
602   }
603   if (indices_tensor->dimension(0) == 1)
604   {
605     acl_common::enableDimCorrection(indices_tensor);
606   }
607
608   _return_fn = asAclFunction(std::move(fn));
609 }
610
611 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
612 {
613   const auto ofm_index{node.getOutputs().at(0)};
614   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
615   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
616   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
617
618   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
619   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
620   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
621   auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
622   auto epsilon = node.param().epsilon;
623   auto activation = node.param().activation;
624
625   auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
626       ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
627       epsilon);
628
629   _return_fn = std::make_unique<exec::FunctionSequence>(
630       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
631 }
632
633 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
634 {
635   const auto ofm_index{node.getOutputs().at(0)};
636   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
637
638   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
639   // L2 Normalization always performs the reduction along the depth axis
640   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
641   // choosing normalization parameters as below
642
643   const auto &ifm_shape = _ctx.at(ifm_index).shape();
644   // TODO Support optional constant dimension that normalization would be performed on
645   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
646   int32_t radius =
647       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
648   float alpha = 1.0f;                            // Chosen so that alpha_ in the implementation becomes 1
649   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
650   float bias = 0.0f;                             // Don't offset the reduction.
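  // Putting these together (a sketch of the assumed NormalizationLayer math, not a
  // guaranteed contract): CROSS_MAP normalization computes
  //   out = in / (bias + alpha * sum(in^2))^beta = in / sqrt(sum(in^2))
  // over a window spanning the whole depth, i.e. an L2 normalization along the last axis.
  // The trailing 'false' is assumed to keep alpha unscaled by the window size, matching
  // the alpha note above.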
651
652   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
653   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
654
655   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
656                                                                radius, alpha, beta, bias, false);
657
658   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
659       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
660
661   _return_fn = asAclFunction(std::move(fn));
662 }
663
664 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
665 {
666   const auto ofm_index{node.getOutputs().at(0)};
667   const auto ifm_index{
668       node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
669
670   auto radius = node.param().radius;
671   auto alpha = node.param().alpha;
672   auto beta = node.param().beta;
673   auto bias = node.param().bias;
674
675   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
676   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
677
678   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
679       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
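  // ACL takes the full window size rather than a radius, hence radius * 2 + 1
  // (e.g. a radius of 2 gives a 5-element cross-map window).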
680
681   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
682       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
683
684   _return_fn = asAclFunction(std::move(fn));
685 }
686
687 void KernelGenerator::visit(const ir::operation::LSTM &node)
688 {
689   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
690                                          ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
691 }
692
693 void KernelGenerator::visit(const ir::operation::Pack &node)
694 {
695   const auto output_index{node.getOutputs().at(0)};
696   auto axis{node.param().axis};
697
698   const auto output_rank = _ctx.at(output_index).shape().rank();
699
700   std::vector<ir::OperandIndex> input_indexes;
701   for (const auto &input_index : node.getInputs())
702     input_indexes.emplace_back(input_index);
703
704   auto output = _tensor_reg->getAclTensor(output_index)->handle();
705   std::vector<arm_compute::ITensor *> inputs;
706   for (const auto &input_index : input_indexes)
707     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
708
709   const auto frontend_layout = _current_layout;
710   const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
711
712   if (axis < 0)
713     axis += output_rank;
714   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
715
716   // Disable applied dim_correction
717   for (const auto &input_index : input_indexes)
718   {
719     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
720     if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
721     {
722       // This means that the higher dimensions are 1 and dim_correction has been applied to the input tensor
723       acl_common::disableDimCorrection(input_tensor);
724     }
725   }
726
727   auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);
728
729   // Revert disabling applied dim_correction
730   for (const auto &input_index : input_indexes)
731   {
732     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
733     if (input_tensor->dimension(0) == 1)
734     {
735       acl_common::enableDimCorrection(input_tensor);
736     }
737   }
738
739   _return_fn = asAclFunction(std::move(fn));
740 }
741
742 void KernelGenerator::visit(const ir::operation::Pad &node)
743 {
744   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
745   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
746   const auto output_index{node.getOutputs().at(0)};
747   assert(_ctx.at(pad_index).data());
748
749   auto rank = _ctx.at(input_index).shape().rank();
750   auto pad_base = _ctx.at(pad_index).data()->base();
751
752   auto input = _tensor_reg->getAclTensor(input_index)->handle();
753   auto output = _tensor_reg->getAclTensor(output_index)->handle();
754
755   ::arm_compute::PaddingList padding_list;
756   padding_list.resize(rank);
757   for (int32_t n = 0; n < rank; ++n)
758   {
759     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
760
761     const auto frontend_layout = _current_layout;
762     const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
763     const auto axis =
764         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
765     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
766   }
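  // For illustration (hypothetical values): a rank-2 input with pad data {{1, 2}, {3, 4}}
  // pads dimension 0 with 1 before / 2 after and dimension 1 with 3 before / 4 after,
  // each pair stored at the corresponding (reversed) ARM Compute axis of padding_list.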
767
768   const auto input_type = _ctx.at(input_index).typeInfo();
769   UNUSED_RELEASE(input_type);
770   assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
771   assert(input->info()->quantization_info() ==
772          ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
773   const auto pixel_value =
774       ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
775
776   auto fn =
777       acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
778
779   _return_fn = asAclFunction(std::move(fn));
780 }
781
782 void KernelGenerator::visit(const ir::operation::Pool2D &node)
783 {
784   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
785       node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
786
787   const auto ofm_index{node.getOutputs().at(0)};
788   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
789   const auto activation = node.param().activation;
790   _return_fn = std::make_unique<exec::FunctionSequence>(
791       asAclFunction(std::move(raw_fn)),
792       ActivationBuilder::generate(activation, ofm_tensor->handle()));
793 }
794
795 void KernelGenerator::visit(const ir::operation::Permute &node)
796 {
797   const auto ofm_idx{node.getOutputs().at(0)};
798   const auto ifm_idx{node.getInputs().at(0)};
799   const auto permute_type = node.getPermuteType();
800   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
801   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
802   const auto rank = _ctx.at(ofm_idx).shape().rank();
803   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
804
805   std::unique_ptr<::arm_compute::IFunction> fn;
806   arm_compute::PermutationVector pv;
807   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
808   {
809     // WHCN -> CWHN
810     pv = arm_compute::PermutationVector{2, 0, 1};
811
812     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
813                                                            ofm_tensor->handle(), pv);
814   }
815   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
816   {
817     // CWHN -> WHCN
818     pv = arm_compute::PermutationVector{1, 2, 0};
819
820     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
821                                                            ofm_tensor->handle(), pv);
822   }
823   else
824   {
825     fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
826   }
827   _return_fn = asAclFunction(std::move(fn));
828 }
829
830 void KernelGenerator::visit(const ir::operation::PReLU &node)
831 {
832   const auto ofm_index{node.getOutputs().at(0)};
833   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
834   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
835
836   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
837   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
838   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
839
840   auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
841       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
842
843   _return_fn = asAclFunction(std::move(fn));
844 }
845
846 void KernelGenerator::visit(const ir::operation::Reduce &node)
847 {
848   const auto output_index{node.getOutputs().at(0)};
849   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
850   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
851
852   auto output_tensor = _tensor_reg->getAclTensor(output_index);
853   auto input_tensor = _tensor_reg->getAclTensor(input_index);
854
855   // Convert to ACL axes taking into account negative values and possible duplicates.
856   const auto &axes = _ctx.at(axes_index);
857   const auto input_rank = _ctx.at(input_index).shape().rank();
858   const auto frontend_layout = _current_layout;
859   const auto backend_layout = input_tensor->layout();
860   const auto reduce_axes =
861       acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
862   const auto reduce_type = node.param().reduce_type;
863   const auto keep_dims = node.param().keep_dims;
864
865   std::unique_ptr<::arm_compute::IFunction> fn;
866   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
867   {
868     fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
869                                                               keep_dims, output_tensor->handle());
870   }
871   else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
872   {
873     fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
874                                                              keep_dims, output_tensor->handle());
875   }
876   else
877   {
878     fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
879         input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
880         acl_common::convertReduceType(reduce_type));
881   }
882   _return_fn = asAclFunction(std::move(fn));
883 }
884
885 void KernelGenerator::visit(const ir::operation::Reshape &node)
886 {
887   const auto output_index{node.getOutputs().at(0)};
888   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
889
890   auto output_tensor = _tensor_reg->getAclTensor(output_index);
891   auto input_tensor = _tensor_reg->getAclTensor(input_index);
892
893   // NOTE This operation must not change the layout from frontend to backend,
894   //      so PermutationOperationPass makes the frontend and backend layouts the same.
895   const auto frontend_layout = _current_layout;
896   const auto backend_layout = output_tensor->layout();
897   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
898          frontend_layout == backend_layout);
899   UNUSED_RELEASE(frontend_layout);
900   UNUSED_RELEASE(backend_layout);
901
902   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
903                                                                    output_tensor->handle());
904
905   _return_fn = asAclFunction(std::move(fn));
906 }
907
908 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
909 {
910   const auto ofm_index{node.getOutputs().at(0)};
911   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
912
913   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
914   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
915
916   auto fn = acl_common::generateLayer<arm_compute::NEScale>(
917       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
918       ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
919       ::arm_compute::SamplingPolicy::TOP_LEFT);
920
921   _return_fn = asAclFunction(std::move(fn));
922 }
923
924 void KernelGenerator::visit(const ir::operation::RNN &node)
925 {
926   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
927   const auto hidden_state_out_index{
928       node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
929
930   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
931   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
932   const auto recurrent_weights_index{
933       node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
934   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
935   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
936
937   const auto activation = node.param().activation;
938
939   auto output_tensor = _tensor_reg->getAclTensor(output_index);
940   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
941
942   auto input_tensor = _tensor_reg->getAclTensor(input_index);
943   auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
944   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
945   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
946   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
947   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
948
949   auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
950       hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
951   _return_fn = asAclFunction(std::move(copy_layer));
952
953   auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
954       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
955       weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
956       hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
957   _return_fn = asAclFunction(std::move(fn));
958 }
959
960 void KernelGenerator::visit(const ir::operation::Squeeze &node)
961 {
962   // Squeeze is identical to Reshape except that it has an optional dimensions input.
963   // The optional dims input is ignored here since the output tensor already has the
964   // squeezed shape produced by the freezer and toco.
965   const auto output_index{node.getOutputs().at(0)};
966   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
967   const auto dims{node.param().dims};
968   const auto ndim{node.param().ndim};
969   (void)dims;
970   (void)ndim;
971
972   auto output_tensor = _tensor_reg->getAclTensor(output_index);
973   auto input_tensor = _tensor_reg->getAclTensor(input_index);
974   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
975                                                                    output_tensor->handle());
976   _return_fn = asAclFunction(std::move(fn));
977 }
978
979 void KernelGenerator::visit(const ir::operation::Softmax &node)
980 {
981   const auto output_index{node.getOutputs().at(0)};
982   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
983   const auto beta = node.param().beta;
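  // beta scales the logits, i.e. softmax(x)_i = exp(beta * x_i) / sum_j exp(beta * x_j);
  // beta == 1.0 gives the ordinary softmax.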
984
985   auto output_tensor = _tensor_reg->getAclTensor(output_index);
986   auto input_tensor = _tensor_reg->getAclTensor(input_index);
987
988   // Disable applied dim_correction
989   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
990   {
991     // This means that the higher dimensions are 1 and dim_correction has been applied to the input tensor
992     acl_common::disableDimCorrection(input_tensor);
993   }
994
995   auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
996       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
997       output_tensor->handle(), beta);
998
999   // Revert disabling applied dim_correction
1000   if (input_tensor->dimension(0) == 1)
1001   {
1002     acl_common::enableDimCorrection(input_tensor);
1003   }
1004
1005   _return_fn = asAclFunction(std::move(fn));
1006 }
1007
1008 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
1009 {
1010   const auto ofm_index{node.getOutputs().at(0)};
1011   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
1012   const auto block_size_index{
1013       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
1014   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
1015
1016   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1017   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1018   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
1019   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
1020
1021   assert(_ctx.at(block_size_index).data());
1022   assert(_ctx.at(paddings_index).data());
1023
1024   auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
1025       ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1026       ofm_tensor->handle());
1027
1028   _return_fn = asAclFunction(std::move(fn));
1029 }
1030
1031 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1032 {
1033   const auto ofm_index{node.getOutputs().at(0)};
1034   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1035
1036   auto block_size = node.param().block_size;
1037
1038   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1039   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1040
1041   auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
1042       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1043
1044   _return_fn = asAclFunction(std::move(fn));
1045 }
1046
1047 void KernelGenerator::visit(const ir::operation::Split &node)
1048 {
1049   // TODO Support this op by SubTensor
1050   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1051   const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
1052
1053   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1054   if (!_ctx.at(axis_index).isConstant())
1055   {
1056     throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
1057   }
1058
1059   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1060   std::vector<ir::OperandIndex> output_indexes;
1061   for (const auto &output : node.getOutputs())
1062     output_indexes.emplace_back(output);
1063
1064   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1065   std::vector<arm_compute::ITensor *> output_tensors;
1066   for (const auto &ofm_ind : output_indexes)
1067     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1068
1069   const auto frontend_layout = _current_layout;
1070   const auto backend_layout = ifm_tensor->layout();
1071   auto axis = _ctx.at(axis_index).asScalar<int32_t>();
1072   if (axis < 0)
1073     axis += ifm_rank;
1074   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1075
1076   auto fn =
1077       acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
1078
1079   _return_fn = asAclFunction(std::move(fn));
1080 }
1081
1082 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1083 {
1084   const auto ofm_index{node.getOutputs().at(0)};
1085   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1086   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1087
1088   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1089   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1090   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1091
1092   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
1093       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1094
1095   _return_fn = asAclFunction(std::move(fn));
1096 }
1097
1098 void KernelGenerator::visit(const ir::operation::Slice &node)
1099 {
1100   const auto output_index{node.getOutputs().at(0)};
1101   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
1102   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
1103   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
1104
1105   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1106   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1107   const auto frontend_layout = _current_layout;
1108   const auto backend_layout = inputData_tensor->layout();
1109
1110   // Set initializers for indices data such as order of inputData
1111   int input_rank = _ctx.at(input_index).shape().rank();
1112   std::vector<int32_t> starts;
1113   std::vector<int32_t> ends;
1114   starts.resize(input_rank, 0);
1115   ends.resize(input_rank, 0);
1116   {
1117     auto beginData_base = _ctx.at(begins_index).data()->base();
1118     auto sizeData_base = _ctx.at(sizes_index).data()->base();
1119     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
1120     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
1121
1122     using ir::DataType;
1123
1124     UNUSED_RELEASE(beginData_size);
1125     UNUSED_RELEASE(sizeData_size);
1126
1127     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
1128     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
1129     assert(beginData_size == input_rank);
1130     assert(sizeData_size == input_rank);
1131
1132     assert(beginData_base != nullptr);
1133     for (int n = 0; n < input_rank; ++n)
1134     {
1135       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1136                                                                  backend_layout)
1137                       .value();
1138
1139       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
1140       starts[axis] = begin_value;
1141
1142       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
1143       ends[axis] = begin_value + size_value;
1144     }
1145   }
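  // NESlice works with end coordinates rather than sizes, so ends[axis] = begin + size
  // above converts the frontend's (begin, size) pair, e.g. begin 1 with size 3 selects
  // the half-open range [1, 4).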
1146
1147   ::arm_compute::Coordinates starts_set;
1148   ::arm_compute::Coordinates ends_set;
1149
1150   for (size_t i = 0; i < starts.size(); ++i)
1151   {
1152     starts_set.set(i, starts[i]);
1153     ends_set.set(i, ends[i]);
1154   }
1155
1156   auto fn = acl_common::generateLayer<arm_compute::NESlice>(
1157       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
1158
1159   _return_fn = asAclFunction(std::move(fn));
1160 }
1161
1162 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
1163 {
1164   const auto output_index{node.getOutputs().at(0)};
1165   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
1166   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
1167   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
1168   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
1169
1170   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1171   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1172   const auto frontend_layout = _current_layout;
1173   const auto backend_layout = inputData_tensor->layout();
1174
1175   // Set initializers for indices data such as order of inputData
1176   int input_rank = _ctx.at(input_index).shape().rank();
1177   std::vector<int32_t> starts;
1178   std::vector<int32_t> ends;
1179   std::vector<int32_t> strides;
1180   starts.resize(input_rank, 0);
1181   ends.resize(input_rank, 0);
1182   strides.resize(input_rank, 0);
1183   {
1184     auto startData_base = _ctx.at(starts_index).data()->base();
1185     auto endData_base = _ctx.at(ends_index).data()->base();
1186     auto stridesData_base = _ctx.at(strides_index).data()->base();
1187     const int startData_size = _ctx.at(starts_index).shape().num_elements();
1188     const int endData_size = _ctx.at(ends_index).shape().num_elements();
1189     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
1190
1191     using ir::DataType;
1192
1193     UNUSED_RELEASE(startData_size);
1194     UNUSED_RELEASE(endData_size);
1195     UNUSED_RELEASE(stridesData_size);
1196
1197     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
1198     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
1199     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
1200     assert(startData_size == input_rank);
1201     assert(endData_size == input_rank);
1202     assert(stridesData_size == input_rank);
1203
1204     assert(startData_base != nullptr);
1205     for (int n = 0; n < input_rank; ++n)
1206     {
1207       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1208                                                                  backend_layout)
1209                       .value();
1210
1211       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
1212       starts[axis] = start_value;
1213
1214       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
1215       ends[axis] = end_value;
1216
1217       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
1218       strides[axis] = strides_value;
1219     }
1220   }
1221
1222   // Set mask bits such as order of inputData
1223   // FIXME Take the layouts into account.
1224   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
1225   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
1226   const auto shrink_axis_mask =
1227       acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
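  // ReorderBits is assumed to mirror the axis reversal applied to starts/ends above,
  // e.g. with input_rank == 4 a frontend begin_mask of 0b0001 (axis 0) would become
  // 0b1000 (ARM Compute axis 3).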
1228
1229   ::arm_compute::Coordinates starts_set;
1230   ::arm_compute::Coordinates ends_set;
1231   ::arm_compute::BiStrides strides_set;
1232
1233   for (size_t i = 0; i < starts.size(); ++i)
1234   {
1235     starts_set.set(i, starts[i]);
1236     ends_set.set(i, ends[i]);
1237     strides_set.set(i, strides[i]);
1238   }
1239
1240   // Disable applied dim_correction
1241   if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
1242   {
1243     // This means that the higher dimensions are 1 and dim_correction has been applied to the input tensor
1244     acl_common::disableDimCorrection(inputData_tensor);
1245   }
1246
1247   auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
1248       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
1249       begin_mask, end_mask, shrink_axis_mask);
1250
1251   // Revert disabling applied dim_correction
1252   if (inputData_tensor->dimension(0) == 1)
1253   {
1254     acl_common::enableDimCorrection(inputData_tensor);
1255   }
1256
1257   _return_fn = asAclFunction(std::move(fn));
1258 }
1259
1260 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1261 {
1262   const auto ofm_index{node.getOutputs().at(0)};
1263   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1264   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1265
1266   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
1267   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
1268   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);
1269
1270   const auto stride = node.param().stride;
1271
1272   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1273          (node.param().padding.type == ir::PaddingType::VALID));
1274   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1275                                       ker_shape.W, ker_shape.H);
1276
1277   uint32_t invalid_horizontal = 0;
1278   uint32_t invalid_vertical = 0;
1279   if (node.param().padding.type == ir::PaddingType::VALID)
1280   {
1281     invalid_horizontal =
1282         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1283     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1284   }
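       // (Assumption) invalid_horizontal/vertical count the right/bottom output columns/rows that the
       // transposed convolution cannot fill when the requested output is larger than what the
       // kernel and stride produce; they are passed to NETransposeConvLayer so it can skip them.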
1285
1286   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1287   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1288   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1289
1290   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1291
1292   auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
1293       ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
1294       invalid_horizontal, invalid_vertical);
1295
1296   _return_fn = asAclFunction(std::move(fn));
1297 }
1298
1299 void KernelGenerator::visit(const ir::operation::Transpose &node)
1300 {
1301   const auto ofm_idx{node.getOutputs().at(0)};
1302   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
1303   const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
1304
1305   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
1306   const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
1307   const auto frontend_layout = _current_layout;
1308   const auto backend_layout = ifm_tensor->layout();
1309   const auto rank = _ctx.at(ifm_idx).shape().rank();
1310
1311   const auto &perms = _ctx.at(perm_idx);
1312   std::vector<int32_t> pv;
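       // An empty PERMUTATION operand (shape {0}) means "reverse all dimensions",
       // e.g. rank 3 becomes {2, 1, 0}.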
1313   if (perms.shape() == ir::Shape{0})
1314   {
1315     pv.resize(rank);
1316     std::iota(pv.begin(), pv.end(), 0);
1317     std::reverse(pv.begin(), pv.end());
1318   }
1319   else
1320   {
1321     pv = _ctx.at(perm_idx).asVector<int32_t>();
1322   }
1323
1324   std::unique_ptr<arm_compute::IFunction> fn;
1325   if (rank == 1)
1326   {
1327     fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
1328   }
1329   else if (rank == 2)
1330   {
1331     assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
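         // Only the {1, 0} permutation is expected for rank 2; it maps directly to a
         // plain matrix transpose (NETranspose).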
1332     fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
1333                                                              ofm_tensor->handle());
1334   }
1335   else
1336   {
1337     auto backend_pv =
1338         acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
1339
1340     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
1341                                                            ofm_tensor->handle(), backend_pv);
1342   }
1343   _return_fn = asAclFunction(std::move(fn));
1344 }
1345
1346 void KernelGenerator::visit(const ir::operation::Unpack &node)
1347 {
1348   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1349   auto axis{node.param().axis};
1350
1351   const auto input_rank = _ctx.at(input_index).shape().rank();
1352
1353   std::vector<ir::OperandIndex> output_indexes;
1354   for (const auto &output_index : node.getOutputs())
1355     output_indexes.emplace_back(output_index);
1356
1357   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1358   std::vector<arm_compute::ITensor *> outputs;
1359   for (const auto &output_index : output_indexes)
1360     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1361
1362   const auto frontend_layout = _current_layout;
1363   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
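       // Normalize a negative axis into [0, input_rank), then convert it to the backend
       // (ARM Compute) axis order expected by NEUnstack.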
1364   if (axis < 0)
1365     axis += input_rank;
1366   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1367
1368   // Disable applied dim_correction
1369   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1370   {
1371     // The rank mismatch means dim_correction removed higher dimensions of size 1 from the input tensor info
1372     acl_common::disableDimCorrection(input_tensor);
1373   }
1374
1375   auto fn =
1376       acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
1377
1378   // Re-enable dim_correction if it was disabled above
1379   if (input_tensor->dimension(0) == 1)
1380   {
1381     acl_common::enableDimCorrection(input_tensor);
1382   }
1383
1384   _return_fn = asAclFunction(std::move(fn));
1385 }
1386
1387 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
1388 {
1389   const auto output_index{node.getOutputs().at(0)};
1390   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
1391
1392   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1393   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1394
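       // ExpandDims only inserts a dimension of size 1, so the element data is unchanged
       // and a plain NEReshapeLayer is sufficient.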
1395   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
1396                                                                    output_tensor->handle());
1397
1398   _return_fn = asAclFunction(std::move(fn));
1399 }
1400
1401 void KernelGenerator::visit(const ir::operation::Comparison &node)
1402 {
1403   const auto output_index{node.getOutputs().at(0)};
1404   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
1405   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
1406
1407   const auto comparison_type = node.param().comparison_type;
1408
1409   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1410   auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
1411   auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
1412
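       // The C-style cast below assumes the IR comparison enum values line up one-to-one
       // with arm_compute::ComparisonOperation.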
1413   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
1414       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
1415       (arm_compute::ComparisonOperation)comparison_type);
1416
1417   _return_fn = asAclFunction(std::move(fn));
1418 }
1419
1420 void KernelGenerator::visit(const ir::operation::OneHot &node)
1421 {
1422   const auto out_idx{node.getOutputs().at(0)};
1423   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
1424   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
1425   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
1426   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
1427
1428   auto output_tensor = _tensor_reg->getAclTensor(out_idx);
1429   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
1430   auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
1431   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
1432   auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
1433
1434   const size_t output_rank = _ctx.at(out_idx).shape().rank();
1435   const auto frontend_layout = _current_layout;
1436   const auto backend_layout = output_tensor->layout();
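       // An axis of -1 selects the innermost output dimension (output_rank - 1) before
       // converting to the ARM Compute axis order.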
1437   int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
1438   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
1439
1440   auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
1441       indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
1442       offvalue_tensor->handle(), output_tensor->handle(), axis);
1443   _return_fn = asAclFunction(std::move(fn));
1444 }
1445
1446 } // namespace acl_neon
1447 } // namespace backend
1448 } // namespace onert