runtime/onert/backend/acl_neon/KernelGenerator.cc
1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
20 #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "util/logging.h"
32 #include "util/Utils.h"
33 #include "AclKernelGen.h"
34
35 namespace onert
36 {
37 namespace backend
38 {
39 namespace acl_neon
40 {
41
42 using ::onert::backend::acl_common::asAclFunction;
43 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
44   ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
45
46 KernelGenerator::KernelGenerator(
47   const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
48   const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
49   : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()),
50     _operations_ctx(graph.operations()), _current_layout{graph.layout()},
51     _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
52 {
53   // DO NOTHING
54 }
55
56 std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
57 {
58   auto ret = std::make_unique<exec::FunctionSequence>();
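  // Dynamic shape inference stays disabled here: the acl_neon kernels below are configured once
  // with static shapes, so this backend does not handle dynamic tensors.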
59   ret->enableDynamicShapeInferer(false);
60
61   const auto &op = _graph.operations().at(ind);
62   op.accept(*this);
63   ret->append(releaseFunction());
64   return ret;
65 }
66
67 void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
68 {
69   const auto ofm_index{node.getOutputs().at(0)};
70   const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
71   const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};
72
73   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
74
75   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
76   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
77   auto frontend_layout = _current_layout;
78   auto backend_layout = ifm_tensor->layout();
79
80   int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
81   if (axis_value < 0)
82   {
83     axis_value += ifm_rank;
84   }
85   assert(axis_value >= 0 && axis_value < ifm_rank);
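  // ToARMComputeAxis remaps the frontend axis to ACL's reversed (innermost-first) dimension order,
  // permuting between NHWC and NCHW when the frontend and backend layouts differ.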
86   const auto fixed_axis =
87     acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
88   auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
89                                              : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
90
91   auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
92     ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);
93
94   _return_fn = asAclFunction(std::move(fn));
95 }
96
97 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
98 {
99   const auto ofm_index{node.getOutputs().at(0)};
100   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
101   const auto block_size_index{
102     node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
103
104   const auto NNApiInputs = 2;
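  // The NN API form of BatchToSpaceND has 2 inputs; a third CROPS input is only accepted when it
  // is a constant whose values are all zero (checked below).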
105   if (node.getInputs().size() != NNApiInputs)
106   {
107     const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
108     if (!_ctx.at(crops_index).isConstant())
109     {
110       throw std::runtime_error("Non-constant crops NYI for acl_neon backend BatchToSpaceND");
111     }
112
113     auto crops = _ctx.at(crops_index).asVector<int32_t>();
114     for (auto &&crop : crops)
115     {
116       if (crop != 0)
117       {
118         throw std::runtime_error("Non-zero crops NYI for acl_neon backend BatchToSpaceND");
119       }
120     }
121   }
122
123   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
124   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
125   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
126
127   assert(_ctx.at(block_size_index).data());
128
129   auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
130     ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
131
132   _return_fn = asAclFunction(std::move(fn));
133 }
134
135 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
136 {
137   const auto ofm_index{node.getOutputs().at(0)};
138   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
139   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
140
141   const auto activation = node.param().activation;
142
143   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
144   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
145   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
146
147   std::unique_ptr<arm_compute::IFunction> fn;
148   switch (node.param().arithmetic_type)
149   {
150     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
151     {
152       fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
153         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
154         arm_compute::ConvertPolicy::SATURATE);
155       break;
156     }
157     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
158     {
159       fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
160         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
161         arm_compute::ConvertPolicy::SATURATE);
162       break;
163     }
164     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
165     {
166       // For scale == 1.0, the only allowed RoundingPolicy is RoundingPolicy::TO_ZERO
167       fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
168         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
169         arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
170       break;
171     }
172     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
173     {
174       fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
175         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
176       break;
177     }
178     default:
179       assert(false && "BinaryArithmetic: unsupported arithmetic type");
180       break;
181   }
182   _return_fn = std::make_unique<exec::FunctionSequence>(
183     asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
184 }
185
186 void KernelGenerator::visit(const ir::operation::Conv2D &node)
187 {
188   using ir::operation::Conv2D;
189
190   const auto ofm_index{node.getOutputs().at(0)};
191   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
192   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
193   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
194
195   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
196   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
197   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
198   const auto &ker_shape = _ctx.at(ker_index).shape();
199   const auto ker_height = ker_shape.dim(1);
200   const auto ker_width = ker_shape.dim(2);
201
202   const auto stride = node.param().stride;
203   const auto padding =
204     ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
205   const auto activation = node.param().activation;
206
207   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
208   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
209   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
210   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
211
212   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
213   const auto act_info = acl_common::asActivationLayerInfo(activation);
214
215   auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
216     _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
217     ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
218     ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
219
220   _return_fn = asAclFunction(std::move(fn));
221 }
222
223 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
224 {
225   const auto output_index{node.getOutputs().at(0)};
226   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
227
228   auto block_size = node.param().block_size;
229   assert(block_size > 0);
230
231   auto output_tensor = _tensor_reg->getAclTensor(output_index);
232   auto input_tensor = _tensor_reg->getAclTensor(input_index);
233
234   auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
235     input_tensor->handle(), output_tensor->handle(), block_size);
236
237   _return_fn = asAclFunction(std::move(fn));
238 }
239
240 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
241 {
242   using ir::operation::DepthwiseConv2D;
243
244   const auto ofm_index{node.getOutputs().at(0)};
245   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
246   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
247   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
248
249   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
250   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
251   // Kernel format is [1, kernel_height, kernel_width, depth_out].
252   const auto &ker_shape = _ctx.at(ker_index).shape();
253   const auto ker_height = ker_shape.dim(1);
254   const auto ker_width = ker_shape.dim(2);
255
256   const auto stride = node.param().stride;
257   const auto dilation = node.param().dilation;
258   const auto padding =
259     ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
260                          dilation.width_factor, dilation.height_factor);
261   const auto multiplier = node.param().multiplier;
262   const auto activation = node.param().activation;
263
264   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
265   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
266   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
267   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
268
269   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
270   const auto act_info = acl_common::asActivationLayerInfo(activation);
271   const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
272
273   auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
274     ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
275     conv_info, multiplier, act_info, dilation_info);
276
277   _return_fn = asAclFunction(std::move(fn));
278 }
279
280 void KernelGenerator::visit(const ir::operation::Concat &node)
281 {
282   const auto ofm_index{node.getOutputs().at(0)};
283
284   std::vector<ir::OperandIndex> input_indexes;
285   for (const auto &input : node.getInputs())
286     input_indexes.emplace_back(input);
287
288   const auto axis = node.param().axis;
289
290   // Concat elimination check
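  // If every input is already a registered sub-tensor of the output, the data is laid out in
  // place and no copy is needed.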
291   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
292   if (eliminated)
293   {
294     // If concat eliminated, return a NOP IFunction
295     VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
296     _return_fn = std::make_unique<exec::NopFunction>();
297     return;
298   }
299
300   auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
301   std::vector<const ::arm_compute::ITensor *> input_tensors;
302   for (const auto &ifm_ind : input_indexes)
303     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
304
305   std::unique_ptr<::arm_compute::IFunction> fn;
306   if (input_indexes.size() < 2)
307   {
308     ::arm_compute::ITensor *input_tensor = _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
309     fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor, output_tensor->handle());
310   }
311   else
312   {
313     const auto rank = _ctx.at(ofm_index).shape().rank();
314     const auto frontend_layout = _current_layout;
315     const auto backend_layout = output_tensor->layout();
316     const auto fixed_axis =
317       acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
318     fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
319       input_tensors, output_tensor->handle(), fixed_axis);
320   }
321
322   _return_fn = asAclFunction(std::move(fn));
323 }
324
325 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
326 {
327   const auto ofm_index{node.getOutputs().at(0)};
328   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
329
330   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
331   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
332
333   const ::arm_compute::ActivationLayerInfo act_info =
334     acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);
335
336   std::unique_ptr<arm_compute::IFunction> fn =
337     acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
338                                                               ofm_tensor->handle(), act_info);
339
340   _return_fn = asAclFunction(std::move(fn));
341 }
342
343 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
344 {
345   const auto output_index{node.getOutputs().at(0)};
346   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
347   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
348
349   auto output_tensor = _tensor_reg->getAclTensor(output_index);
350   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
351   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
352
353   std::unique_ptr<arm_compute::IFunction> fn;
354   switch (node.param().op_type)
355   {
356     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
357     {
358       fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
359         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
360       break;
361     }
362     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
363     {
364       fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
365         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
366       break;
367     }
368     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
369     {
370       fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
371         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
372       break;
373     }
374     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
375     {
376       fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
377         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
378       break;
379     }
380     default:
381     {
382       std::string err_msg("acl_neon KernelGenerator : " + node.name() +
383                           "is not elementwise-binary operations");
384       assert(false && err_msg.c_str());
385       break;
386     }
387   }
388   _return_fn = asAclFunction(std::move(fn));
389 }
390
391 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
392 {
393   const auto output_index{node.getOutputs().at(0)};
394   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
395
396   auto output_tensor = _tensor_reg->getAclTensor(output_index);
397   auto input_tensor = _tensor_reg->getAclTensor(input_index);
398
399   std::unique_ptr<arm_compute::IFunction> fn;
400   switch (node.param().op_type)
401   {
402     case ir::operation::ElementwiseUnary::Type::ABS:
403     {
404       const ::arm_compute::ActivationLayerInfo act_info{
405         ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
406
407       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
408         input_tensor->handle(), output_tensor->handle(), act_info);
409       break;
410     }
411     case ir::operation::ElementwiseUnary::Type::CAST:
412     {
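      // A cast between identical data types degenerates to a copy; BOOL8 inputs go through
      // NECastBool, and everything else through NECast.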
413       if (input_tensor->data_type() == output_tensor->data_type())
414       {
415         fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
416                                                             output_tensor->handle());
417       }
418       else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
419       {
420         fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
421                                                                 output_tensor->handle());
422       }
423       else
424       {
425         fn = acl_common::generateLayer<arm_compute::NECast>(
426           input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
427       }
428       break;
429     }
430     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
431     {
432       fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
433                                                                          output_tensor->handle());
434       break;
435     }
436     case ir::operation::ElementwiseUnary::Type::EXP:
437     {
438       fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
439                                                               output_tensor->handle());
440       break;
441     }
442     case ir::operation::ElementwiseUnary::Type::FLOOR:
443     {
444       fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
445                                                            output_tensor->handle());
446       break;
447     }
448     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
449     {
450       fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
451                                                                 output_tensor->handle());
452       break;
453     }
454     case ir::operation::ElementwiseUnary::Type::NEG:
455     {
456       fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
457                                                               output_tensor->handle());
458       break;
459     }
460     case ir::operation::ElementwiseUnary::Type::RSQRT:
461     {
462       fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
463                                                                 output_tensor->handle());
464       break;
465     }
466     case ir::operation::ElementwiseUnary::Type::SQRT:
467     {
468       const ::arm_compute::ActivationLayerInfo act_info{
469         ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
470
471       fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
472         input_tensor->handle(), output_tensor->handle(), act_info);
473       break;
474     }
475     default:
476     {
477       throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
478                                "is not supported yet");
479       break;
480     }
481   }
482   _return_fn = asAclFunction(std::move(fn));
483 }
484
485 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
486 {
487   const auto output_index{node.getOutputs().at(0)};
488   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
489   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
490
491   auto output_tensor = _tensor_reg->getAclTensor(output_index);
492   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
493   auto values_tensor = _tensor_reg->getAclTensor(values_index);
494
495   auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
496     values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
497
498   _return_fn = asAclFunction(std::move(fn));
499 }
500
501 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
502 {
503   const auto output_index{node.getOutputs().at(0)};
504   auto output_tensor = _tensor_reg->getAclTensor(output_index);
505   const auto activation = node.param().activation;
506   if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
507     throw std::runtime_error(
508       "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights is not supported.");
509
510   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
511                                                 ::arm_compute::NEFullyConnectedReshapingLayer>(
512     node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
513   _return_fn = std::make_unique<exec::FunctionSequence>(
514     std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
515 }
516
517 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
518 {
519   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
520   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
521
522   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
523   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
524   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
525
526   auto output_tensor = _tensor_reg->getAclTensor(output_index);
527   auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
528
529   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
530   auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
531   auto values_tensor = _tensor_reg->getAclTensor(values_index);
532
533   auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
534     lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
535     output_tensor->handle(), hits_tensor->handle());
536
537   _return_fn = asAclFunction(std::move(fn));
538 }
539
540 void KernelGenerator::visit(const ir::operation::Gather &node)
541 {
542   const auto ofm_index{node.getOutputs().at(0)};
543
544   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
545   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
546
547   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
548   const auto axis_raw = node.param().axis;
549   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
550   // Convert to ACL's reversed (innermost-first) axis order
551   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
552
553   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
554   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
555   auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
556   const auto backend_layout = ofm_tensor->layout();
557   UNUSED_RELEASE(backend_layout);
558
559   // NOTE The frontend layout and the backend layout must be the same for this operation.
560   //      If they differ, we would have to add a stage that permutes the output tensor, which
561   //      is inefficient even if it works. In that case it would be better to give these
562   //      backend tensors the same layout.
563   //      There is one more thing to consider: this operation depends on the layout of the
564   //      model. For example, if an NHWC model uses this operation with output rank == 4,
565   //      indices rank == 2 and axis == 2, it gathers over the W and C axes, but W and C are
566   //      not adjacent in NCHW, so an NCHW backend cannot handle this case.
567   assert(backend_layout == ifm_tensor->layout());
568   assert(backend_layout == indices_tensor->layout());
569   assert(ifm_rank < 4 || _current_layout == backend_layout);
570
571   // input is n-D, indices k-D, output is (n + k - 1)-D
572   size_t n = ifm_rank;
573   assert(n == ifm_tensor->num_dimensions());
574   size_t k = _ctx.at(indices_index).shape().rank();
575   assert(k == indices_tensor->num_dimensions());
576
577   // Disable applied dim_correction
578   if (n != ifm_tensor->info()->num_dimensions())
579   {
580     // The higher dimensions are 1 and dim_correction has been applied to the ifm tensor
581     acl_common::disableDimCorrection(ifm_tensor);
582   }
583   if (k != indices_tensor->info()->num_dimensions())
584   {
585     // The higher dimensions are 1 and dim_correction has been applied to the indices tensor
586     acl_common::disableDimCorrection(indices_tensor);
587   }
588
589   auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
590     ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
591
592   // Revert disabling applied dim_correction
593   if (ifm_tensor->dimension(0) == 1)
594   {
595     acl_common::enableDimCorrection(ifm_tensor);
596   }
597   if (indices_tensor->dimension(0) == 1)
598   {
599     acl_common::enableDimCorrection(indices_tensor);
600   }
601
602   _return_fn = asAclFunction(std::move(fn));
603 }
604
605 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
606 {
607   const auto ofm_index{node.getOutputs().at(0)};
608   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
609   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
610   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
611
612   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
613   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
614   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
615   auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
616   auto epsilon = node.param().epsilon;
617   auto activation = node.param().activation;
618
619   auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
620     ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
621     epsilon);
622
623   _return_fn = std::make_unique<exec::FunctionSequence>(
624     asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
625 }
626
627 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
628 {
629   const auto ofm_index{node.getOutputs().at(0)};
630   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
631
632   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
633   // L2 Normalization always performs the reduction along the depth axis
634   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
635   // choosing normalization parameters as below
636
637   const auto &ifm_shape = _ctx.at(ifm_index).shape();
638   // TODO Support an optional constant dimension to perform the normalization on
639   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
640   int32_t radius =
641     2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
642   float alpha = 1.0f;                          // Chosen so that alpha_ becomes 1 in the implementation
643   float beta = 0.5f;                           // pow(reduction, -0.5) = 1 / sqrt(reduction)
644   float bias = 0.0f;                           // Don't offset the reduction.
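  // With these parameters the CROSS_MAP normalization computes in / sqrt(sum(in^2)) over the
  // depth axis, which is exactly an L2 normalization along depth.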
645
646   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
647   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
648
649   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
650                                                                radius, alpha, beta, bias, false);
651
652   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
653     ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
654
655   _return_fn = asAclFunction(std::move(fn));
656 }
657
658 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
659 {
660   const auto ofm_index{node.getOutputs().at(0)};
661   const auto ifm_index{
662     node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
663
664   auto radius = node.param().radius;
665   auto alpha = node.param().alpha;
666   auto beta = node.param().beta;
667   auto bias = node.param().bias;
668
669   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
670   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
671
672   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
673     ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
674
675   auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
676     ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
677
678   _return_fn = asAclFunction(std::move(fn));
679 }
680
681 void KernelGenerator::visit(const ir::operation::LSTM &node)
682 {
683   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
684                                          ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
685 }
686
687 void KernelGenerator::visit(const ir::operation::Pack &node)
688 {
689   const auto output_index{node.getOutputs().at(0)};
690   auto axis{node.param().axis};
691
692   const auto output_rank = _ctx.at(output_index).shape().rank();
693
694   std::vector<ir::OperandIndex> input_indexes;
695   for (const auto &input_index : node.getInputs())
696     input_indexes.emplace_back(input_index);
697
698   auto output = _tensor_reg->getAclTensor(output_index)->handle();
699   std::vector<arm_compute::ITensor *> inputs;
700   for (const auto &input_index : input_indexes)
701     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
702
703   const auto frontend_layout = _current_layout;
704   const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
705
706   if (axis < 0)
707     axis += output_rank;
708   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
709
710   // Disable applied dim_correction
711   for (const auto &input_index : input_indexes)
712   {
713     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
714     if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
715     {
716       // The higher dimensions are 1 and dim_correction has been applied to the input tensor
717       acl_common::disableDimCorrection(input_tensor);
718     }
719   }
720
721   auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);
722
723   // Revert disabling applied dim_correction
724   for (const auto &input_index : input_indexes)
725   {
726     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
727     if (input_tensor->dimension(0) == 1)
728     {
729       acl_common::enableDimCorrection(input_tensor);
730     }
731   }
732
733   _return_fn = asAclFunction(std::move(fn));
734 }
735
736 void KernelGenerator::visit(const ir::operation::Pad &node)
737 {
738   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
739   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
740   const auto output_index{node.getOutputs().at(0)};
741   assert(_ctx.at(pad_index).data());
742
743   auto rank = _ctx.at(input_index).shape().rank();
744   auto pad_base = _ctx.at(pad_index).data()->base();
745
746   auto input = _tensor_reg->getAclTensor(input_index)->handle();
747   auto output = _tensor_reg->getAclTensor(output_index)->handle();
748
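  // The PAD operand stores rank pairs of (before, after) padding; remap each pair to the
  // corresponding ACL axis before building the PaddingList.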
749   ::arm_compute::PaddingList padding_list;
750   padding_list.resize(rank);
751   for (int32_t n = 0; n < rank; ++n)
752   {
753     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
754
755     const auto frontend_layout = _current_layout;
756     const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
757     const auto axis =
758       acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
759     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
760   }
761
762   const auto input_type = _ctx.at(input_index).typeInfo();
763   UNUSED_RELEASE(input_type);
764   assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
765   assert(input->info()->quantization_info() ==
766          ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point()));
767   const auto pixel_value =
768     ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
769
770   auto fn =
771     acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
772
773   _return_fn = asAclFunction(std::move(fn));
774 }
775
776 void KernelGenerator::visit(const ir::operation::Pool2D &node)
777 {
778   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
779     node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
780
781   const auto ofm_index{node.getOutputs().at(0)};
782   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
783   const auto activation = node.param().activation;
784   _return_fn = std::make_unique<exec::FunctionSequence>(
785     asAclFunction(std::move(raw_fn)),
786     ActivationBuilder::generate(activation, ofm_tensor->handle()));
787 }
788
789 void KernelGenerator::visit(const ir::operation::Permute &node)
790 {
791   const auto ofm_idx{node.getOutputs().at(0)};
792   const auto ifm_idx{node.getInputs().at(0)};
793   const auto permute_type = node.getPermuteType();
794   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
795   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
796   const auto rank = _ctx.at(ofm_idx).shape().rank();
797   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
798
799   std::unique_ptr<::arm_compute::IFunction> fn;
800   arm_compute::PermutationVector pv;
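  // The PermutationVector below is expressed in ACL's dimension order (index 0 = innermost),
  // hence the WHCN/CWHN notation in the comments.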
801   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
802   {
803     // WHCN -> CWHN
804     pv = arm_compute::PermutationVector{2, 0, 1};
805
806     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
807                                                            ofm_tensor->handle(), pv);
808   }
809   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
810   {
811     // CWHN -> WHCN
812     pv = arm_compute::PermutationVector{1, 2, 0};
813
814     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
815                                                            ofm_tensor->handle(), pv);
816   }
817   else
818   {
819     fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
820   }
821   _return_fn = asAclFunction(std::move(fn));
822 }
823
824 void KernelGenerator::visit(const ir::operation::PReLU &node)
825 {
826   const auto ofm_index{node.getOutputs().at(0)};
827   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
828   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
829
830   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
831   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
832   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
833
834   auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
835     ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
836
837   _return_fn = asAclFunction(std::move(fn));
838 }
839
840 void KernelGenerator::visit(const ir::operation::Reduce &node)
841 {
842   const auto output_index{node.getOutputs().at(0)};
843   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
844   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
845
846   auto output_tensor = _tensor_reg->getAclTensor(output_index);
847   auto input_tensor = _tensor_reg->getAclTensor(input_index);
848
849   // Convert to ACL axes taking into account negative values and possible duplicates.
850   const auto &axes = _ctx.at(axes_index);
851   const auto input_rank = _ctx.at(input_index).shape().rank();
852   const auto frontend_layout = _current_layout;
853   const auto backend_layout = input_tensor->layout();
854   const auto reduce_axes =
855     acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
856   const auto reduce_type = node.param().reduce_type;
857   const auto keep_dims = node.param().keep_dims;
858
859   std::unique_ptr<::arm_compute::IFunction> fn;
860   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
861   {
862     fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
863                                                               keep_dims, output_tensor->handle());
864   }
865   else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
866   {
867     fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
868                                                              keep_dims, output_tensor->handle());
869   }
870   else
871   {
872     fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
873       input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
874       acl_common::convertReduceType(reduce_type));
875   }
876   _return_fn = asAclFunction(std::move(fn));
877 }
878
879 void KernelGenerator::visit(const ir::operation::Reshape &node)
880 {
881   const auto output_index{node.getOutputs().at(0)};
882   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
883
884   auto output_tensor = _tensor_reg->getAclTensor(output_index);
885   auto input_tensor = _tensor_reg->getAclTensor(input_index);
886
887   // NOTE This operation must not change the layout from frontend to backend,
888   //      so PermutationOperationPass makes the frontend and backend layouts the same.
889   const auto frontend_layout = _current_layout;
890   const auto backend_layout = output_tensor->layout();
891   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
892          frontend_layout == backend_layout);
893   UNUSED_RELEASE(frontend_layout);
894   UNUSED_RELEASE(backend_layout);
895
896   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
897                                                                    output_tensor->handle());
898
899   _return_fn = asAclFunction(std::move(fn));
900 }
901
902 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
903 {
904   const auto ofm_index{node.getOutputs().at(0)};
905   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
906
907   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
908   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
909
910   auto fn = acl_common::generateLayer<arm_compute::NEScale>(
911     ifm_tensor->handle(), ofm_tensor->handle(),
912     ::arm_compute::ScaleKernelInfo{::arm_compute::InterpolationPolicy::BILINEAR,
913                                    ::arm_compute::BorderMode::REPLICATE,
914                                    ::arm_compute::PixelValue(0.f),
915                                    ::arm_compute::SamplingPolicy::TOP_LEFT, false /*use padding*/});
916
917   _return_fn = asAclFunction(std::move(fn));
918 }
919
920 void KernelGenerator::visit(const ir::operation::RNN &node)
921 {
922   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
923   const auto hidden_state_out_index{
924     node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
925
926   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
927   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
928   const auto recurrent_weights_index{
929     node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
930   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
931   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
932
933   const auto activation = node.param().activation;
934
935   auto output_tensor = _tensor_reg->getAclTensor(output_index);
936   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
937
938   auto input_tensor = _tensor_reg->getAclTensor(input_index);
939   auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
940   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
941   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
942   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
943   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
944
945   auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
946     hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
947   // Sequence the copy before the RNN layer so hidden_state_in is propagated into hidden_state_out
948
949   auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
950     _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
951     weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
952     hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
953   _return_fn = std::make_unique<exec::FunctionSequence>(asAclFunction(std::move(copy_layer)), asAclFunction(std::move(fn)));
954 }
955
956 void KernelGenerator::visit(const ir::operation::Squeeze &node)
957 {
958   // Squeeze is identical to reshape except that it has an optional dimensions input.
959   // The optional dims_index is ignored here because the output tensor already has the
960   // squeezed shape produced by the freezer and toco
961   const auto output_index{node.getOutputs().at(0)};
962   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
963   const auto dims{node.param().dims};
964   const auto ndim{node.param().ndim};
965   (void)dims;
966   (void)ndim;
967
968   auto output_tensor = _tensor_reg->getAclTensor(output_index);
969   auto input_tensor = _tensor_reg->getAclTensor(input_index);
970   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
971                                                                    output_tensor->handle());
972   _return_fn = asAclFunction(std::move(fn));
973 }
974
975 void KernelGenerator::visit(const ir::operation::Softmax &node)
976 {
977   const auto output_index{node.getOutputs().at(0)};
978   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
979   const auto beta = node.param().beta;
980
981   auto output_tensor = _tensor_reg->getAclTensor(output_index);
982   auto input_tensor = _tensor_reg->getAclTensor(input_index);
983
984   // NOTE NESoftmaxLayer's default axis is -1
985   auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
986     _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
987     output_tensor->handle(), beta);
988
989   _return_fn = asAclFunction(std::move(fn));
990 }
991
992 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
993 {
994   const auto ofm_index{node.getOutputs().at(0)};
995   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
996   const auto block_size_index{
997     node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
998   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
999
1000   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1001   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1002   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
1003   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
1004
1005   assert(_ctx.at(block_size_index).data());
1006   assert(_ctx.at(paddings_index).data());
1007
1008   auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
1009     ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1010     ofm_tensor->handle());
1011
1012   _return_fn = asAclFunction(std::move(fn));
1013 }
1014
1015 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1016 {
1017   const auto ofm_index{node.getOutputs().at(0)};
1018   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1019
1020   auto block_size = node.param().block_size;
1021
1022   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1023   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1024
1025   auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
1026     ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1027
1028   _return_fn = asAclFunction(std::move(fn));
1029 }
1030
1031 void KernelGenerator::visit(const ir::operation::Split &node)
1032 {
1033   // TODO Support this op by SubTensor
1034   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1035   const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
1036
1037   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1038   if (!_ctx.at(axis_index).isConstant())
1039   {
1040     throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
1041   }
1042
1043   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1044   std::vector<ir::OperandIndex> output_indexes;
1045   for (const auto &output : node.getOutputs())
1046     output_indexes.emplace_back(output);
1047
1048   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1049   std::vector<arm_compute::ITensor *> output_tensors;
1050   for (const auto &ofm_ind : output_indexes)
1051     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1052
1053   const auto frontend_layout = _current_layout;
1054   const auto backend_layout = ifm_tensor->layout();
1055   auto axis = _ctx.at(axis_index).asScalar<int32_t>();
1056   if (axis < 0)
1057     axis += ifm_rank;
1058   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1059
1060   auto fn =
1061     acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
1062
1063   _return_fn = asAclFunction(std::move(fn));
1064 }
1065
1066 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1067 {
1068   const auto ofm_index{node.getOutputs().at(0)};
1069   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1070   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1071
1072   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1073   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1074   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1075
1076   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
1077     lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1078
1079   _return_fn = asAclFunction(std::move(fn));
1080 }
1081
1082 void KernelGenerator::visit(const ir::operation::Slice &node)
1083 {
1084   const auto output_index{node.getOutputs().at(0)};
1085   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
1086   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
1087   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
1088
1089   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1090   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1091   const auto frontend_layout = _current_layout;
1092   const auto backend_layout = inputData_tensor->layout();
1093
1094   // Fill the start/end coordinates, remapped to the axis order of inputData
1095   int input_rank = _ctx.at(input_index).shape().rank();
1096   std::vector<int32_t> starts;
1097   std::vector<int32_t> ends;
1098   starts.resize(input_rank, 0);
1099   ends.resize(input_rank, 0);
1100   {
1101     auto beginData_base = _ctx.at(begins_index).data()->base();
1102     auto sizeData_base = _ctx.at(sizes_index).data()->base();
1103     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
1104     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
1105
1106     using ir::DataType;
1107
1108     UNUSED_RELEASE(beginData_size);
1109     UNUSED_RELEASE(sizeData_size);
1110
1111     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
1112     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
1113     assert(beginData_size == input_rank);
1114     assert(sizeData_size == input_rank);
1115
1116     assert(beginData_base != nullptr);
1117     for (int n = 0; n < input_rank; ++n)
1118     {
1119       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1120                                                                  backend_layout)
1121                     .value();
1122
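      // ACL slice coordinates are absolute end positions, so (begin, size) is converted to
      // [begin, begin + size) per axis.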
1123       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
1124       starts[axis] = begin_value;
1125
1126       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
1127       ends[axis] = begin_value + size_value;
1128     }
1129   }
1130
1131   ::arm_compute::Coordinates starts_set;
1132   ::arm_compute::Coordinates ends_set;
1133
1134   for (size_t i = 0; i < starts.size(); ++i)
1135   {
1136     starts_set.set(i, starts[i]);
1137     ends_set.set(i, ends[i]);
1138   }
1139
1140   auto fn = acl_common::generateLayer<arm_compute::NESlice>(
1141     inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
1142
1143   _return_fn = asAclFunction(std::move(fn));
1144 }
1145
1146 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
1147 {
1148   const auto output_index{node.getOutputs().at(0)};
1149   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
1150   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
1151   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
1152   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
1153
1154   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
1155   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
1156   const auto frontend_layout = _current_layout;
1157   const auto backend_layout = inputData_tensor->layout();
1158
1159   // Fill the start/end/stride values, remapped to the axis order of inputData
1160   int input_rank = _ctx.at(input_index).shape().rank();
1161   std::vector<int32_t> starts;
1162   std::vector<int32_t> ends;
1163   std::vector<int32_t> strides;
1164   starts.resize(input_rank, 0);
1165   ends.resize(input_rank, 0);
1166   strides.resize(input_rank, 0);
1167   {
1168     auto startData_base = _ctx.at(starts_index).data()->base();
1169     auto endData_base = _ctx.at(ends_index).data()->base();
1170     auto stridesData_base = _ctx.at(strides_index).data()->base();
1171     const int startData_size = _ctx.at(starts_index).shape().num_elements();
1172     const int endData_size = _ctx.at(ends_index).shape().num_elements();
1173     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
1174
1175     using ir::DataType;
1176
1177     UNUSED_RELEASE(startData_size);
1178     UNUSED_RELEASE(endData_size);
1179     UNUSED_RELEASE(stridesData_size);
1180
1181     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
1182     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
1183     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
1184     assert(startData_size == input_rank);
1185     assert(endData_size == input_rank);
1186     assert(stridesData_size == input_rank);
1187
1188     assert(startData_base != nullptr);
1189     for (int n = 0; n < input_rank; ++n)
1190     {
1191       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
1192                                                                  backend_layout)
1193                     .value();
1194
1195       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
1196       starts[axis] = start_value;
1197
1198       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
1199       ends[axis] = end_value;
1200
1201       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
1202       strides[axis] = strides_value;
1203     }
1204   }
1205
1206   // Remap the begin/end/shrink-axis mask bits to the axis order of inputData
1207   // FIXME Take the layouts into account.
1208   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
1209   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
1210   const auto shrink_axis_mask =
1211     acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
1212
1213   ::arm_compute::Coordinates starts_set;
1214   ::arm_compute::Coordinates ends_set;
1215   ::arm_compute::BiStrides strides_set;
1216
1217   for (size_t i = 0; i < starts.size(); ++i)
1218   {
1219     starts_set.set(i, starts[i]);
1220     ends_set.set(i, ends[i]);
1221     strides_set.set(i, strides[i]);
1222   }
1223
1224   // Disable applied dim_correction
1225   if (static_cast<size_t>(inputData_tensor->getShape().rank()) !=
1226       inputData_tensor->info()->num_dimensions())
1227   {
1228     // The higher dimensions are 1 and dim_correction has been applied to the input tensor
1229     acl_common::disableDimCorrection(inputData_tensor);
1230   }
1231
1232   auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
1233     inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
1234     begin_mask, end_mask, shrink_axis_mask);
1235
1236   // Revert disabling applied dim_correction
1237   if (inputData_tensor->getShape().dim(0) == 1)
1238   {
1239     acl_common::enableDimCorrection(inputData_tensor);
1240   }
1241
1242   _return_fn = asAclFunction(std::move(fn));
1243 }
1244
1245 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1246 {
1247   const auto ofm_index{node.getOutputs().at(0)};
1248   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1249   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1250
1251   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
1252   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
1253   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);
1254
1255   const auto stride = node.param().stride;
1256
1257   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1258          (node.param().padding.type == ir::PaddingType::VALID));
1259   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1260                                       ker_shape.W, ker_shape.H);
1261
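       // For VALID padding, compute the right/bottom output border that the transposed convolution does not cover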
1262   uint32_t invalid_horizontal = 0;
1263   uint32_t invalid_vertical = 0;
1264   if (node.param().padding.type == ir::PaddingType::VALID)
1265   {
1266     invalid_horizontal =
1267       ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1268     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1269   }
1270
1271   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1272   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1273   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1274
1275   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1276
1277   auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
1278     ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
1279     invalid_horizontal, invalid_vertical);
1280
1281   _return_fn = asAclFunction(std::move(fn));
1282 }
1283
1284 void KernelGenerator::visit(const ir::operation::Transpose &node)
1285 {
1286   const auto ofm_idx{node.getOutputs().at(0)};
1287   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
1288   const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
1289
1290   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
1291   const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
1292   const auto frontend_layout = _current_layout;
1293   const auto backend_layout = ifm_tensor->layout();
1294   const auto rank = _ctx.at(ifm_idx).shape().rank();
1295
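       // An empty permutation operand means the default transpose: reverse the dimension order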
1296   const auto &perms = _ctx.at(perm_idx);
1297   std::vector<int32_t> pv;
1298   if (perms.shape() == ir::Shape{0})
1299   {
1300     pv.resize(rank);
1301     std::iota(pv.begin(), pv.end(), 0);
1302     std::reverse(pv.begin(), pv.end());
1303   }
1304   else
1305   {
1306     pv = _ctx.at(perm_idx).asVector<int32_t>();
1307   }
1308
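       // rank 1 is a plain copy, rank 2 is a 2-D transpose, higher ranks use a generic permute with a layout-aware permutation vector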
1309   std::unique_ptr<arm_compute::IFunction> fn;
1310   if (rank == 1)
1311   {
1312     fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
1313   }
1314   else if (rank == 2)
1315   {
1316     assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
1317     fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
1318                                                              ofm_tensor->handle());
1319   }
1320   else
1321   {
1322     auto backend_pv =
1323       acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
1324
1325     fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
1326                                                            ofm_tensor->handle(), backend_pv);
1327   }
1328   _return_fn = asAclFunction(std::move(fn));
1329 }
1330
1331 void KernelGenerator::visit(const ir::operation::Unpack &node)
1332 {
1333   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1334   auto axis{node.param().axis};
1335
1336   const auto input_rank = _ctx.at(input_index).shape().rank();
1337
1338   std::vector<ir::OperandIndex> output_indexes;
1339   for (const auto &output_index : node.getOutputs())
1340     output_indexes.emplace_back(output_index);
1341
1342   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1343   std::vector<arm_compute::ITensor *> outputs;
1344   for (const auto &output_index : output_indexes)
1345     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1346
1347   const auto frontend_layout = _current_layout;
1348   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
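       // Normalize a negative axis, then convert it to the corresponding ACL (backend) axis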
1349   if (axis < 0)
1350     axis += input_rank;
1351   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1352
1353   // Disable dim_correction if it has been applied to the input tensor
1354   if (static_cast<size_t>(input_tensor->getShape().rank()) !=
1355       input_tensor->info()->num_dimensions())
1356   {
1357     // The higher dimensions are 1 and dim_correction has been applied to the input tensor
1358     acl_common::disableDimCorrection(input_tensor);
1359   }
1360
1361   auto fn =
1362     acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
1363
1364   // Re-enable the dim_correction that was disabled above
1365   if (input_tensor->getShape().dim(0) == 1)
1366   {
1367     acl_common::enableDimCorrection(input_tensor);
1368   }
1369
1370   _return_fn = asAclFunction(std::move(fn));
1371 }
1372
1373 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
1374 {
1375   const auto output_index{node.getOutputs().at(0)};
1376   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
1377
1378   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1379   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1380
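       // ExpandDims only inserts a dimension of size 1, so it is lowered to a plain reshape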
1381   auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
1382                                                                    output_tensor->handle());
1383
1384   _return_fn = asAclFunction(std::move(fn));
1385 }
1386
1387 void KernelGenerator::visit(const ir::operation::Comparison &node)
1388 {
1389   const auto output_index{node.getOutputs().at(0)};
1390   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
1391   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
1392
1393   const auto comparison_type = node.param().comparison_type;
1394
1395   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1396   auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
1397   auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
1398
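       // The IR comparison type is cast directly to the ACL enum; the enum values are assumed to correspond one-to-one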
1399   auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
1400     input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
1401     (arm_compute::ComparisonOperation)comparison_type);
1402
1403   _return_fn = asAclFunction(std::move(fn));
1404 }
1405
1406 void KernelGenerator::visit(const ir::operation::OneHot &node)
1407 {
1408   const auto out_idx{node.getOutputs().at(0)};
1409   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
1410   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
1411   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
1412   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
1413
1414   auto output_tensor = _tensor_reg->getAclTensor(out_idx);
1415   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
1416   auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
1417   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
1418   auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
1419
1420   const size_t output_rank = _ctx.at(out_idx).shape().rank();
1421   const auto frontend_layout = _current_layout;
1422   const auto backend_layout = output_tensor->layout();
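       // axis == -1 selects the innermost dimension of the output before converting to the ACL axis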
1423   int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
1424   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
1425
1426   auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
1427     indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
1428     offvalue_tensor->handle(), output_tensor->handle(), axis);
1429   _return_fn = asAclFunction(std::move(fn));
1430 }
1431
1432 } // namespace acl_neon
1433 } // namespace backend
1434 } // namespace onert