1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "exec/FunctionSequence.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "AclKernelGen.h"
35
36 namespace onert
37 {
38 namespace backend
39 {
40 namespace acl_cl
41 {
42
43 using ::onert::backend::acl_common::asAclFunction;
44 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
45   ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
46
47 KernelGenerator::KernelGenerator(
48   const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
49   const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
50   : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()),
51     _operations_ctx(graph.operations()), _current_layout{graph.layout()},
52     _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
53 {
54   // DO NOTHING
55 }
56
57 std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
58 {
59   auto ret = std::make_unique<exec::FunctionSequence>();
60   ret->enableDynamicShapeInferer(false);
61
62   const auto &op = _graph.operations().at(ind);
63   op.accept(*this);
64   ret->append(releaseFunction());
65   return ret;
66 }
67
68 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
69 {
70   const auto ofm_index{node.getOutputs().at(0)};
71   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
72   const auto block_size_index{
73     node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
74
75   const auto NNApiInputs = 2;
76   if (node.getInputs().size() != NNApiInputs)
77   {
78     const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
79     if (!_ctx.at(crops_index).isConstant())
80     {
81       throw std::runtime_error("Non-constant crops NYI for acl_cl backend BatchToSpaceND");
82     }
83
84     auto crops = _ctx.at(crops_index).asVector<int32_t>();
85     for (auto crop : crops)
86     {
87       if (crop != 0)
88       {
89         throw std::runtime_error("Non-zero crops NYI for acl_cl backend BatchToSpaceND");
90       }
91     }
92   }
93
94   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
95   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
96   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
97
98   assert(_ctx.at(block_size_index).data());
99
100   auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
101     ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
102
103   _return_fn = asAclFunction(std::move(fn));
104 }
105
106 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
107 {
108   const auto ofm_index{node.getOutputs().at(0)};
109   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
110   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
111
112   const auto activation = node.param().activation;
113
114   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
115   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
116   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
117
118   const auto act_info = acl_common::asActivationLayerInfo(activation);
119
120   std::unique_ptr<arm_compute::IFunction> fn;
121   switch (node.param().arithmetic_type)
122   {
123     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
124     {
125       fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
126         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
127         arm_compute::ConvertPolicy::SATURATE, act_info);
128       break;
129     }
130     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
131     {
132       fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
133         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
134         arm_compute::ConvertPolicy::SATURATE, act_info);
135       break;
136     }
137     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
138     {
139       fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
140         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
141         arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
142         act_info);
143       break;
144     }
145     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
146     {
147       fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
148         lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
149       break;
150     }
151     default:
152       assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
153       break;
154   }
155
156   _return_fn = asAclFunction(std::move(fn));
157 }
158
159 void KernelGenerator::visit(const ir::operation::Conv2D &node)
160 {
161   using ir::operation::Conv2D;
162
163   const auto ofm_index{node.getOutputs().at(0)};
164   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
165   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
166   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
167
168   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
169   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
170   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
171   const auto &ker_shape = _ctx.at(ker_index).shape();
172   const auto ker_height = ker_shape.dim(1);
173   const auto ker_width = ker_shape.dim(2);
174
175   const auto stride = node.param().stride;
176   const auto padding =
177     ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
178   const auto activation = node.param().activation;
179
180   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
181   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
182   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
183   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
184
185   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
186   const auto act_info = acl_common::asActivationLayerInfo(activation);
187
188   auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
189     _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
190     ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
191     ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
192
193   _return_fn = asAclFunction(std::move(fn));
194 }
195
196 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
197 {
198   using ir::operation::DepthwiseConv2D;
199
200   const auto ofm_index{node.getOutputs().at(0)};
201   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
202   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
203   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
204
205   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
206   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
207   // Kernel format is [1, kernel_height, kernel_width, depth_out].
208   const auto &ker_shape = _ctx.at(ker_index).shape();
209   const auto ker_height = ker_shape.dim(1);
210   const auto ker_width = ker_shape.dim(2);
211
212   const auto stride = node.param().stride;
213   const auto dilation = node.param().dilation;
214   const auto padding =
215     ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
216                          dilation.width_factor, dilation.height_factor);
217   const auto multiplier = node.param().multiplier;
218   const auto activation = node.param().activation;
219
220   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
221   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
222   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
223   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
224
225   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
226   const auto act_info = acl_common::asActivationLayerInfo(activation);
227   const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
228
229   auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
230     ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
231     conv_info, multiplier, act_info, dilation_info);
232
233   _return_fn = asAclFunction(std::move(fn));
234 }
235
236 void KernelGenerator::visit(const ir::operation::Concat &node)
237 {
238   const auto ofm_index{node.getOutputs().at(0)};
239
240   std::vector<ir::OperandIndex> input_indexes;
241
242   for (const auto &input : node.getInputs())
243     input_indexes.emplace_back(input);
244
245   const auto axis = node.param().axis;
246
247   // Concat elimination check
248   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
249   if (eliminated)
250   {
251     // If the concat is eliminated, return a NOP IFunction
252     VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
253     _return_fn = std::make_unique<exec::NopFunction>();
254     return;
255   }
256
257   auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
258   std::vector<const ::arm_compute::ICLTensor *> input_tensors;
259   for (const auto &ifm_ind : input_indexes)
260     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
261
262   std::unique_ptr<::arm_compute::IFunction> fn;
263   if (input_indexes.size() < 2)
264   {
265     ::arm_compute::ICLTensor *input_tensor =
266       _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
267
268     fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor, output_tensor->handle());
269   }
270   else
271   {
272     const auto rank = _ctx.at(ofm_index).shape().rank();
273     const auto frontend_layout = _current_layout;
274     const auto backend_layout = output_tensor->layout();
275     const auto fixed_axis =
276       acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
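    // Illustrative note (an explanatory sketch, not part of the original code): ACL indexes
    // dimensions from the innermost one, so when the frontend and backend layouts match,
    // ToARMComputeAxis reduces to a simple reversal:
    //   int to_acl_axis(int axis, int rank) { return rank - 1 - axis; }  // e.g. axis 1, rank 4 -> 2
    // When the layouts differ (e.g. NHWC frontend vs. NCHW backend), an additional permutation is
    // applied on top of this reversal.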
277     fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
278       input_tensors, output_tensor->handle(), fixed_axis);
279   }
280
281   _return_fn = asAclFunction(std::move(fn));
282 }
283
284 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
285 {
286   const auto output_index{node.getOutputs().at(0)};
287   auto output_tensor = _tensor_reg->getAclTensor(output_index);
288   const auto activation = node.param().activation;
289   if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
290     throw std::runtime_error(
291       "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");
292
293   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
294                                                 ::arm_compute::CLFullyConnectedReshapingLayer>(
295     node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
296   _return_fn = std::make_unique<exec::FunctionSequence>(
297     std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
298 }
299
300 void KernelGenerator::visit(const ir::operation::Reduce &node)
301 {
302   const auto output_index{node.getOutputs().at(0)};
303   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
304   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
305   const auto keep_dims{node.param().keep_dims};
306   const auto reduce_type = node.param().reduce_type;
307
308   auto output_tensor = _tensor_reg->getAclTensor(output_index);
309   auto input_tensor = _tensor_reg->getAclTensor(input_index);
310
311   // Convert to ACL axes taking into account negative values and possible duplicates.
312   const auto &axes = _ctx.at(axes_index);
313   const auto input_rank = _ctx.at(input_index).shape().rank();
314   const auto frontend_layout = _current_layout;
315   const auto backend_layout = input_tensor->layout();
316
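  // Example of the conversion mentioned above (illustrative, assuming matching layouts): with
  // input_rank == 4, a frontend axes operand of {-1, 2} is first wrapped to {3, 2} and then
  // reversed to the ACL coordinates {0, 1}; asCoordinates/asSet are also expected to drop any
  // duplicate axes.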
317   std::unique_ptr<arm_compute::IFunction> fn;
318   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
319   {
320     const auto acl_axes =
321       acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
322     fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
323                                                               keep_dims, output_tensor->handle());
324   }
325   else
326   {
327     const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
328
329     fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
330       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
331       output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
332   }
333
334   _return_fn = asAclFunction(std::move(fn));
335 }
336
337 void KernelGenerator::visit(const ir::operation::Reshape &node)
338 {
339   const auto output_index{node.getOutputs().at(0)};
340   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
341
342   auto output_tensor = _tensor_reg->getAclTensor(output_index);
343   auto input_tensor = _tensor_reg->getAclTensor(input_index);
344
345   // NOTE This operation must not change the layout from frontend to backend,
346   //      so PermutationOperationPass makes the frontend and backend layouts the same.
347   const auto frontend_layout = _current_layout;
348   const auto backend_layout = output_tensor->layout();
349   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
350          frontend_layout == backend_layout);
351   UNUSED_RELEASE(frontend_layout);
352   UNUSED_RELEASE(backend_layout);
353
354   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
355                                                                    output_tensor->handle());
356
357   _return_fn = asAclFunction(std::move(fn));
358 }
359
360 void KernelGenerator::visit(const ir::operation::Squeeze &node)
361 {
362   // Squeeze is identical to Reshape except that it has an optional dimensions input.
363   // The optional dims input is ignored here since the output tensor already has the squeezed
364   // shape produced by the freezer and toco.
365   // TODO Support multi-layout for frontend and backend
366   const auto output_index{node.getOutputs().at(0)};
367   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
368   const auto dims{node.param().dims};
369   const auto ndim{node.param().ndim};
370   (void)dims;
371   (void)ndim;
372
373   auto output_tensor = _tensor_reg->getAclTensor(output_index);
374   auto input_tensor = _tensor_reg->getAclTensor(input_index);
375   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
376                                                                    output_tensor->handle());
377   _return_fn = asAclFunction(std::move(fn));
378 }
379
380 void KernelGenerator::visit(const ir::operation::Softmax &node)
381 {
382   const auto output_index{node.getOutputs().at(0)};
383   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
384
385   const auto beta = node.param().beta;
386
387   auto output_tensor = _tensor_reg->getAclTensor(output_index);
388   auto input_tensor = _tensor_reg->getAclTensor(input_index);
389
390   auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
391     _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
392     output_tensor->handle(), beta);
393
394   _return_fn = asAclFunction(std::move(fn));
395 }
396
397 void KernelGenerator::visit(const ir::operation::Slice &node)
398 {
399   const auto output_index{node.getOutputs().at(0)};
400   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
401   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
402   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
403
404   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
405   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
406   const auto frontend_layout = _current_layout;
407   const auto backend_layout = inputData_tensor->layout();
408
409   // Build per-axis start/end coordinates, reordered to match the dimension order of inputData
410   int input_rank = _ctx.at(input_index).shape().rank();
411   std::vector<int32_t> starts;
412   std::vector<int32_t> ends;
413   starts.resize(input_rank, 0);
414   ends.resize(input_rank, 0);
415   {
416     assert(_ctx.at(begins_index).data());
417     assert(_ctx.at(sizes_index).data());
418     auto beginData_base = _ctx.at(begins_index).data()->base();
419     auto sizeData_base = _ctx.at(sizes_index).data()->base();
420     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
421     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
422
423     using ir::DataType;
424
425     UNUSED_RELEASE(beginData_size);
426     UNUSED_RELEASE(sizeData_size);
427
428     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
429     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
430     assert(beginData_size == input_rank);
431     assert(sizeData_size == input_rank);
432
433     assert(beginData_base != nullptr);
434     for (int n = 0; n < input_rank; ++n)
435     {
436       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
437                                                                  backend_layout)
438                     .value();
439
440       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
441       starts[axis] = begin_value;
442
443       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
444       ends[axis] = begin_value + size_value;
445     }
446   }
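  // Worked example (illustrative, assuming the frontend and backend layouts match so that axis n
  // maps to input_rank - 1 - n): for input_rank == 3, begins == {1, 0, 2} and sizes == {2, 3, 4},
  // the loop above fills
  //   starts == {2, 0, 1} and ends == {2 + 4, 0 + 3, 1 + 2} == {6, 3, 3},
  // i.e. per-ACL-dimension start coordinates and exclusive end coordinates (begin + size).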
447
448   ::arm_compute::Coordinates starts_set;
449   ::arm_compute::Coordinates ends_set;
450
451   for (size_t i = 0; i < starts.size(); ++i)
452   {
453     starts_set.set(i, starts[i]);
454     ends_set.set(i, ends[i]);
455   }
456
457   auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
458     inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
459
460   _return_fn = asAclFunction(std::move(fn));
461 }
462
463 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
464 {
465   const auto output_index{node.getOutputs().at(0)};
466   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
467   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
468   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
469   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
470
471   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
472   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
473   const auto frontend_layout = _current_layout;
474   const auto backend_layout = inputData_tensor->layout();
475
476   // Build per-axis start/end/stride values, reordered to match the dimension order of inputData
477   int input_rank = _ctx.at(input_index).shape().rank();
478   std::vector<int32_t> starts;
479   std::vector<int32_t> ends;
480   std::vector<int32_t> strides;
481   starts.resize(input_rank, 0);
482   ends.resize(input_rank, 0);
483   strides.resize(input_rank, 0);
484   {
485     assert(_ctx.at(starts_index).data());
486     assert(_ctx.at(ends_index).data());
487     assert(_ctx.at(strides_index).data());
488     auto startData_base = _ctx.at(starts_index).data()->base();
489     auto endData_base = _ctx.at(ends_index).data()->base();
490     auto stridesData_base = _ctx.at(strides_index).data()->base();
491     const int startData_size = _ctx.at(starts_index).shape().num_elements();
492     const int endData_size = _ctx.at(ends_index).shape().num_elements();
493     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
494
495     using ir::DataType;
496
497     UNUSED_RELEASE(startData_size);
498     UNUSED_RELEASE(endData_size);
499     UNUSED_RELEASE(stridesData_size);
500
501     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
502     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
503     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
504     assert(startData_size == input_rank);
505     assert(endData_size == input_rank);
506     assert(stridesData_size == input_rank);
507
508     assert(startData_base != nullptr);
509     for (int n = 0; n < input_rank; ++n)
510     {
511       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
512                                                                  backend_layout)
513                     .value();
514
515       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
516       starts[axis] = start_value;
517
518       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
519       ends[axis] = end_value;
520
521       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
522       strides[axis] = strides_value;
523     }
524   }
525
526   // Reorder the mask bits to match the dimension order of inputData
527   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
528                                                            frontend_layout, backend_layout);
529   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
530                                                          frontend_layout, backend_layout);
531   const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
532     node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
533
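  // Illustrative example: ReorderBits applies the same axis remapping to each mask bit. Assuming
  // matching layouts and input_rank == 3 (frontend axis n -> ACL dimension 2 - n), a frontend
  // begin_mask of 0b001 (axis 0 unbounded) becomes 0b100 (ACL dimension 2).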
534   ::arm_compute::Coordinates starts_set;
535   ::arm_compute::Coordinates ends_set;
536   ::arm_compute::BiStrides strides_set;
537
538   for (size_t i = 0; i < starts.size(); ++i)
539   {
540     starts_set.set(i, starts[i]);
541     ends_set.set(i, ends[i]);
542     strides_set.set(i, strides[i]);
543   }
544
545   // Disable applied dim_correction
546   if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
547   {
548     // The higher dimensions are of size 1 and dim_correction has been applied to the input tensor
549     acl_common::disableDimCorrection(inputData_tensor);
550   }
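  // Explanatory sketch of dim_correction (an assumption about the intent, not original text):
  // ACL's TensorShape can trim trailing size-1 dimensions, so a tensor registered with rank 4 and
  // frontend shape {1, 1, 3, 2} may report info()->num_dimensions() == 2. The strided-slice
  // coordinates above address all input_rank dimensions, so the correction is switched off while
  // the layer is configured and re-enabled right after.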
551
552   auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
553     inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
554     begin_mask, end_mask, shrink_axis_mask);
555
556   // Revert disabling applied dim_correction
557   if (inputData_tensor->dimension(0) == 1)
558   {
559     acl_common::enableDimCorrection(inputData_tensor);
560   }
561
562   _return_fn = asAclFunction(std::move(fn));
563 }
564
565 void KernelGenerator::visit(const ir::operation::Transpose &node)
566 {
567   const auto ofm_idx{node.getOutputs().at(0)};
568   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
569   const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
570
571   const auto rank = _ctx.at(ifm_idx).shape().rank();
572
573   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
574   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
575   const auto frontend_layout = _current_layout;
576   const auto backend_layout = ifm_tensor->layout();
577
578   const auto &perms = _ctx.at(perm_idx);
579   std::vector<int32_t> pv;
580   if (perms.shape() == ir::Shape{0})
581   {
582     pv.resize(rank);
583     std::iota(pv.begin(), pv.end(), 0);
584     std::reverse(pv.begin(), pv.end());
585   }
586   else
587   {
588     pv = _ctx.at(perm_idx).asVector<int32_t>();
589   }
590
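  // Illustrative note: an empty PERMUTATION operand (shape {0}) is treated as "reverse all axes",
  // which matches the frontend's default Transpose behaviour; e.g. for rank == 3, pv becomes
  // {2, 1, 0}.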
591   std::unique_ptr<arm_compute::IFunction> fn;
592   if (rank == 1)
593   {
594     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
595   }
596   else if (rank == 2)
597   {
598     assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
599     fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(),
600                                                              ofm_tensor->handle());
601   }
602   else
603   {
604     auto backend_pv =
605       acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
606
607     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
608                                                            ofm_tensor->handle(), backend_pv);
609   }
610
611   _return_fn = asAclFunction(std::move(fn));
612 }
613
614 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
615 {
616   const auto ofm_index{node.getOutputs().at(0)};
617   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
618
619   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
620   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
621
622   const ::arm_compute::ActivationLayerInfo act_info =
623     acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);
624
625   auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
626     ifm_tensor->handle(), ofm_tensor->handle(), act_info);
627
628   _return_fn = asAclFunction(std::move(fn));
629 }
630
631 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
632 {
633   const auto output_index{node.getOutputs().at(0)};
634   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
635   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
636
637   auto output_tensor = _tensor_reg->getAclTensor(output_index);
638   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
639   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
640
641   std::unique_ptr<arm_compute::IFunction> fn;
642   switch (node.param().op_type)
643   {
644     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
645     {
646       fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
647         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
648         arm_compute::BinaryLogicalOperation::AND);
649       break;
650     }
651     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
652     {
653       fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
654         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
655       break;
656     }
657     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
658     {
659       fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
660         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
661       break;
662     }
663     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
664     {
665       fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
666         lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
667       break;
668     }
669     default:
670     {
671       std::string err_msg("acl_cl KernelGenerator : " + node.name() +
672                           " is not an elementwise-binary operation");
673       assert(false && err_msg.c_str());
674       break;
675     }
676   }
677
678   _return_fn = asAclFunction(std::move(fn));
679 }
680
681 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
682 {
683   const auto output_index{node.getOutputs().at(0)};
684   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
685
686   auto output_tensor = _tensor_reg->getAclTensor(output_index);
687   auto input_tensor = _tensor_reg->getAclTensor(input_index);
688
689   std::unique_ptr<arm_compute::IFunction> fn;
690   switch (node.param().op_type)
691   {
692     case ir::operation::ElementwiseUnary::Type::ABS:
693     {
694       const ::arm_compute::ActivationLayerInfo act_info{
695         ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
696
697       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
698         input_tensor->handle(), output_tensor->handle(), act_info);
699       break;
700     }
701     case ir::operation::ElementwiseUnary::Type::CAST:
702     {
703       if (input_tensor->data_type() == output_tensor->data_type())
704       {
705         fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
706                                                             output_tensor->handle());
707       }
708       else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
709       {
710         fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(),
711                                                                 output_tensor->handle());
712       }
713       else
714       {
715         // TODO Support converting float to int32 as round down
716         fn = acl_common::generateLayer<arm_compute::CLCast>(
717           input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
718       }
719       break;
720     }
721     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
722     {
723       fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
724                                                                          output_tensor->handle());
725       break;
726     }
727     case ir::operation::ElementwiseUnary::Type::EXP:
728     {
729       fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
730                                                               output_tensor->handle());
731       break;
732     }
733     case ir::operation::ElementwiseUnary::Type::FLOOR:
734     {
735       fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
736                                                            output_tensor->handle());
737       break;
738     }
739     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
740     {
741       fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
742                                                                 output_tensor->handle());
743       break;
744     }
745     case ir::operation::ElementwiseUnary::Type::NEG:
746     {
747       fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
748                                                          output_tensor->handle());
749       break;
750     }
751     case ir::operation::ElementwiseUnary::Type::RSQRT:
752     {
753       fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
754                                                                 output_tensor->handle());
755       break;
756     }
757     case ir::operation::ElementwiseUnary::Type::SQRT:
758     {
759       const ::arm_compute::ActivationLayerInfo act_info{
760         ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
761
762       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
763         input_tensor->handle(), output_tensor->handle(), act_info);
764       break;
765     }
766     default:
767     {
768       throw std::runtime_error("acl_cl KernelGenerator : " + node.name() + " is not supported yet");
769       break;
770     }
771   }
772
773   auto acl_fn = asAclFunction(std::move(fn));
774
775   _return_fn = std::move(acl_fn);
776 }
777
778 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
779 {
780   const auto output_index{node.getOutputs().at(0)};
781   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
782
783   auto output_tensor = _tensor_reg->getAclTensor(output_index);
784   auto input_tensor = _tensor_reg->getAclTensor(input_index);
785
786   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
787                                                                    output_tensor->handle());
788
789   _return_fn = asAclFunction(std::move(fn));
790 }
791
792 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
793 {
794   const auto ofm_index{node.getOutputs().at(0)};
795   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
796   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
797   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
798
799   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
800   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
801   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
802   auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
803   auto epsilon = node.param().epsilon;
804   auto activation = node.param().activation;
805
806   auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
807     ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
808     epsilon);
809
810   _return_fn = std::make_unique<exec::FunctionSequence>(
811     asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
812 }
813
814 void KernelGenerator::visit(const ir::operation::LSTM &node)
815 {
816   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
817                                          ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
818 }
819
820 void KernelGenerator::visit(const ir::operation::Comparison &node)
821 {
822   const auto output_index{node.getOutputs().at(0)};
823   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
824   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
825
826   const auto comparison_type = node.param().comparison_type;
827
828   auto output_tensor = _tensor_reg->getAclTensor(output_index);
829   auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
830   auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
831
832   auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
833     input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
834     (arm_compute::ComparisonOperation)comparison_type);
835
836   _return_fn = asAclFunction(std::move(fn));
837 }
838
839 void KernelGenerator::visit(const ir::operation::OneHot &node)
840 {
841   const auto output_idx{node.getOutputs().at(0)};
842   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
843   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
844   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
845   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
846   const auto depth = _ctx.at(depth_idx).asScalar<int32_t>();
847   assert(depth > 0);
848
849   auto output_tensor = _tensor_reg->getAclTensor(output_idx);
850   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
851   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
852
853   const size_t output_rank = _ctx.at(output_idx).shape().rank();
854   const auto frontend_layout = _current_layout;
855   const auto backend_layout = output_tensor->layout();
856   int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
857   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
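  // Worked example (assuming matching layouts): with output_rank == 3 and node.param().axis == -1,
  // the frontend axis resolves to 2 and ToARMComputeAxis maps it to ACL dimension 0, i.e. the
  // innermost dimension of the backend tensor.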
858
859   if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
860   {
861     // The higher dimensions are of size 1 and dim_correction has been applied to output_tensor
862     acl_common::disableDimCorrection(output_tensor);
863   }
864
865   std::unique_ptr<::arm_compute::IFunction> fn;
866   const auto &offvalue = _ctx.at(offvalue_idx);
867   if (offvalue.isConstant())
868   {
869     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
870       indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
871       acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
872   }
873   else
874   {
875     auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
876     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
877       indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
878       output_tensor->handle(), static_cast<uint32_t>(depth), axis);
879   }
880
881   if (output_tensor->dimension(0) == 1)
882   {
883     acl_common::enableDimCorrection(output_tensor);
884   }
885
886   _return_fn = asAclFunction(std::move(fn));
887 }
888
889 void KernelGenerator::visit(const ir::operation::Pack &node)
890 {
891   const auto output_index{node.getOutputs().at(0)};
892   auto axis{node.param().axis};
893
894   const auto output_rank = _ctx.at(output_index).shape().rank();
895
896   std::vector<ir::OperandIndex> input_indexes;
897   for (const auto &input_index : node.getInputs())
898     input_indexes.emplace_back(input_index);
899
900   auto output = _tensor_reg->getAclTensor(output_index)->handle();
901   std::vector<arm_compute::ICLTensor *> inputs;
902   for (const auto &input_index : input_indexes)
903     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
904
905   const auto frontend_layout = _current_layout;
906   const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
907
908   if (axis < 0)
909     axis += output_rank;
910   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
911
912   // Disable applied dim_correction
913   for (const auto &input_index : input_indexes)
914   {
915     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
916     if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
917     {
918       // The higher dimensions are of size 1 and dim_correction has been applied to the input tensor
919       acl_common::disableDimCorrection(input_tensor);
920     }
921   }
922
923   auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);
924
925   // Revert disabling applied dim_correction
926   for (const auto &input_index : input_indexes)
927   {
928     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
929     if (input_tensor->dimension(0) == 1)
930     {
931       acl_common::enableDimCorrection(input_tensor);
932     }
933   }
934
935   _return_fn = asAclFunction(std::move(fn));
936 }
937
938 void KernelGenerator::visit(const ir::operation::Pool2D &node)
939 {
940   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
941     node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
942
943   const auto ofm_index{node.getOutputs().at(0)};
944   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
945   const auto activation = node.param().activation;
946   _return_fn = std::make_unique<exec::FunctionSequence>(
947     asAclFunction(std::move(raw_fn)),
948     ActivationBuilder::generate(activation, ofm_tensor->handle()));
949 }
950
951 void KernelGenerator::visit(const ir::operation::Permute &node)
952 {
953   const auto ofm_idx{node.getOutputs().at(0)};
954   const auto ifm_idx{node.getInputs().at(0)};
955   const auto permute_type = node.getPermuteType();
956   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
957   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
958   const auto rank = _ctx.at(ofm_idx).shape().rank();
959   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
960
961   std::unique_ptr<::arm_compute::IFunction> fn;
962   arm_compute::PermutationVector pv;
963   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
964   {
965     // WHCN -> CWHN
966     pv = arm_compute::PermutationVector{2, 0, 1};
967
968     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
969                                                            ofm_tensor->handle(), pv);
970   }
971   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
972   {
973     // CWHN -> WHCN
974     pv = arm_compute::PermutationVector{1, 2, 0};
975
976     fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
977                                                              ofm_tensor->handle(), pv);
978   }
979   else
980   {
981     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
982   }
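  // Reading of the permutation vectors above (a sketch; assumes ACL's convention that output
  // dimension i takes input dimension pv[i]): in ACL's innermost-first ordering an NCHW tensor is
  // laid out as WHCN and an NHWC tensor as CWHN, so {2, 0, 1} rearranges WHCN into CWHN and
  // {1, 2, 0} is its inverse, matching the per-branch comments.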
983
984   _return_fn = asAclFunction(std::move(fn));
985 }
986
987 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
988 {
989   const auto ofm_index{node.getOutputs().at(0)};
990   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
991
992   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
993   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
994
995   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
996     ifm_tensor->handle(), ofm_tensor->handle(),
997     ::arm_compute::ScaleKernelInfo{
998       ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
999       ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});
1000
1001   _return_fn = asAclFunction(std::move(fn));
1002 }
1003
1004 void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
1005 {
1006   const auto ofm_index{node.getOutputs().at(0)};
1007   const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};
1008
1009   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1010   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1011
1012   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
1013     ifm_tensor->handle(), ofm_tensor->handle(),
1014     ::arm_compute::ScaleKernelInfo{
1015       ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
1016       ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});
1017
1018   _return_fn = asAclFunction(std::move(fn));
1019 }
1020
1021 void KernelGenerator::visit(const ir::operation::RNN &node)
1022 {
1023   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
1024   const auto hidden_state_out_index{
1025     node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
1026
1027   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
1028   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
1029   const auto recurrent_weights_index{
1030     node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
1031   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
1032   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
1033
1034   const auto activation = node.param().activation;
1035
1036   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1037   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
1038
1039   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1040   auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
1041   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
1042   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
1043   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
1044   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
1045
1046   auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
1047     hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
1048   _return_fn = asAclFunction(std::move(copy_layer));
1049
1050   auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
1051     _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
1052     weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
1053     hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
1054   _return_fn = asAclFunction(std::move(fn));
1055 }
1056
1057 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
1058 {
1059   const auto ofm_index{node.getOutputs().at(0)};
1060   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
1061   const auto block_size_index{
1062     node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
1063   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
1064
1065   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1066   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1067   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
1068   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
1069
1070   assert(_ctx.at(block_size_index).data());
1071   assert(_ctx.at(paddings_index).data());
1072
1073   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
1074     ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1075     ofm_tensor->handle());
1076
1077   _return_fn = asAclFunction(std::move(fn));
1078 }
1079
1080 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1081 {
1082   const auto ofm_index{node.getOutputs().at(0)};
1083   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1084
1085   auto block_size = node.param().block_size;
1086
1087   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1088   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1089
1090   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
1091     ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1092
1093   _return_fn = asAclFunction(std::move(fn));
1094 }
1095
1096 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
1097 {
1098   const auto output_index{node.getOutputs().at(0)};
1099   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
1100   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
1101
1102   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1103   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1104   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1105
1106   auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
1107     values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
1108
1109   _return_fn = asAclFunction(std::move(fn));
1110 }
1111
1112 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
1113 {
1114   const auto ofm_index{node.getOutputs().at(0)};
1115   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
1116
1117   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
1118   // L2 Normalization always performs the reduction along the depth axis
1119   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
1120   // choosing normalization parameters as below
1121
1122   const auto &ifm_shape = _ctx.at(ifm_index).shape();
1123   // TODO Support optional constant dimension that normalization would be performed on
1124   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
1125   int32_t radius =
1126     2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
1127   float alpha = 1.0f;                          // So that alpha_ inside the implementation becomes 1
1128   float beta = 0.5f;                           // pow(reduction, -0.5) = 1 / sqrt(reduction)
1129   float bias = 0.0f;                           // Don't offset the reduction.
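  // Worked equation (illustrative): cross-map LRN computes out = in / (bias + alpha * sum(in^2))^beta
  // over a window of normSize channels (alpha is used unscaled because the last constructor
  // argument below, is_scaled, is false). With bias = 0, alpha = 1, beta = 0.5 and a window wide
  // enough to cover every channel, this reduces to out = in / sqrt(sum_over_depth(in^2)),
  // i.e. the depth-wise L2 normalization we want.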
1130
1131   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1132   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1133
1134   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
1135                                                                radius, alpha, beta, bias, false);
1136
1137   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1138     ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1139
1140   _return_fn = asAclFunction(std::move(fn));
1141 }
1142
1143 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
1144 {
1145   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
1146   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
1147
1148   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
1149   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
1150   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
1151
1152   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1153   auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
1154
1155   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1156   auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
1157   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1158
1159   auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
1160     lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
1161     output_tensor->handle(), hits_tensor->handle());
1162
1163   _return_fn = asAclFunction(std::move(fn));
1164 }
1165
1166 void KernelGenerator::visit(const ir::operation::PReLU &node)
1167 {
1168   const auto ofm_index{node.getOutputs().at(0)};
1169   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
1170   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
1171
1172   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1173   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1174   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
1175
1176   auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
1177     ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
1178
1179   _return_fn = asAclFunction(std::move(fn));
1180 }
1181
1182 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1183 {
1184   const auto ofm_index{node.getOutputs().at(0)};
1185   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1186   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1187
1188   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
1189   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
1190   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);
1191
1192   const auto stride = node.param().stride;
1193
1194   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1195          (node.param().padding.type == ir::PaddingType::VALID));
1196   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1197                                       ker_shape.W, ker_shape.H);
1198   uint32_t invalid_horizontal = 0;
1199   uint32_t invalid_vertical = 0;
1200   if (node.param().padding.type == ir::PaddingType::VALID)
1201   {
1202     invalid_horizontal =
1203       ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1204     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1205   }
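  // Worked example of the VALID-padding correction above (illustrative numbers): with
  // stride.horizontal == 2, ifm_shape.W == 5 and ker_shape.W == 3, the transposed convolution can
  // produce at most 1 + (5 - 1) * 2 + (3 - 1) == 11 columns, so for ofm_shape.W == 12 the
  // remaining invalid_horizontal == 12 - (1 + 4 * 2) - (3 - 1) == 1 column is reported to
  // CLTransposeConvLayer as an invalid border.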
1206
1207   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1208   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1209   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1210
1211   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1212
1213   auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
1214     _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
1215     ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
1216     invalid_vertical);
1217
1218   _return_fn = asAclFunction(std::move(fn));
1219 }
1220
1221 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1222 {
1223   const auto ofm_index{node.getOutputs().at(0)};
1224   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1225   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1226
1227   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1228   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1229   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1230
1231   auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
1232     lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1233
1234   _return_fn = asAclFunction(std::move(fn));
1235 }
1236
1237 void KernelGenerator::visit(const ir::operation::TopKV2 &node)
1238 {
1239   const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
1240   const auto outputIndices_index{
1241     node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};
1242
1243   const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};
1244
1245   // Currently, we only support 1-D or 2-D input.
1246   assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1247          _ctx.at(inputData_index).shape().rank() == 2);
1248
1249   const auto k = node.param().k;
1250
1251   auto values_tensor = _tensor_reg->getAclTensor(outputValues_index);
1252   auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index);
1253   auto input_tensor = _tensor_reg->getAclTensor(inputData_index);
1254
1255   auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
1256     input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
1257
1258   _return_fn = asAclFunction(std::move(fn));
1259 }
1260
1261 void KernelGenerator::visit(const ir::operation::Gather &node)
1262 {
1263   const auto ofm_index{node.getOutputs().at(0)};
1264
1265   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
1266   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
1267
1268   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1269   const auto axis_raw = node.param().axis;
1270   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
1271   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
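  // NOTE ToARMComputeAxis reverses the dimension order (ACL dimension 0 is the innermost),
  //      e.g. axis 0 of a rank-4 tensor maps to ACL dimension 3.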
1272
1273   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1274   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1275   auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
1276
1277   // NOTE The frontend layout and the backend layout must be the same for this operation.
1278   //      If they differ, we would have to add a stage to permute the output tensor, which
1279   //      would not be efficient even if it worked. In that case it would be better to give
1280   //      these backend tensors the same layout.
1281   //      One more thing to consider: this operation depends on the layout of the model. For
1282   //      example, if an NHWC model has this operation with output rank == 4, indices rank == 2
1283   //      and axis == 2, it should gather along the W and C axes, but W and C are not
1284   //      contiguous in NCHW. So an NCHW backend cannot handle this case.
1285   const auto backend_layout = ofm_tensor->layout();
1286   UNUSED_RELEASE(backend_layout);
1287   assert(backend_layout == ifm_tensor->layout());
1288   assert(backend_layout == indices_tensor->layout());
1289   assert(ifm_rank < 4 || _current_layout == backend_layout);
1290
1291   // input is n-D, indices k-D, output is (n + k - 1)-D
1292   size_t n = ifm_rank;
1293   assert(n == ifm_tensor->num_dimensions());
1294   size_t k = _ctx.at(indices_index).shape().rank();
1295   assert(k == indices_tensor->num_dimensions());
1296
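  // NOTE "dim_correction" means that ACL has trimmed trailing dimensions of size 1 from the
  //      tensor shape, so info()->num_dimensions() can be smaller than the IR rank. It is
  //      disabled below so the kernel sees the full rank and the gather axis stays valid.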
1297   // Disable applied dim_correction
1298   if (n != ifm_tensor->info()->num_dimensions())
1299   {
1300     // This means that the highest dimension's value is 1 and dim_correction was applied to the ifm tensor
1301     acl_common::disableDimCorrection(ifm_tensor);
1302   }
1303   if (k != indices_tensor->info()->num_dimensions())
1304   {
1305     // This means that the highest dimension's value is 1 and dim_correction was applied to the indices tensor
1306     acl_common::disableDimCorrection(indices_tensor);
1307   }
1308
1309   auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
1310     ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
1311
1312   // Revert disabling applied dim_correction
1313   if (ifm_tensor->dimension(0) == 1)
1314   {
1315     acl_common::enableDimCorrection(ifm_tensor);
1316   }
1317   if (indices_tensor->dimension(0) == 1)
1318   {
1319     acl_common::enableDimCorrection(indices_tensor);
1320   }
1321
1322   _return_fn = asAclFunction(std::move(fn));
1323 }
1324
1325 void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
1326 {
1327   const auto ofm_index{node.getOutputs().at(0)};
1328   const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
1329   const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};
1330
1331   auto ifm_shape = _ctx.at(ifm_index).shape();
1332   auto ofm_shape = _ctx.at(ofm_index).shape();
1333
1334   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1335
1336   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1337   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1338   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1339   auto frontend_layout = _current_layout;
1340   auto backend_layout = ifm_tensor->layout();
1341
1342   int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
1343   if (axis_value < 0)
1344   {
1345     axis_value += ifm_rank;
1346   }
1347
1348   auto acl_axis =
1349     acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
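  // NOTE Besides reversing the dimension order, the layout-aware overload of ToARMComputeAxis
  //      also swizzles the axis when the frontend and backend layouts differ (e.g. NHWC vs. NCHW).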
1350   auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
1351                                              : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
1352   auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
1353     ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);
1354
1355   _return_fn = asAclFunction(std::move(fn));
1356 }
1357
1358 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
1359 {
1360   const auto ofm_index{node.getOutputs().at(0)};
1361   const auto ifm_index{
1362     node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
1363
1364   auto radius = node.param().radius;
1365   auto alpha = node.param().alpha;
1366   auto beta = node.param().beta;
1367   auto bias = node.param().bias;
1368
1369   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1370   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1371
1372   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1373     ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
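  // NOTE NormalizationLayerInfo expects the full window size, hence radius * 2 + 1 for the LRN
  //      window [d - radius, d + radius]; the trailing 'false' keeps alpha unscaled by that size.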
1374
1375   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1376     ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1377
1378   _return_fn = asAclFunction(std::move(fn));
1379 }
1380
1381 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
1382 {
1383   const auto output_index{node.getOutputs().at(0)};
1384   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
1385
1386   auto block_size = node.param().block_size;
1387   assert(block_size > 0);
1388
1389   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1390   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1391
1392   auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
1393     input_tensor->handle(), output_tensor->handle(), block_size);
1394
1395   _return_fn = asAclFunction(std::move(fn));
1396 }
1397
1398 void KernelGenerator::visit(const ir::operation::Split &node)
1399 {
1400   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1401   const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
1402
1403   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1404   if (!_ctx.at(axis_index).isConstant())
1405   {
1406     throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend");
1407   }
1408
1409   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1410   std::vector<ir::OperandIndex> output_indexes;
1411   for (const auto &output : node.getOutputs())
1412     output_indexes.emplace_back(output);
1413
1414   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1415   std::vector<arm_compute::ICLTensor *> output_tensors;
1416   for (const auto &ofm_ind : output_indexes)
1417     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1418
1419   const auto frontend_layout = _current_layout;
1420   const auto backend_layout = ifm_tensor->layout();
1421   auto axis = _ctx.at(axis_index).asScalar<int32_t>();
1422   if (axis < 0)
1423     axis += ifm_rank;
1424   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1425
1426   auto fn =
1427     acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
1428
1429   _return_fn = asAclFunction(std::move(fn));
1430 }
1431
1432 void KernelGenerator::visit(const ir::operation::SplitV &node)
1433 {
1434   const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
1435   const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
1436   const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};
1437
1438   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1439
1440   const size_t ifm_rank = _ctx.at(ifm_index).shape().rank();
1441   std::vector<ir::OperandIndex> output_indexes;
1442   for (const auto &output : node.getOutputs())
1443     output_indexes.emplace_back(output);
1444
1445   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1446   auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index);
1447
1448   std::vector<arm_compute::ICLTensor *> output_tensors;
1449   for (const auto &ofm_ind : output_indexes)
1450     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1451
1452   auto fn = std::make_unique<arm_compute::CLSplitVEx>();
1453   const auto &split_dim_op = _ctx.at(split_dim_index);
1454   if (split_dim_op.isConstant())
1455   {
1456     int32_t split_dim = split_dim_op.asScalar<int32_t>();
1457     uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
1458     const auto frontend_layout = _current_layout;
1459     const auto backend_layout = ifm_tensor->layout();
1460
1461     if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
1462     {
1463       // This means that the highest dimension's value is 1 and dim_correction was applied to the ifm tensor
1464       acl_common::disableDimCorrection(ifm_tensor);
1465     }
1466
1467     split_dim_revised =
1468       acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
1469         .value();
1470     fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
1471                   output_tensors, node.param().num_splits);
1472
1473     if (ifm_tensor->dimension(0) == 1)
1474     {
1475       acl_common::enableDimCorrection(ifm_tensor);
1476     }
1477   }
1478   else
1479   {
1480     throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend");
1481   }
1482
1483   _return_fn = asAclFunction(std::move(fn));
1484 }
1485
1486 void KernelGenerator::visit(const ir::operation::Unpack &node)
1487 {
1488   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1489   auto axis{node.param().axis};
1490
1491   const auto input_rank = _ctx.at(input_index).shape().rank();
1492
1493   std::vector<ir::OperandIndex> output_indexes;
1494   for (const auto &output_index : node.getOutputs())
1495     output_indexes.emplace_back(output_index);
1496
1497   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1498   std::vector<arm_compute::ICLTensor *> outputs;
1499   for (const auto &output_index : output_indexes)
1500     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1501
1502   const auto frontend_layout = _current_layout;
1503   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1504   if (axis < 0)
1505     axis += input_rank;
1506   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1507
1508   // Disable applied dim_correction
1509   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1510   {
1511     // This means that the highest dimension's value is 1 and dim_correction was applied to the input tensor
1512     acl_common::disableDimCorrection(input_tensor);
1513   }
1514
1515   auto fn =
1516     acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
1517
1518   // Revert disabling applied dim_correction
1519   if (input_tensor->dimension(0) == 1)
1520   {
1521     acl_common::enableDimCorrection(input_tensor);
1522   }
1523
1524   _return_fn = asAclFunction(std::move(fn));
1525 }
1526
1527 void KernelGenerator::visit(const ir::operation::Pad &node)
1528 {
1529   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
1530   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
1531   const auto output_index{node.getOutputs().at(0)};
1532   assert(_ctx.at(pad_index).data());
1533
1534   auto rank = _ctx.at(input_index).shape().rank();
1535   auto pad_base = _ctx.at(pad_index).data()->base();
1536
1537   auto input_type = _ctx.at(input_index).typeInfo();
1538   auto data_type = acl_common::asDataType(input_type.type());
1539   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point());
1540   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
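  // NOTE The pad value is real 0 expressed in the input's data type; for quantized types the
  //      quantization info lets it map to the zero point rather than a raw 0.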
1541
1542   auto input = _tensor_reg->getAclTensor(input_index)->handle();
1543   auto output = _tensor_reg->getAclTensor(output_index)->handle();
1544
1545   const auto frontend_layout = _current_layout;
1546   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1547
1548   ::arm_compute::PaddingList padding_list;
1549   padding_list.resize(rank);
1550   for (int32_t n = 0; n < rank; ++n)
1551   {
1552     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
1553
1554     const auto axis =
1555       acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
1556     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
1557   }
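  // NOTE pad_base holds 'rank' (before, after) pairs in frontend dimension order; each pair is
  //      stored at the corresponding ACL axis so the padding matches the backend layout.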
1558
1559   // Disable applied dim_correction
1560   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
1561   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1562   {
1563     // This means that the highest dimension's value is 1 and dim_correction was applied to the input tensor
1564     acl_common::disableDimCorrection(input_tensor);
1565   }
1566
1567   auto fn =
1568     acl_common::generateLayer<arm_compute::CLPadLayerEx>(input, output, padding_list, pixel_value);
1569
1570   // NOTE Do not re-enable dim_correction here for 4D tensors.
1571   // Doing so would produce mismatched results due to an incorrect offset_first_element in
1572   // ICLKernel::add_tensor_argument<3>().
1573   // We have to keep dim_correction disabled (and not revert it) for kernels that slice a 4D
1574   // tensor into 3D, because slicing the arm_compute::Window can cause an incorrect
1575   // offset_first_element if the tensor is 4D and its highest dimension is 1.
1576   if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
1577   {
1578     acl_common::enableDimCorrection(input_tensor);
1579   }
1580
1581   _return_fn = asAclFunction(std::move(fn));
1582 }
1583
1584 void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
1585 {
1586   const auto ofm_index{node.getOutputs().at(0)};
1587   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
1588
1589   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1590   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1591
1592   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1593     ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
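  // NOTE CLDepthConvertLayer infers the conversion direction (here F32 -> F16) from the data
  //      types of the input and output tensors.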
1594
1595   _return_fn = asAclFunction(std::move(fn));
1596 }
1597
1598 void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
1599 {
1600   const auto ofm_index{node.getOutputs().at(0)};
1601   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
1602
1603   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1604   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1605
1606   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1607     ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1608
1609   _return_fn = asAclFunction(std::move(fn));
1610 }
1611
1612 void KernelGenerator::visit(const ir::operation::Reverse &node)
1613 {
1614   const auto ofm_index{node.getOutputs().at(0)};
1615   const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
1616   const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};
1617
1618   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1619   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1620   auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
1621
1622   // WORKAROUND: The acl_cl backend only allows the U32 type for the axis tensor;
1623   //             ConstantInitializer resolves the S32 type to U32.
1624   if (_ctx.at(axis_index).isConstant() &&
1625       (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
1626   {
1627     axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
1628   }
1629
1630   auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
1631     ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
1632
1633   _return_fn = asAclFunction(std::move(fn));
1634 }
1635
1636 } // namespace acl_cl
1637 } // namespace backend
1638 } // namespace onert