1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "exec/FunctionSequence.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "AclKernelGen.h"
35
36 namespace onert
37 {
38 namespace backend
39 {
40 namespace acl_cl
41 {
42
43 using ::onert::backend::acl_common::asAclFunction;
44 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
45     ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
46
47 KernelGenerator::KernelGenerator(
48     const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
49     const std::shared_ptr<TensorBuilder> &tensor_builder,
50     const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
51     : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
52       _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN)
53 {
54   // DO NOTHING
55 }
56
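// Each visit() below lowers one IR operation into an ACL CL function (arm_compute::IFunction)
// and stores it in _return_fn; visit(OpSequence) collects these into the returned FunctionSequence.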
57 void KernelGenerator::visit(const ir::OpSequence &op_seq)
58 {
59   // TODO Move this to IKernelGenerator
60   //      (all derivatives have the same implementation for this)
61   assert(!_return_fn_seq);
62   _return_fn_seq = std::make_unique<exec::FunctionSequence>();
63   _return_fn_seq->enableDynamicShapeInferer(false);
64
65   _current_layout = op_seq.getLayout();
66   for (const auto &operation_idx : op_seq.operations())
67   {
68     const auto &node = _operations_ctx.at(operation_idx);
69     node.accept(*this);
70     _return_fn_seq->append(releaseFunction());
71   }
72 }
73
74 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
75 {
76   const auto ofm_index{node.getOutputs().at(0)};
77   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
78   const auto block_size_index{
79       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
80
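  // The NN API form of BatchToSpaceND has only two inputs; a third input, if present, carries the
  // crops, which this backend supports only when they are constant zeros.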
81   const auto NNApiInputs = 2;
82   if (node.getInputs().size() != NNApiInputs)
83   {
84     const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
85     if (!_ctx.at(crops_index).isConstant())
86     {
87       throw std::runtime_error("Non-constant crops NYI for acl_cl backend BatchToSpaceND");
88     }
89
90     auto crops = _ctx.at(crops_index).asVector<int32_t>();
91     for (auto crop : crops)
92     {
93       if (crop != 0)
94       {
95         throw std::runtime_error("Non-zero crops NYI for acl_cl backend BatchToSpaceND");
96       }
97     }
98   }
99
100   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
101   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
102   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
103
104   assert(_ctx.at(block_size_index).data());
105
106   auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
107       ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
108
109   _return_fn = asAclFunction(std::move(fn));
110 }
111
112 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
113 {
114   const auto ofm_index{node.getOutputs().at(0)};
115   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
116   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
117
118   const auto activation = node.param().activation;
119
120   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
121   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
122   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
123
124   const auto act_info = acl_common::asActivationLayerInfo(activation);
125
126   std::unique_ptr<arm_compute::IFunction> fn;
127   switch (node.param().arithmetic_type)
128   {
129     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
130     {
131       fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
132           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
133           arm_compute::ConvertPolicy::SATURATE, act_info);
134       break;
135     }
136     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
137     {
138       fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
139           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
140           arm_compute::ConvertPolicy::SATURATE, act_info);
141       break;
142     }
143     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
144     {
145       fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
146           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
147           arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
148           act_info);
149       break;
150     }
151     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
152     {
153       fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
154           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
155       break;
156     }
157     default:
158       assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
159       break;
160   }
161
162   _return_fn = asAclFunction(std::move(fn));
163 }
164
165 void KernelGenerator::visit(const ir::operation::Conv2D &node)
166 {
167   using ir::operation::Conv2D;
168
169   const auto ofm_index{node.getOutputs().at(0)};
170   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
171   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
172   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
173
174   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
175   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
176   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
177   const auto &ker_shape = _ctx.at(ker_index).shape();
178   const auto ker_height = ker_shape.dim(1);
179   const auto ker_width = ker_shape.dim(2);
180
181   const auto stride = node.param().stride;
182   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
183                                             ker_width, ker_height);
184   const auto activation = node.param().activation;
185
186   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
187   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
188   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
189   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
190
191   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
192   const auto act_info = acl_common::asActivationLayerInfo(activation);
193
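  // The default WeightsInfo() (weights not pre-reshaped) and an explicit 1x1 dilation are passed.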
194   auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
195       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
196       ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
197       ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
198
199   _return_fn = asAclFunction(std::move(fn));
200 }
201
202 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
203 {
204   using ir::operation::DepthwiseConv2D;
205
206   const auto ofm_index{node.getOutputs().at(0)};
207   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
208   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
209   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
210
211   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
212   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
213   // Kernel format is [1, kernel_height, kernel_width, depth_out].
214   const auto &ker_shape = _ctx.at(ker_index).shape();
215   const auto ker_height = ker_shape.dim(1);
216   const auto ker_width = ker_shape.dim(2);
217
218   const auto stride = node.param().stride;
219   const auto dilation = node.param().dilation;
220   const auto padding =
221       ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
222                            ker_height, dilation.width_factor, dilation.height_factor);
223   const auto multiplier = node.param().multiplier;
224   const auto activation = node.param().activation;
225
226   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
227   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
228   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
229   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
230
231   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
232   const auto act_info = acl_common::asActivationLayerInfo(activation);
233   const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
234
235   auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
236       ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
237       conv_info, multiplier, act_info, dilation_info);
238
239   _return_fn = asAclFunction(std::move(fn));
240 }
241
242 void KernelGenerator::visit(const ir::operation::Concat &node)
243 {
244   const auto ofm_index{node.getOutputs().at(0)};
245
246   std::vector<ir::OperandIndex> input_indexes;
247
248   for (const auto &input : node.getInputs())
249     input_indexes.emplace_back(input);
250
251   const auto axis = node.param().axis;
252
253   // Concat elimination check
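  // If every input is registered as a sub-tensor of the output, each input already aliases its
  // region of the output buffer, so no copy or concatenation kernel is needed.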
254   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
255   if (eliminated)
256   {
257     // If the concat is eliminated, return a NOP IFunction
258     VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
259     _return_fn = std::make_unique<exec::NopFunction>();
260     return;
261   }
262
263   auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
264   std::vector<::arm_compute::ICLTensor *> input_tensors;
265   for (auto &ifm_ind : input_indexes)
266     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
267
268   std::unique_ptr<::arm_compute::IFunction> fn;
269   if (input_indexes.size() < 2)
270   {
271     fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensors.at(0),
272                                                         output_tensor->handle());
273   }
274   else
275   {
276     const auto rank = _ctx.at(ofm_index).shape().rank();
277     const auto frontend_layout = _current_layout;
278     const auto backend_layout = output_tensor->layout();
279     const auto fixed_axis =
280         acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
281     fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
282         input_tensors, output_tensor->handle(), fixed_axis);
283   }
284
285   _return_fn = asAclFunction(std::move(fn));
286 }
287
288 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
289 {
290   const auto output_index{node.getOutputs().at(0)};
291   auto output_tensor = _tensor_reg->getAclTensor(output_index);
292   const auto activation = node.param().activation;
293   if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
294     throw std::runtime_error(
295         "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");
296
297   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
298                                                 ::arm_compute::CLFullyConnectedReshapingLayer>(
299       node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
300   _return_fn = std::make_unique<exec::FunctionSequence>(
301       std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
302 }
303
304 void KernelGenerator::visit(const ir::operation::Reduce &node)
305 {
306   const auto output_index{node.getOutputs().at(0)};
307   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
308   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
309   const auto keep_dims{node.param().keep_dims};
310   const auto reduce_type = node.param().reduce_type;
311
312   auto output_tensor = _tensor_reg->getAclTensor(output_index);
313   auto input_tensor = _tensor_reg->getAclTensor(input_index);
314
315   // Convert to ACL axes taking into account negative values and possible duplicates.
316   const auto &axes = _ctx.at(axes_index);
317   const auto input_rank = _ctx.at(input_index).shape().rank();
318   const auto frontend_layout = _current_layout;
319   const auto backend_layout = input_tensor->layout();
320
321   std::unique_ptr<arm_compute::IFunction> fn;
322   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
323   {
324     const auto acl_axes =
325         acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
326     fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
327                                                               keep_dims, output_tensor->handle());
328   }
329   else
330   {
331     const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
332
333     fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
334         _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
335         output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
336   }
337
338   _return_fn = asAclFunction(std::move(fn));
339 }
340
341 void KernelGenerator::visit(const ir::operation::Reshape &node)
342 {
343   const auto output_index{node.getOutputs().at(0)};
344   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
345
346   auto output_tensor = _tensor_reg->getAclTensor(output_index);
347   auto input_tensor = _tensor_reg->getAclTensor(input_index);
348
349   // NOTE This operation must not change the layout from frontend to backend,
350   //      so PermutationOperationPass makes the frontend and backend layouts the same.
351   const auto frontend_layout = _current_layout;
352   const auto backend_layout = output_tensor->layout();
353   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
354          frontend_layout == backend_layout);
355   UNUSED_RELEASE(frontend_layout);
356   UNUSED_RELEASE(backend_layout);
357
358   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
359                                                                    output_tensor->handle());
360
361   _return_fn = asAclFunction(std::move(fn));
362 }
363
364 void KernelGenerator::visit(const ir::operation::Squeeze &node)
365 {
366   // Squeeze is identical to reshape except that it has an optional dimensions input.
367   // The optional dims input is ignored here since the output tensor already has the squeezed
368   // shape produced by the frontend (e.g. freezer and toco).
369   // TODO Support multi-layout for frontend and backend
370   const auto output_index{node.getOutputs().at(0)};
371   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
372   const auto dims{node.param().dims};
373   const auto ndim{node.param().ndim};
374   (void)dims;
375   (void)ndim;
376
377   auto output_tensor = _tensor_reg->getAclTensor(output_index);
378   auto input_tensor = _tensor_reg->getAclTensor(input_index);
379   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
380                                                                    output_tensor->handle());
381   _return_fn = asAclFunction(std::move(fn));
382 }
383
384 void KernelGenerator::visit(const ir::operation::Softmax &node)
385 {
386   const auto output_index{node.getOutputs().at(0)};
387   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
388
389   const auto beta = node.param().beta;
390
391   auto output_tensor = _tensor_reg->getAclTensor(output_index);
392   auto input_tensor = _tensor_reg->getAclTensor(input_index);
393
394   auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
395       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
396       output_tensor->handle(), beta);
397
398   _return_fn = asAclFunction(std::move(fn));
399 }
400
401 void KernelGenerator::visit(const ir::operation::Slice &node)
402 {
403   const auto output_index{node.getOutputs().at(0)};
404   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
405   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
406   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
407
408   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
409   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
410   const auto frontend_layout = _current_layout;
411   const auto backend_layout = inputData_tensor->layout();
412
413   // Set up the start/end coordinates, reordered to match the backend layout of inputData
414   int input_rank = _ctx.at(input_index).shape().rank();
415   std::vector<int32_t> starts;
416   std::vector<int32_t> ends;
417   starts.resize(input_rank, 0);
418   ends.resize(input_rank, 0);
419   {
420     assert(_ctx.at(begins_index).data());
421     assert(_ctx.at(sizes_index).data());
422     auto beginData_base = _ctx.at(begins_index).data()->base();
423     auto sizeData_base = _ctx.at(sizes_index).data()->base();
424     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
425     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
426
427     using ir::DataType;
428
429     UNUSED_RELEASE(beginData_size);
430     UNUSED_RELEASE(sizeData_size);
431
432     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
433     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
434     assert(beginData_size == input_rank);
435     assert(sizeData_size == input_rank);
436
437     assert(beginData_base != nullptr);
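    // Remap each frontend axis to the backend (ACL) axis and convert the (begin, size) pair into
    // start/end coordinates (end = begin + size, i.e. the end is exclusive).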
438     for (int n = 0; n < input_rank; ++n)
439     {
440       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
441                                                                  backend_layout)
442                       .value();
443
444       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
445       starts[axis] = begin_value;
446
447       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
448       ends[axis] = begin_value + size_value;
449     }
450   }
451
452   ::arm_compute::Coordinates starts_set;
453   ::arm_compute::Coordinates ends_set;
454
455   for (size_t i = 0; i < starts.size(); ++i)
456   {
457     starts_set.set(i, starts[i]);
458     ends_set.set(i, ends[i]);
459   }
460
461   auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
462       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
463
464   _return_fn = asAclFunction(std::move(fn));
465 }
466
467 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
468 {
469   const auto output_index{node.getOutputs().at(0)};
470   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
471   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
472   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
473   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
474
475   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
476   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
477   const auto frontend_layout = _current_layout;
478   const auto backend_layout = inputData_tensor->layout();
479
480   // Set up the start/end/stride values, reordered to match the backend layout of inputData
481   int input_rank = _ctx.at(input_index).shape().rank();
482   std::vector<int32_t> starts;
483   std::vector<int32_t> ends;
484   std::vector<int32_t> strides;
485   starts.resize(input_rank, 0);
486   ends.resize(input_rank, 0);
487   strides.resize(input_rank, 0);
488   {
489     assert(_ctx.at(starts_index).data());
490     assert(_ctx.at(ends_index).data());
491     assert(_ctx.at(strides_index).data());
492     auto startData_base = _ctx.at(starts_index).data()->base();
493     auto endData_base = _ctx.at(ends_index).data()->base();
494     auto stridesData_base = _ctx.at(strides_index).data()->base();
495     const int startData_size = _ctx.at(starts_index).shape().num_elements();
496     const int endData_size = _ctx.at(ends_index).shape().num_elements();
497     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
498
499     using ir::DataType;
500
501     UNUSED_RELEASE(startData_size);
502     UNUSED_RELEASE(endData_size);
503     UNUSED_RELEASE(stridesData_size);
504
505     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
506     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
507     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
508     assert(startData_size == input_rank);
509     assert(endData_size == input_rank);
510     assert(stridesData_size == input_rank);
511
512     assert(startData_base != nullptr);
513     for (int n = 0; n < input_rank; ++n)
514     {
515       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
516                                                                  backend_layout)
517                       .value();
518
519       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
520       starts[axis] = start_value;
521
522       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
523       ends[axis] = end_value;
524
525       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
526       strides[axis] = strides_value;
527     }
528   }
529
530   // Reorder the mask bits to match the backend layout of inputData
531   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
532                                                            frontend_layout, backend_layout);
533   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
534                                                          frontend_layout, backend_layout);
535   const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
536       node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
537
538   ::arm_compute::Coordinates starts_set;
539   ::arm_compute::Coordinates ends_set;
540   ::arm_compute::BiStrides strides_set;
541
542   for (size_t i = 0; i < starts.size(); ++i)
543   {
544     starts_set.set(i, starts[i]);
545     ends_set.set(i, ends[i]);
546     strides_set.set(i, strides[i]);
547   }
548
549   // Disable applied dim_correction
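  // (With trailing dimensions of 1, the ACL tensor info reports a smaller rank than the IR shape;
  //  disabling dim_correction restores the full rank for kernel configuration.)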
550   if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
551   {
552     // The higher dimensions are 1 and dim_correction has been applied to the input tensor
553     acl_common::disableDimCorrection(inputData_tensor);
554   }
555
556   auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
557       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
558       begin_mask, end_mask, shrink_axis_mask);
559
560   // Revert disabling applied dim_correction
561   if (inputData_tensor->dimension(0) == 1)
562   {
563     acl_common::enableDimCorrection(inputData_tensor);
564   }
565
566   _return_fn = asAclFunction(std::move(fn));
567 }
568
569 void KernelGenerator::visit(const ir::operation::Transpose &node)
570 {
571   const auto ofm_idx{node.getOutputs().at(0)};
572   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
573   const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
574
575   const auto rank = _ctx.at(ifm_idx).shape().rank();
576
577   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
578   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
579   const auto frontend_layout = _current_layout;
580   const auto backend_layout = ifm_tensor->layout();
581
582   const auto &perms = _ctx.at(perm_idx);
583   std::vector<int32_t> pv;
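  // An empty permutation operand selects the default transpose: reverse all dimensions.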
584   if (perms.shape() == ir::Shape{0})
585   {
586     pv.resize(rank);
587     std::iota(pv.begin(), pv.end(), 0);
588     std::reverse(pv.begin(), pv.end());
589   }
590   else
591   {
592     pv = _ctx.at(perm_idx).asVector<int32_t>();
593   }
594
595   std::unique_ptr<arm_compute::IFunction> fn;
596   if (rank == 1)
597   {
598     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
599   }
600   else if (rank == 2)
601   {
602     assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
603     fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(),
604                                                              ofm_tensor->handle());
605   }
606   else
607   {
608     auto backend_pv =
609         acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
610
611     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
612                                                            ofm_tensor->handle(), backend_pv);
613   }
614
615   _return_fn = asAclFunction(std::move(fn));
616 }
617
618 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
619 {
620   const auto ofm_index{node.getOutputs().at(0)};
621   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
622
623   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
624   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
625
626   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
627       node.param().op_type, node.param().alpha, node.param().beta);
628
629   auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
630       ifm_tensor->handle(), ofm_tensor->handle(), act_info);
631
632   _return_fn = asAclFunction(std::move(fn));
633 }
634
635 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
636 {
637   const auto output_index{node.getOutputs().at(0)};
638   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
639   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
640
641   auto output_tensor = _tensor_reg->getAclTensor(output_index);
642   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
643   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
644
645   std::unique_ptr<arm_compute::IFunction> fn;
646   switch (node.param().op_type)
647   {
648     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
649     {
650       fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
651           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
652           arm_compute::BinaryLogicalOperation::AND);
653       break;
654     }
655     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
656     {
657       fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
658           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
659       break;
660     }
661     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
662     {
663       fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
664           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
665       break;
666     }
667     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
668     {
669       fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
670           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
671       break;
672     }
673     default:
674     {
675       std::string err_msg("acl_cl KernelGenerator : " + node.name() +
676                           " is not an elementwise-binary operation");
677       assert(false && err_msg.c_str());
678       break;
679     }
680   }
681
682   _return_fn = asAclFunction(std::move(fn));
683 }
684
685 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
686 {
687   const auto output_index{node.getOutputs().at(0)};
688   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
689
690   auto output_tensor = _tensor_reg->getAclTensor(output_index);
691   auto input_tensor = _tensor_reg->getAclTensor(input_index);
692
693   std::unique_ptr<arm_compute::IFunction> fn;
694   switch (node.param().op_type)
695   {
696     case ir::operation::ElementwiseUnary::Type::ABS:
697     {
698       const ::arm_compute::ActivationLayerInfo act_info{
699           ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
700
701       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
702           input_tensor->handle(), output_tensor->handle(), act_info);
703       break;
704     }
705     case ir::operation::ElementwiseUnary::Type::CAST:
706     {
707       if (input_tensor->data_type() == output_tensor->data_type())
708       {
709         fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
710                                                             output_tensor->handle());
711       }
712       else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
713       {
714         fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(),
715                                                                 output_tensor->handle());
716       }
717       else
718       {
719         // TODO Support converting float to int32 as round down
720         fn = acl_common::generateLayer<arm_compute::CLCast>(
721             input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
722       }
723       break;
724     }
725     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
726     {
727       fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
728                                                                          output_tensor->handle());
729       break;
730     }
731     case ir::operation::ElementwiseUnary::Type::EXP:
732     {
733       fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
734                                                               output_tensor->handle());
735       break;
736     }
737     case ir::operation::ElementwiseUnary::Type::FLOOR:
738     {
739       fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
740                                                            output_tensor->handle());
741       break;
742     }
743     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
744     {
745       fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
746                                                                 output_tensor->handle());
747       break;
748     }
749     case ir::operation::ElementwiseUnary::Type::NEG:
750     {
751       fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
752                                                          output_tensor->handle());
753       break;
754     }
755     case ir::operation::ElementwiseUnary::Type::RSQRT:
756     {
757       fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
758                                                                 output_tensor->handle());
759       break;
760     }
761     case ir::operation::ElementwiseUnary::Type::SQRT:
762     {
763       const ::arm_compute::ActivationLayerInfo act_info{
764           ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
765
766       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
767           input_tensor->handle(), output_tensor->handle(), act_info);
768       break;
769     }
770     default:
771     {
772       throw std::runtime_error("acl_cl KernelGenerator : " + node.name() + " is not supported yet");
773       break;
774     }
775   }
776
777   auto acl_fn = asAclFunction(std::move(fn));
778
779   _return_fn = std::move(acl_fn);
780 }
781
782 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
783 {
784   const auto output_index{node.getOutputs().at(0)};
785   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
786
787   auto output_tensor = _tensor_reg->getAclTensor(output_index);
788   auto input_tensor = _tensor_reg->getAclTensor(input_index);
789
790   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
791                                                                    output_tensor->handle());
792
793   _return_fn = asAclFunction(std::move(fn));
794 }
795
796 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
797 {
798   const auto ofm_index{node.getOutputs().at(0)};
799   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
800   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
801   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
802
803   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
804   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
805   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
806   auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
807   auto epsilon = node.param().epsilon;
808   auto activation = node.param().activation;
809
810   auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
811       ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
812       epsilon);
813
814   _return_fn = std::make_unique<exec::FunctionSequence>(
815       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
816 }
817
818 void KernelGenerator::visit(const ir::operation::LSTM &node)
819 {
820   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
821                                          ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
822 }
823
824 void KernelGenerator::visit(const ir::operation::Comparison &node)
825 {
826   const auto output_index{node.getOutputs().at(0)};
827   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
828   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
829
830   const auto comparison_type = node.param().comparison_type;
831
832   auto output_tensor = _tensor_reg->getAclTensor(output_index);
833   auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
834   auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
835
836   auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
837       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
838       (arm_compute::ComparisonOperation)comparison_type);
839
840   _return_fn = asAclFunction(std::move(fn));
841 }
842
843 void KernelGenerator::visit(const ir::operation::OneHot &node)
844 {
845   const auto output_idx{node.getOutputs().at(0)};
846   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
847   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
848   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
849   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
850   const auto depth = _ctx.at(depth_idx).asScalar<int32_t>();
851   assert(depth > 0);
852
853   auto output_tensor = _tensor_reg->getAclTensor(output_idx);
854   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
855   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
856
857   const size_t output_rank = _ctx.at(output_idx).shape().rank();
858   const auto frontend_layout = _current_layout;
859   const auto backend_layout = output_tensor->layout();
860   int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
861   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
862
863   if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
864   {
865     // The higher dimensions are 1 and dim_correction has been applied to output_tensor
866     acl_common::disableDimCorrection(output_tensor);
867   }
868
869   std::unique_ptr<::arm_compute::IFunction> fn;
870   const auto &offvalue = _ctx.at(offvalue_idx);
871   if (offvalue.isConstant())
872   {
873     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
874         indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
875         acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
876   }
877   else
878   {
879     auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
880     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
881         indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
882         output_tensor->handle(), static_cast<uint32_t>(depth), axis);
883   }
884
885   if (output_tensor->dimension(0) == 1)
886   {
887     acl_common::enableDimCorrection(output_tensor);
888   }
889
890   _return_fn = asAclFunction(std::move(fn));
891 }
892
893 void KernelGenerator::visit(const ir::operation::Pack &node)
894 {
895   const auto output_index{node.getOutputs().at(0)};
896   auto axis{node.param().axis};
897
898   const auto output_rank = _ctx.at(output_index).shape().rank();
899
900   std::vector<ir::OperandIndex> input_indexes;
901   for (const auto &input_index : node.getInputs())
902     input_indexes.emplace_back(input_index);
903
904   auto output = _tensor_reg->getAclTensor(output_index)->handle();
905   std::vector<arm_compute::ICLTensor *> inputs;
906   for (const auto &input_index : input_indexes)
907     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
908
909   const auto frontend_layout = _current_layout;
910   const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
911
912   if (axis < 0)
913     axis += output_rank;
914   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
915
916   // Disable applied dim_correction
917   for (const auto &input_index : input_indexes)
918   {
919     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
920     if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
921     {
922       // The higher dimensions are 1 and dim_correction has been applied to the input tensor
923       acl_common::disableDimCorrection(input_tensor);
924     }
925   }
926
927   auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);
928
929   // Revert disabling applied dim_correction
930   for (const auto &input_index : input_indexes)
931   {
932     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
933     if (input_tensor->dimension(0) == 1)
934     {
935       acl_common::enableDimCorrection(input_tensor);
936     }
937   }
938
939   _return_fn = asAclFunction(std::move(fn));
940 }
941
942 void KernelGenerator::visit(const ir::operation::Pool2D &node)
943 {
944   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
945       node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
946
947   const auto ofm_index{node.getOutputs().at(0)};
948   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
949   const auto activation = node.param().activation;
950   _return_fn = std::make_unique<exec::FunctionSequence>(
951       asAclFunction(std::move(raw_fn)),
952       ActivationBuilder::generate(activation, ofm_tensor->handle()));
953 }
954
955 void KernelGenerator::visit(const ir::operation::Permute &node)
956 {
957   const auto ofm_idx{node.getOutputs().at(0)};
958   const auto ifm_idx{node.getInputs().at(0)};
959   const auto permute_type = node.getPermuteType();
960   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
961   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
962   const auto rank = _ctx.at(ofm_idx).shape().rank();
963   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
964
965   std::unique_ptr<::arm_compute::IFunction> fn;
966   arm_compute::PermutationVector pv;
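  // The PermutationVector is given in ACL coordinate order (innermost dimension first), so an NHWC
  // tensor reads as CWHN and an NCHW tensor as WHCN in the comments below.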
967   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
968   {
969     // WHCN -> CWHN
970     pv = arm_compute::PermutationVector{2, 0, 1};
971
972     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
973                                                            ofm_tensor->handle(), pv);
974   }
975   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
976   {
977     // CWHN -> WHCN
978     pv = arm_compute::PermutationVector{1, 2, 0};
979
980     fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
981                                                              ofm_tensor->handle(), pv);
982   }
983   else
984   {
985     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
986   }
987
988   _return_fn = asAclFunction(std::move(fn));
989 }
990
991 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
992 {
993   const auto ofm_index{node.getOutputs().at(0)};
994   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
995
996   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
997   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
998
999   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
1000       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
1001       ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
1002       ::arm_compute::SamplingPolicy::TOP_LEFT);
1003
1004   _return_fn = asAclFunction(std::move(fn));
1005 }
1006
1007 void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
1008 {
1009   const auto ofm_index{node.getOutputs().at(0)};
1010   const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};
1011
1012   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1013   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1014
1015   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
1016       ifm_tensor->handle(), ofm_tensor->handle(),
1017       ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
1018       ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
1019
1020   _return_fn = asAclFunction(std::move(fn));
1021 }
1022
1023 void KernelGenerator::visit(const ir::operation::RNN &node)
1024 {
1025   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
1026   const auto hidden_state_out_index{
1027       node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
1028
1029   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
1030   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
1031   const auto recurrent_weights_index{
1032       node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
1033   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
1034   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
1035
1036   const auto activation = node.param().activation;
1037
1038   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1039   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
1040
1041   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1042   auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
1043   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
1044   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
1045   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
1046   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
1047
1048   auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
1049       hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
1050
1051   auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
1052       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
1053       weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
1054       hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
1055   _return_fn = std::make_unique<exec::FunctionSequence>(asAclFunction(std::move(copy_layer)),
1056                                                         asAclFunction(std::move(fn)));
1057 }
1058
1059 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
1060 {
1061   const auto ofm_index{node.getOutputs().at(0)};
1062   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
1063   const auto block_size_index{
1064       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
1065   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
1066
1067   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1068   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1069   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
1070   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
1071
1072   assert(_ctx.at(block_size_index).data());
1073   assert(_ctx.at(paddings_index).data());
1074
1075   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
1076       ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1077       ofm_tensor->handle());
1078
1079   _return_fn = asAclFunction(std::move(fn));
1080 }
1081
1082 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1083 {
1084   const auto ofm_index{node.getOutputs().at(0)};
1085   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1086
1087   auto block_size = node.param().block_size;
1088
1089   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1090   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1091
1092   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
1093       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1094
1095   _return_fn = asAclFunction(std::move(fn));
1096 }
1097
1098 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
1099 {
1100   const auto output_index{node.getOutputs().at(0)};
1101   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
1102   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
1103
1104   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1105   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1106   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1107
1108   auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
1109       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
1110
1111   _return_fn = asAclFunction(std::move(fn));
1112 }
1113
1114 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
1115 {
1116   const auto ofm_index{node.getOutputs().at(0)};
1117   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
1118
1119   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
1120   // L2 Normalization always performs the reduction along the depth axis
1121   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
1122   // choosing normalization parameters as below
1123
1124   const auto &ifm_shape = _ctx.at(ifm_index).shape();
1125   // TODO Support an optional constant axis along which the normalization is performed
1126   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
1127   int32_t radius =
1128       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
1129   float alpha = 1.0f;                            // Chosen so that alpha_ in the implementation becomes 1
1130   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
1131   float bias = 0.0f;                             // Don't offset the reduction.
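  // With these parameters, CROSS_MAP normalization computes
  //   out = in / (bias + alpha * sum(in^2))^beta = in / sqrt(sum(in^2))
  // over the whole depth, i.e. L2 normalization along the last axis.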
1132
1133   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1134   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1135
1136   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
1137                                                                radius, alpha, beta, bias, false);
1138
1139   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1140       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1141
1142   _return_fn = asAclFunction(std::move(fn));
1143 }
1144
1145 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
1146 {
1147   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
1148   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
1149
1150   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
1151   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
1152   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
1153
1154   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1155   auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
1156
1157   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1158   auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
1159   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1160
1161   auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
1162       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
1163       output_tensor->handle(), hits_tensor->handle());
1164
1165   _return_fn = asAclFunction(std::move(fn));
1166 }
1167
1168 void KernelGenerator::visit(const ir::operation::PReLU &node)
1169 {
1170   const auto ofm_index{node.getOutputs().at(0)};
1171   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
1172   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
1173
1174   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1175   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1176   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
1177
1178   auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
1179       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
1180
1181   _return_fn = asAclFunction(std::move(fn));
1182 }
1183
1184 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1185 {
1186   const auto ofm_index{node.getOutputs().at(0)};
1187   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1188   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1189
1190   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
1191   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
1192   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);
1193
1194   const auto stride = node.param().stride;
1195
1196   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1197          (node.param().padding.type == ir::PaddingType::VALID));
1198   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1199                                       ker_shape.W, ker_shape.H);
1200   uint32_t invalid_horizontal = 0;
1201   uint32_t invalid_vertical = 0;
1202   if (node.param().padding.type == ir::PaddingType::VALID)
1203   {
1204     invalid_horizontal =
1205         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1206     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1207   }
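  // For VALID padding, invalid_horizontal/vertical = ofm - ((ifm - 1) * stride + ker): the
  // right/bottom output cells the transposed convolution cannot fill, forwarded to the ACL layer.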
1208
1209   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1210   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1211   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1212
1213   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1214
1215   auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
1216       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
1217       ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
1218       invalid_vertical);
1219
1220   _return_fn = asAclFunction(std::move(fn));
1221 }
1222
1223 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1224 {
1225   const auto ofm_index{node.getOutputs().at(0)};
1226   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1227   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1228
1229   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1230   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1231   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1232
1233   auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
1234       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1235
1236   _return_fn = asAclFunction(std::move(fn));
1237 }
1238
1239 void KernelGenerator::visit(const ir::operation::TopKV2 &node)
1240 {
1241   const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
1242   const auto outputIndices_index{
1243       node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};
1244
1245   const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};
1246
1247   // Currently, only 1-D or 2-D input is supported.
1248   assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1249          _ctx.at(inputData_index).shape().rank() == 2);
1250
1251   const auto k = node.param().k;
1252
1253   auto values_tensor = _tensor_reg->getAclTensor(outputValues_index);
1254   auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index);
1255   auto input_tensor = _tensor_reg->getAclTensor(inputData_index);
1256
1257   auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
1258       input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
1259
1260   _return_fn = asAclFunction(std::move(fn));
1261 }
1262
1263 void KernelGenerator::visit(const ir::operation::Gather &node)
1264 {
1265   const auto ofm_index{node.getOutputs().at(0)};
1266
1267   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
1268   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
1269
1270   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1271   const auto axis_raw = node.param().axis;
1272   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
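  // NOTE (assumption) ToARMComputeAxis maps a frontend axis to ACL's reversed dimension order
  //      (ACL dimension 0 is the innermost one); e.g. for ifm_rank == 4 and axis_value == 1 the
  //      resulting ACL axis would be 4 - 1 - 1 = 2, layout permutation aside.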
1273   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
1274
1275   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1276   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1277   auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
1278
1279   // NOTE The frontend layout and backend layout must be the same for this operation.
1280   //      If they differ, we would have to add a stage to permute the output tensor, which
1281   //      would not be efficient even if it worked. In that case it would be better to set
1282   //      these backend tensors to the same layout.
1283   //      There is one more thing to consider: this operation depends on the layout of the
1284   //      model. For example, if an NHWC model has this operation with output rank == 4,
1285   //      indices rank == 2 and axis == 2, it should work over the W and C axes, but W and C
1286   //      are not contiguous in NCHW, so a backend running in NCHW cannot handle this case.
1287   const auto backend_layout = ofm_tensor->layout();
1288   UNUSED_RELEASE(backend_layout);
1289   assert(backend_layout == ifm_tensor->layout());
1290   assert(backend_layout == indices_tensor->layout());
1291   assert(ifm_rank < 4 || _current_layout == backend_layout);
1292
1293   // input is n-D, indices k-D, output is (n + k - 1)-D
1294   size_t n = ifm_rank;
1295   assert(n == ifm_tensor->num_dimensions());
1296   size_t k = _ctx.at(indices_index).shape().rank();
1297   assert(k == indices_tensor->num_dimensions());
1298
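  // NOTE (assumption) dim_correction in ACL trims trailing dimensions of size 1 from a
  //      TensorShape; when the IR rank differs from the ACL tensor's reported rank it is
  //      temporarily disabled below so CLGatherEx sees the full n-D / k-D shapes.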
1299   // Disable applied dim_correction
1300   if (n != ifm_tensor->info()->num_dimensions())
1301   {
1302     // This means the highest dimension's value is 1 and dim_correction has been applied to the ifm tensor
1303     acl_common::disableDimCorrection(ifm_tensor);
1304   }
1305   if (k != indices_tensor->info()->num_dimensions())
1306   {
1307     // This means the highest dimension's value is 1 and dim_correction has been applied to the indices tensor
1308     acl_common::disableDimCorrection(indices_tensor);
1309   }
1310
1311   auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
1312       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
1313
1314   // Revert disabling applied dim_correction
1315   if (ifm_tensor->dimension(0) == 1)
1316   {
1317     acl_common::enableDimCorrection(ifm_tensor);
1318   }
1319   if (indices_tensor->dimension(0) == 1)
1320   {
1321     acl_common::enableDimCorrection(indices_tensor);
1322   }
1323
1324   _return_fn = asAclFunction(std::move(fn));
1325 }
1326
1327 void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
1328 {
1329   const auto ofm_index{node.getOutputs().at(0)};
1330   const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
1331   const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};
1332
1333   auto ifm_shape = _ctx.at(ifm_index).shape();
1334   auto ofm_shape = _ctx.at(ofm_index).shape();
1335
1336   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1337
1338   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1339   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1340   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1341   auto frontend_layout = _current_layout;
1342   auto backend_layout = ifm_tensor->layout();
1343
1344   int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
1345   if (axis_value < 0)
1346   {
1347     axis_value += ifm_rank;
1348   }
1349
1350   auto acl_axis =
1351       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
1352   auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
1353                                              : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
1354   auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
1355       ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);
1356
1357   _return_fn = asAclFunction(std::move(fn));
1358 }
1359
1360 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
1361 {
1362   const auto ofm_index{node.getOutputs().at(0)};
1363   const auto ifm_index{
1364       node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
1365
1366   auto radius = node.param().radius;
1367   auto alpha = node.param().alpha;
1368   auto beta = node.param().beta;
1369   auto bias = node.param().bias;
1370
1371   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1372   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1373
1374   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1375       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
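  // NOTE (assumption) NormalizationLayerInfo takes the full window size rather than a radius,
  //      hence radius * 2 + 1 (e.g. radius == 2 -> norm_size == 5); the trailing 'false' keeps
  //      alpha unscaled by the normalization size, matching NN API LRN semantics.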
1376
1377   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1378       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1379
1380   _return_fn = asAclFunction(std::move(fn));
1381 }
1382
1383 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
1384 {
1385   const auto output_index{node.getOutputs().at(0)};
1386   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
1387
1388   auto block_size = node.param().block_size;
1389   assert(block_size > 0);
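  // NOTE (illustration) DepthToSpace rearranges depth into spatial blocks; with block_size == 2,
  //      an NHWC input of shape [N, H, W, 4 * C] would become [N, 2 * H, 2 * W, C].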
1390
1391   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1392   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1393
1394   auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
1395       input_tensor->handle(), output_tensor->handle(), block_size);
1396
1397   _return_fn = asAclFunction(std::move(fn));
1398 }
1399
1400 void KernelGenerator::visit(const ir::operation::Split &node)
1401 {
1402   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1403   const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
1404
1405   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1406   if (!_ctx.at(axis_index).isConstant())
1407   {
1408     throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend");
1409   }
1410
1411   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1412   std::vector<ir::OperandIndex> output_indexes;
1413   for (const auto &output : node.getOutputs())
1414     output_indexes.emplace_back(output);
1415
1416   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1417   std::vector<arm_compute::ICLTensor *> output_tensors;
1418   for (const auto &ofm_ind : output_indexes)
1419     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1420
1421   const auto frontend_layout = _current_layout;
1422   const auto backend_layout = ifm_tensor->layout();
1423   auto axis = _ctx.at(axis_index).asScalar<int32_t>();
1424   if (axis < 0)
1425     axis += ifm_rank;
1426   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1427
1428   auto fn =
1429       acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
1430
1431   _return_fn = asAclFunction(std::move(fn));
1432 }
1433
1434 void KernelGenerator::visit(const ir::operation::SplitV &node)
1435 {
1436   const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
1437   const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
1438   const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};
1439
1440   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1441
1442   const size_t ifm_rank = _ctx.at(ifm_index).shape().rank();
1443   std::vector<ir::OperandIndex> output_indexes;
1444   for (const auto &output : node.getOutputs())
1445     output_indexes.emplace_back(output);
1446
1447   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1448   auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index);
1449
1450   std::vector<arm_compute::ICLTensor *> output_tensors;
1451   for (const auto &ofm_ind : output_indexes)
1452     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1453
1454   auto fn = std::make_unique<arm_compute::CLSplitVEx>();
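  // NOTE (assumption) The layer is constructed first and configured only in the constant
  //      split_dim branch below, because the configure() arguments depend on the resolved axis
  //      and on the dim_correction state; that is presumably why generateLayer is not used here.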
1455   const auto &split_dim_op = _ctx.at(split_dim_index);
1456   if (split_dim_op.isConstant())
1457   {
1458     int32_t split_dim = split_dim_op.asScalar<int32_t>();
1459     uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
1460     const auto frontend_layout = _current_layout;
1461     const auto backend_layout = ifm_tensor->layout();
1462
1463     if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
1464     {
1465       // This means the highest dimension's value is 1 and dim_correction has been applied to the ifm tensor
1466       acl_common::disableDimCorrection(ifm_tensor);
1467     }
1468
1469     split_dim_revised =
1470         acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
1471             .value();
1472     fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
1473                   output_tensors, node.param().num_splits);
1474
1475     if (ifm_tensor->dimension(0) == 1)
1476     {
1477       acl_common::enableDimCorrection(ifm_tensor);
1478     }
1479   }
1480   else
1481   {
1482     throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend");
1483   }
1484
1485   _return_fn = asAclFunction(std::move(fn));
1486 }
1487
1488 void KernelGenerator::visit(const ir::operation::Unpack &node)
1489 {
1490   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1491   auto axis{node.param().axis};
1492
1493   const auto input_rank = _ctx.at(input_index).shape().rank();
1494
1495   std::vector<ir::OperandIndex> output_indexes;
1496   for (const auto &output_index : node.getOutputs())
1497     output_indexes.emplace_back(output_index);
1498
1499   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1500   std::vector<arm_compute::ICLTensor *> outputs;
1501   for (const auto &output_index : output_indexes)
1502     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1503
1504   const auto frontend_layout = _current_layout;
1505   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1506   if (axis < 0)
1507     axis += input_rank;
1508   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1509
1510   // Disable applied dim_correction
1511   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1512   {
1513     // This means the highest dimension's value is 1 and dim_correction has been applied to the input tensor
1514     acl_common::disableDimCorrection(input_tensor);
1515   }
1516
1517   auto fn =
1518       acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
1519
1520   // Revert disabling applied dim_correction
1521   if (input_tensor->dimension(0) == 1)
1522   {
1523     acl_common::enableDimCorrection(input_tensor);
1524   }
1525
1526   _return_fn = asAclFunction(std::move(fn));
1527 }
1528
1529 void KernelGenerator::visit(const ir::operation::Pad &node)
1530 {
1531   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
1532   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
1533   const auto output_index{node.getOutputs().at(0)};
1534   assert(_ctx.at(pad_index).data());
1535
1536   auto rank = _ctx.at(input_index).shape().rank();
1537   auto pad_base = _ctx.at(pad_index).data()->base();
1538
1539   auto input_type = _ctx.at(input_index).typeInfo();
1540   auto data_type = acl_common::asDataType(input_type.type());
1541   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
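  // NOTE (assumption) PixelValue(0, data_type, quant_info) encodes the padding constant 0 in the
  //      input's type; for quantized types this should resolve to the zero point.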
1542   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
1543
1544   auto input = _tensor_reg->getAclTensor(input_index)->handle();
1545   auto output = _tensor_reg->getAclTensor(output_index)->handle();
1546
1547   const auto frontend_layout = _current_layout;
1548   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1549
1550   ::arm_compute::PaddingList padding_list;
1551   padding_list.resize(rank);
1552   for (int32_t n = 0; n < rank; ++n)
1553   {
1554     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
1555
1556     const auto axis =
1557         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
1558     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
1559   }
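  // NOTE (illustration) The PAD input is read as a [rank, 2] table of (before, after) amounts per
  //      frontend axis, and each frontend axis n is remapped to its ACL axis; e.g. with rank == 4
  //      and matching NHWC layouts, frontend axis 0 (N) would land in padding_list[3], assuming
  //      the reversed ACL dimension order.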
1560
1561   // Disable applied dim_correction
1562   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
1563   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1564   {
1565     // This means the highest dimension's value is 1 and dim_correction has been applied to the input tensor
1566     acl_common::disableDimCorrection(input_tensor);
1567   }
1568
1569   auto fn =
1570       acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
1571
1572   // NOTE Do not re-enable dim_correction for 4D tensors.
1573   // Doing so would produce mismatched results due to an incorrect offset_first_element in
1574   // ICLKernel::add_tensor_argument<3>().
1575   // We must keep dim_correction disabled, and not revert it, for kernels that slice 4D into 3D,
1576   // because slicing the arm_compute::Window can cause an incorrect offset_first_element when the
1577   // tensor is 4D and its highest dimension is 1.
1578   if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
1579   {
1580     acl_common::enableDimCorrection(input_tensor);
1581   }
1582
1583   _return_fn = asAclFunction(std::move(fn));
1584 }
1585
1586 void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
1587 {
1588   const auto ofm_index{node.getOutputs().at(0)};
1589   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
1590
1591   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1592   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1593
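  // NOTE (assumption) CLDepthConvertLayer infers the conversion direction (here F32 -> F16) from
  //      the input/output tensor data types; SATURATE clamps out-of-range values and the shift of
  //      0 means no bit shift is applied.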
1594   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1595       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1596
1597   _return_fn = asAclFunction(std::move(fn));
1598 }
1599
1600 void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
1601 {
1602   const auto ofm_index{node.getOutputs().at(0)};
1603   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
1604
1605   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1606   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1607
1608   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1609       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1610
1611   _return_fn = asAclFunction(std::move(fn));
1612 }
1613
1614 void KernelGenerator::visit(const ir::operation::Reverse &node)
1615 {
1616   const auto ofm_index{node.getOutputs().at(0)};
1617   const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
1618   const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};
1619
1620   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1621   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1622   auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
1623
1624   // WORKAROUND: The acl_cl backend only allows the U32 type for axis.
1625   //             ConstantInitializer will resolve the S32 type to U32.
1626   if (_ctx.at(axis_index).isConstant() &&
1627       (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
1628   {
1629     axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
1630   }
1631
1632   auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
1633       ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
1634
1635   _return_fn = asAclFunction(std::move(fn));
1636 }
1637
1638 } // namespace acl_cl
1639 } // namespace backend
1640 } // namespace onert