e7690af2ebb045ab6fd29e8ee28d486c83436c4f
[platform/core/ml/nnfw.git] / runtime / onert / backend / acl_cl / KernelGenerator.cc
1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "exec/FunctionSequence.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "AclKernelGen.h"
35
36 namespace onert
37 {
38 namespace backend
39 {
40 namespace acl_cl
41 {
42
43 using ::onert::backend::acl_common::asAclFunction;
44 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
45     ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
46
47 KernelGenerator::KernelGenerator(
48     const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
49     const std::shared_ptr<TensorBuilder> &tensor_builder,
50     const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
51     : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
52       _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN)
53 {
54   // DO NOTHING
55 }
56
57 void KernelGenerator::visit(const ir::OpSequence &op_seq)
58 {
59   // TODO Move this to IKernelGenerator
60   //      (all derivatives have the same implementation for this)
61   assert(!_return_fn_seq);
62   _return_fn_seq = std::make_unique<exec::FunctionSequence>();
63   _return_fn_seq->enableDynamicShapeInferer(false);
64
65   _current_op_seq_layout = op_seq.getLayout();
66   for (const auto &operation_idx : op_seq.operations())
67   {
68     const auto &node = _operations_ctx.at(operation_idx);
69     node.accept(*this);
70     _return_fn_seq->append(releaseFunction());
71   }
72 }
73
74 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
75 {
76   const auto ofm_index{node.getOutputs().at(0)};
77   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
78   const auto block_size_index{
79       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
80
81   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
82   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
83   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
84
85   assert(_ctx.at(block_size_index).data());
86
87   auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
88       ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
89
90   _return_fn = asAclFunction(std::move(fn));
91 }
92
93 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
94 {
95   const auto ofm_index{node.getOutputs().at(0)};
96   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
97   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
98
99   const auto activation = node.param().activation;
100
101   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
102   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
103   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
104
105   const auto act_info = acl_common::asActivationLayerInfo(activation);
106
107   std::unique_ptr<arm_compute::IFunction> fn;
108   switch (node.param().arithmetic_type)
109   {
110     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
111     {
112       fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
113           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
114           arm_compute::ConvertPolicy::SATURATE, act_info);
115       break;
116     }
117     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
118     {
119       fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
120           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
121           arm_compute::ConvertPolicy::SATURATE, act_info);
122       break;
123     }
124     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
125     {
126       fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
127           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
128           arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
129           act_info);
130       break;
131     }
132     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
133     {
134       fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
135           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
136       break;
137     }
138     default:
139       assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
140       break;
141   }
142
143   _return_fn = asAclFunction(std::move(fn));
144 }
145
146 void KernelGenerator::visit(const ir::operation::Conv2D &node)
147 {
148   using ir::operation::Conv2D;
149
150   const auto ofm_index{node.getOutputs().at(0)};
151   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
152   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
153   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
154
155   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
156   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
157   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
158   const auto &ker_shape = _ctx.at(ker_index).shape();
159   const auto ker_height = ker_shape.dim(1);
160   const auto ker_width = ker_shape.dim(2);
161
162   const auto stride = node.param().stride;
163   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
164                                             ker_width, ker_height);
165   const auto activation = node.param().activation;
166
167   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
168   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
169   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
170   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
171
172   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
173   const auto act_info = acl_common::asActivationLayerInfo(activation);
174
175   auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
176       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
177       ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
178       ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
179
180   _return_fn = asAclFunction(std::move(fn));
181 }
182
183 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
184 {
185   using ir::operation::DepthwiseConv2D;
186
187   const auto ofm_index{node.getOutputs().at(0)};
188   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
189   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
190   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
191
192   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
193   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
194   // Kernel format is [1, kernel_height, kernel_width, depth_out].
195   const auto &ker_shape = _ctx.at(ker_index).shape();
196   const auto ker_height = ker_shape.dim(1);
197   const auto ker_width = ker_shape.dim(2);
198
199   const auto stride = node.param().stride;
200   const auto dilation = node.param().dilation;
201   const auto padding =
202       ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
203                            ker_height, dilation.width_factor, dilation.height_factor);
204   const auto multiplier = node.param().multiplier;
205   const auto activation = node.param().activation;
206
207   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
208   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
209   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
210   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
211
212   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
213   const auto act_info = acl_common::asActivationLayerInfo(activation);
214   const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
215
216   auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
217       ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
218       conv_info, multiplier, act_info, dilation_info);
219
220   _return_fn = asAclFunction(std::move(fn));
221 }
222
223 void KernelGenerator::visit(const ir::operation::Concat &node)
224 {
225   const auto ofm_index{node.getOutputs().at(0)};
226
227   std::vector<ir::OperandIndex> input_indexes;
228
229   for (const auto &input : node.getInputs())
230     input_indexes.emplace_back(input);
231
232   const auto axis = node.param().axis;
233
234   // Concat elimination check
235   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
236   if (eliminated)
237   {
238     // If concat eliminated, return a NOP IFunction
239     VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
240     _return_fn = std::make_unique<exec::NopFunction>();
241     return;
242   }
243
244   auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
245   std::vector<::arm_compute::ICLTensor *> input_tensors;
246   for (auto &ifm_ind : input_indexes)
247     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
248
249   std::unique_ptr<::arm_compute::IFunction> fn;
250   if (input_indexes.size() < 2)
251   {
252     fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensors.at(0),
253                                                         output_tensor->handle());
254   }
255   else
256   {
257     const auto rank = _ctx.at(ofm_index).shape().rank();
258     const auto frontend_layout = _current_op_seq_layout;
259     const auto backend_layout = output_tensor->layout();
260     const auto fixed_axis =
261         acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
262     fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
263         input_tensors, output_tensor->handle(), fixed_axis);
264   }
265
266   _return_fn = asAclFunction(std::move(fn));
267 }
268
269 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
270 {
271   const auto output_index{node.getOutputs().at(0)};
272   auto output_tensor = _tensor_reg->getAclTensor(output_index);
273   const auto activation = node.param().activation;
274   if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
275     throw std::runtime_error(
276         "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");
277
278   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
279                                                 ::arm_compute::CLFullyConnectedReshapingLayer>(
280       node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout);
281   _return_fn = std::make_unique<exec::FunctionSequence>(
282       std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
283 }
284
285 void KernelGenerator::visit(const ir::operation::Reduce &node)
286 {
287   const auto output_index{node.getOutputs().at(0)};
288   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
289   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
290   const auto keep_dims{node.param().keep_dims};
291   const auto reduce_type = node.param().reduce_type;
292
293   auto output_tensor = _tensor_reg->getAclTensor(output_index);
294   auto input_tensor = _tensor_reg->getAclTensor(input_index);
295
296   // Convert to ACL axes taking into account negative values and possible duplicates.
297   const auto &axes = _ctx.at(axes_index);
298   const auto input_rank = _ctx.at(input_index).shape().rank();
299   const auto frontend_layout = _current_op_seq_layout;
300   const auto backend_layout = input_tensor->layout();
301
302   std::unique_ptr<arm_compute::IFunction> fn;
303   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
304   {
305     const auto acl_axes =
306         acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
307     fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
308                                                               keep_dims, output_tensor->handle());
309   }
310   else
311   {
312     const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
313
314     fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
315         _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
316         output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
317   }
318
319   _return_fn = asAclFunction(std::move(fn));
320 }
321
322 void KernelGenerator::visit(const ir::operation::Reshape &node)
323 {
324   const auto output_index{node.getOutputs().at(0)};
325   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
326
327   auto output_tensor = _tensor_reg->getAclTensor(output_index);
328   auto input_tensor = _tensor_reg->getAclTensor(input_index);
329
330   // NOTE This operation must not be changed the layout from frontend to backend
331   //      So, PermutationOperationPass makes layouts of frontend and backend the same.
332   const auto frontend_layout = _current_op_seq_layout;
333   const auto backend_layout = output_tensor->layout();
334   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
335          frontend_layout == backend_layout);
336   UNUSED_RELEASE(frontend_layout);
337   UNUSED_RELEASE(backend_layout);
338
339   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
340                                                                    output_tensor->handle());
341
342   _return_fn = asAclFunction(std::move(fn));
343 }
344
345 void KernelGenerator::visit(const ir::operation::Squeeze &node)
346 {
347   // Squeeze is identical to reshape except that it has an optional dimensions input.
348   // In addition, optional dims_index is ignored since output tensor already has squeezed shape
349   // by freezer and toco
350   // TODO Support multi-layout for frontend and backend
351   const auto output_index{node.getOutputs().at(0)};
352   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
353   const auto dims{node.param().dims};
354   const auto ndim{node.param().ndim};
355   (void)dims;
356   (void)ndim;
357
358   auto output_tensor = _tensor_reg->getAclTensor(output_index);
359   auto input_tensor = _tensor_reg->getAclTensor(input_index);
360   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
361                                                                    output_tensor->handle());
362   _return_fn = asAclFunction(std::move(fn));
363 }
364
365 void KernelGenerator::visit(const ir::operation::Softmax &node)
366 {
367   const auto output_index{node.getOutputs().at(0)};
368   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
369
370   const auto beta = node.param().beta;
371
372   auto output_tensor = _tensor_reg->getAclTensor(output_index);
373   auto input_tensor = _tensor_reg->getAclTensor(input_index);
374
375   auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
376       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
377       output_tensor->handle(), beta);
378
379   _return_fn = asAclFunction(std::move(fn));
380 }
381
382 void KernelGenerator::visit(const ir::operation::Slice &node)
383 {
384   const auto output_index{node.getOutputs().at(0)};
385   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
386   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
387   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
388
389   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
390   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
391   const auto frontend_layout = _current_op_seq_layout;
392   const auto backend_layout = inputData_tensor->layout();
393
394   // Set initializers for indices data such as order of inputData
395   int input_rank = _ctx.at(input_index).shape().rank();
396   std::vector<int32_t> starts;
397   std::vector<int32_t> ends;
398   starts.resize(input_rank, 0);
399   ends.resize(input_rank, 0);
400   {
401     assert(_ctx.at(begins_index).data());
402     assert(_ctx.at(sizes_index).data());
403     auto beginData_base = _ctx.at(begins_index).data()->base();
404     auto sizeData_base = _ctx.at(sizes_index).data()->base();
405     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
406     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
407
408     using ir::DataType;
409
410     UNUSED_RELEASE(beginData_size);
411     UNUSED_RELEASE(sizeData_size);
412
413     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
414     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
415     assert(beginData_size == input_rank);
416     assert(sizeData_size == input_rank);
417
418     assert(beginData_base != nullptr);
419     for (int n = 0; n < input_rank; ++n)
420     {
421       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
422                                                                  backend_layout)
423                       .value();
424
425       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
426       starts[axis] = begin_value;
427
428       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
429       ends[axis] = begin_value + size_value;
430     }
431   }
432
433   ::arm_compute::Coordinates starts_set;
434   ::arm_compute::Coordinates ends_set;
435
436   for (size_t i = 0; i < starts.size(); ++i)
437   {
438     starts_set.set(i, starts[i]);
439     ends_set.set(i, ends[i]);
440   }
441
442   auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
443       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
444
445   _return_fn = asAclFunction(std::move(fn));
446 }
447
448 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
449 {
450   const auto output_index{node.getOutputs().at(0)};
451   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
452   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
453   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
454   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
455
456   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
457   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
458   const auto frontend_layout = _current_op_seq_layout;
459   const auto backend_layout = inputData_tensor->layout();
460
461   // Set initializers for indices data such as order of inputData
462   int input_rank = _ctx.at(input_index).shape().rank();
463   std::vector<int32_t> starts;
464   std::vector<int32_t> ends;
465   std::vector<int32_t> strides;
466   starts.resize(input_rank, 0);
467   ends.resize(input_rank, 0);
468   strides.resize(input_rank, 0);
469   {
470     assert(_ctx.at(starts_index).data());
471     assert(_ctx.at(ends_index).data());
472     assert(_ctx.at(strides_index).data());
473     auto startData_base = _ctx.at(starts_index).data()->base();
474     auto endData_base = _ctx.at(ends_index).data()->base();
475     auto stridesData_base = _ctx.at(strides_index).data()->base();
476     const int startData_size = _ctx.at(starts_index).shape().num_elements();
477     const int endData_size = _ctx.at(ends_index).shape().num_elements();
478     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
479
480     using ir::DataType;
481
482     UNUSED_RELEASE(startData_size);
483     UNUSED_RELEASE(endData_size);
484     UNUSED_RELEASE(stridesData_size);
485
486     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
487     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
488     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
489     assert(startData_size == input_rank);
490     assert(endData_size == input_rank);
491     assert(stridesData_size == input_rank);
492
493     assert(startData_base != nullptr);
494     for (int n = 0; n < input_rank; ++n)
495     {
496       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
497                                                                  backend_layout)
498                       .value();
499
500       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
501       starts[axis] = start_value;
502
503       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
504       ends[axis] = end_value;
505
506       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
507       strides[axis] = strides_value;
508     }
509   }
510
511   // Set mask bits such as order of inputData
512   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
513                                                            frontend_layout, backend_layout);
514   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
515                                                          frontend_layout, backend_layout);
516   const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
517       node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
518
519   ::arm_compute::Coordinates starts_set;
520   ::arm_compute::Coordinates ends_set;
521   ::arm_compute::BiStrides strides_set;
522
523   for (size_t i = 0; i < starts.size(); ++i)
524   {
525     starts_set.set(i, starts[i]);
526     ends_set.set(i, ends[i]);
527     strides_set.set(i, strides[i]);
528   }
529
530   // Disable applied dim_correction
531   if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
532   {
533     // This means that high dimension's value is 1 and input tensor is applied dim_correction
534     acl_common::disableDimCorrection(inputData_tensor);
535   }
536
537   auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
538       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
539       begin_mask, end_mask, shrink_axis_mask);
540
541   // Revert disabling applied dim_correction
542   if (inputData_tensor->dimension(0) == 1)
543   {
544     acl_common::enableDimCorrection(inputData_tensor);
545   }
546
547   _return_fn = asAclFunction(std::move(fn));
548 }
549
550 void KernelGenerator::visit(const ir::operation::Transpose &node)
551 {
552   const auto ofm_idx{node.getOutputs().at(0)};
553   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
554   const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
555
556   const auto rank = _ctx.at(ifm_idx).shape().rank();
557
558   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
559   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
560   const auto frontend_layout = _current_op_seq_layout;
561   const auto backend_layout = ifm_tensor->layout();
562
563   const auto &perms = _ctx.at(perm_idx);
564   std::vector<int32_t> pv;
565   if (perms.shape() == ir::Shape{0})
566   {
567     pv.resize(rank);
568     std::iota(pv.begin(), pv.end(), 0);
569     std::reverse(pv.begin(), pv.end());
570   }
571   else
572   {
573     pv = _ctx.at(perm_idx).asVector<int32_t>();
574   }
575
576   std::unique_ptr<arm_compute::IFunction> fn;
577   if (rank == 1)
578   {
579     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
580   }
581   else if (rank == 2)
582   {
583     assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
584     fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(),
585                                                              ofm_tensor->handle());
586   }
587   else
588   {
589     auto backend_pv =
590         acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
591
592     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
593                                                            ofm_tensor->handle(), backend_pv);
594   }
595
596   _return_fn = asAclFunction(std::move(fn));
597 }
598
599 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
600 {
601   const auto ofm_index{node.getOutputs().at(0)};
602   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
603
604   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
605   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
606
607   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
608       node.param().op_type, node.param().alpha, node.param().beta);
609
610   auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
611       ifm_tensor->handle(), ofm_tensor->handle(), act_info);
612
613   _return_fn = asAclFunction(std::move(fn));
614 }
615
616 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
617 {
618   const auto output_index{node.getOutputs().at(0)};
619   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
620   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
621
622   auto output_tensor = _tensor_reg->getAclTensor(output_index);
623   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
624   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
625
626   std::unique_ptr<arm_compute::IFunction> fn;
627   switch (node.param().op_type)
628   {
629     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
630     {
631       fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
632           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
633           arm_compute::BinaryLogicalOperation::AND);
634       break;
635     }
636     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
637     {
638       fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
639           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
640       break;
641     }
642     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
643     {
644       fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
645           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
646       break;
647     }
648     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
649     {
650       fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
651           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
652       break;
653     }
654     default:
655     {
656       std::string err_msg("acl_cl KernelGenerator : " + node.name() +
657                           "is not elementwise-binary operations");
658       assert(false && err_msg.c_str());
659       break;
660     }
661   }
662
663   _return_fn = asAclFunction(std::move(fn));
664 }
665
666 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
667 {
668   const auto output_index{node.getOutputs().at(0)};
669   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
670
671   auto output_tensor = _tensor_reg->getAclTensor(output_index);
672   auto input_tensor = _tensor_reg->getAclTensor(input_index);
673
674   std::unique_ptr<arm_compute::IFunction> fn;
675   switch (node.param().op_type)
676   {
677     case ir::operation::ElementwiseUnary::Type::ABS:
678     {
679       const ::arm_compute::ActivationLayerInfo act_info{
680           ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
681
682       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
683           input_tensor->handle(), output_tensor->handle(), act_info);
684       break;
685     }
686     case ir::operation::ElementwiseUnary::Type::CAST:
687     {
688       if (input_tensor->data_type() == output_tensor->data_type())
689       {
690         fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
691                                                             output_tensor->handle());
692       }
693       else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
694       {
695         fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(),
696                                                                 output_tensor->handle());
697       }
698       else
699       {
700         // TODO Support converting float to int32 as round down
701         fn = acl_common::generateLayer<arm_compute::CLCast>(
702             input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
703       }
704       break;
705     }
706     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
707     {
708       fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
709                                                                          output_tensor->handle());
710       break;
711     }
712     case ir::operation::ElementwiseUnary::Type::EXP:
713     {
714       fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
715                                                               output_tensor->handle());
716       break;
717     }
718     case ir::operation::ElementwiseUnary::Type::FLOOR:
719     {
720       fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
721                                                            output_tensor->handle());
722       break;
723     }
724     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
725     {
726       fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
727                                                                 output_tensor->handle());
728       break;
729     }
730     case ir::operation::ElementwiseUnary::Type::NEG:
731     {
732       fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
733                                                          output_tensor->handle());
734       break;
735     }
736     case ir::operation::ElementwiseUnary::Type::RSQRT:
737     {
738       fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
739                                                                 output_tensor->handle());
740       break;
741     }
742     case ir::operation::ElementwiseUnary::Type::SQRT:
743     {
744       const ::arm_compute::ActivationLayerInfo act_info{
745           ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
746
747       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
748           input_tensor->handle(), output_tensor->handle(), act_info);
749       break;
750     }
751     default:
752     {
753       throw std::runtime_error("acl_cl KernelGenerator : " + node.name() + "is not supported yet");
754       break;
755     }
756   }
757
758   auto acl_fn = asAclFunction(std::move(fn));
759
760   _return_fn = std::move(acl_fn);
761 }
762
763 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
764 {
765   const auto output_index{node.getOutputs().at(0)};
766   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
767
768   auto output_tensor = _tensor_reg->getAclTensor(output_index);
769   auto input_tensor = _tensor_reg->getAclTensor(input_index);
770
771   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
772                                                                    output_tensor->handle());
773
774   _return_fn = asAclFunction(std::move(fn));
775 }
776
777 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
778 {
779   const auto ofm_index{node.getOutputs().at(0)};
780   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
781   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
782   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
783
784   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
785   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
786   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
787   auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
788   auto epsilon = node.param().epsilon;
789   auto activation = node.param().activation;
790
791   auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
792       ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
793       epsilon);
794
795   _return_fn = std::make_unique<exec::FunctionSequence>(
796       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
797 }
798
799 void KernelGenerator::visit(const ir::operation::LSTM &node)
800 {
801   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
802                                          ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
803 }
804
805 void KernelGenerator::visit(const ir::operation::Comparison &node)
806 {
807   const auto output_index{node.getOutputs().at(0)};
808   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
809   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
810
811   const auto comparison_type = node.param().comparison_type;
812
813   auto output_tensor = _tensor_reg->getAclTensor(output_index);
814   auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
815   auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
816
817   auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
818       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
819       (arm_compute::ComparisonOperation)comparison_type);
820
821   _return_fn = asAclFunction(std::move(fn));
822 }
823
824 void KernelGenerator::visit(const ir::operation::OneHot &node)
825 {
826   const auto output_idx{node.getOutputs().at(0)};
827   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
828   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
829   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
830   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
831   const auto depth = _ctx.at(depth_idx).asScalar<int32_t>();
832   assert(depth > 0);
833
834   auto output_tensor = _tensor_reg->getAclTensor(output_idx);
835   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
836   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
837
838   const size_t output_rank = _ctx.at(output_idx).shape().rank();
839   const auto frontend_layout = _current_op_seq_layout;
840   const auto backend_layout = output_tensor->layout();
841   int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
842   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
843
844   if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
845   {
846     // This means that high dimension's value is 1 and output_tensor is applied dim_correction
847     acl_common::disableDimCorrection(output_tensor);
848   }
849
850   std::unique_ptr<::arm_compute::IFunction> fn;
851   const auto &offvalue = _ctx.at(offvalue_idx);
852   if (offvalue.isConstant())
853   {
854     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
855         indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
856         acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
857   }
858   else
859   {
860     auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
861     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
862         indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
863         output_tensor->handle(), static_cast<uint32_t>(depth), axis);
864   }
865
866   if (output_tensor->dimension(0) == 1)
867   {
868     acl_common::enableDimCorrection(output_tensor);
869   }
870
871   _return_fn = asAclFunction(std::move(fn));
872 }
873
874 void KernelGenerator::visit(const ir::operation::Pack &node)
875 {
876   const auto output_index{node.getOutputs().at(0)};
877   auto axis{node.param().axis};
878
879   const auto output_rank = _ctx.at(output_index).shape().rank();
880
881   std::vector<ir::OperandIndex> input_indexes;
882   for (const auto &input_index : node.getInputs())
883     input_indexes.emplace_back(input_index);
884
885   auto output = _tensor_reg->getAclTensor(output_index)->handle();
886   std::vector<arm_compute::ICLTensor *> inputs;
887   for (const auto &input_index : input_indexes)
888     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
889
890   const auto frontend_layout = _current_op_seq_layout;
891   const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
892
893   if (axis < 0)
894     axis += output_rank;
895   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
896
897   // Disable applied dim_correction
898   for (const auto &input_index : input_indexes)
899   {
900     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
901     if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
902     {
903       // This means that high dimension's value is 1 and input tensor is applied dim_correction
904       acl_common::disableDimCorrection(input_tensor);
905     }
906   }
907
908   auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);
909
910   // Revert disabling applied dim_correction
911   for (const auto &input_index : input_indexes)
912   {
913     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
914     if (input_tensor->dimension(0) == 1)
915     {
916       acl_common::enableDimCorrection(input_tensor);
917     }
918   }
919
920   _return_fn = asAclFunction(std::move(fn));
921 }
922
923 void KernelGenerator::visit(const ir::operation::Pool2D &node)
924 {
925   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
926       node, _ctx, _tensor_reg, _current_op_seq_layout,
927       acl_common::convertPoolType(node.param().op_type));
928
929   const auto ofm_index{node.getOutputs().at(0)};
930   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
931   const auto activation = node.param().activation;
932   _return_fn = std::make_unique<exec::FunctionSequence>(
933       asAclFunction(std::move(raw_fn)),
934       ActivationBuilder::generate(activation, ofm_tensor->handle()));
935 }
936
937 void KernelGenerator::visit(const ir::operation::Permute &node)
938 {
939   const auto ofm_idx{node.getOutputs().at(0)};
940   const auto ifm_idx{node.getInputs().at(0)};
941   const auto permute_type = node.getPermuteType();
942   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
943   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
944   const auto rank = _ctx.at(ofm_idx).shape().rank();
945   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
946
947   std::unique_ptr<::arm_compute::IFunction> fn;
948   arm_compute::PermutationVector pv;
949   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
950   {
951     // WHCN -> CWHN
952     pv = arm_compute::PermutationVector{2, 0, 1};
953
954     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
955                                                            ofm_tensor->handle(), pv);
956   }
957   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
958   {
959     // CWHN -> WHCN
960     pv = arm_compute::PermutationVector{1, 2, 0};
961
962     fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
963                                                              ofm_tensor->handle(), pv);
964   }
965   else
966   {
967     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
968   }
969
970   _return_fn = asAclFunction(std::move(fn));
971 }
972
973 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
974 {
975   const auto ofm_index{node.getOutputs().at(0)};
976   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
977
978   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
979   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
980
981   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
982       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
983       ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
984       ::arm_compute::SamplingPolicy::TOP_LEFT);
985
986   _return_fn = asAclFunction(std::move(fn));
987 }
988
989 void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
990 {
991   const auto ofm_index{node.getOutputs().at(0)};
992   const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};
993
994   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
995   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
996
997   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
998       ifm_tensor->handle(), ofm_tensor->handle(),
999       ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
1000       ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
1001
1002   _return_fn = asAclFunction(std::move(fn));
1003 }
1004
1005 void KernelGenerator::visit(const ir::operation::RNN &node)
1006 {
1007   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
1008   const auto hidden_state_out_index{
1009       node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
1010
1011   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
1012   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
1013   const auto recurrent_weights_index{
1014       node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
1015   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
1016   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
1017
1018   const auto activation = node.param().activation;
1019
1020   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1021   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
1022
1023   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1024   auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
1025   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
1026   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
1027   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
1028   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
1029
1030   auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
1031       hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
1032   _return_fn = asAclFunction(std::move(copy_layer));
1033
1034   auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
1035       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
1036       weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
1037       hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
1038   _return_fn = asAclFunction(std::move(fn));
1039 }
1040
1041 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
1042 {
1043   const auto ofm_index{node.getOutputs().at(0)};
1044   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
1045   const auto block_size_index{
1046       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
1047   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
1048
1049   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1050   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1051   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
1052   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
1053
1054   assert(_ctx.at(block_size_index).data());
1055   assert(_ctx.at(paddings_index).data());
1056
1057   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
1058       ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1059       ofm_tensor->handle());
1060
1061   _return_fn = asAclFunction(std::move(fn));
1062 }
1063
1064 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1065 {
1066   const auto ofm_index{node.getOutputs().at(0)};
1067   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1068
1069   auto block_size = node.param().block_size;
1070
1071   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1072   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1073
1074   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
1075       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1076
1077   _return_fn = asAclFunction(std::move(fn));
1078 }
1079
1080 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
1081 {
1082   const auto output_index{node.getOutputs().at(0)};
1083   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
1084   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
1085
1086   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1087   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1088   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1089
1090   auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
1091       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
1092
1093   _return_fn = asAclFunction(std::move(fn));
1094 }
1095
1096 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
1097 {
1098   const auto ofm_index{node.getOutputs().at(0)};
1099   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
1100
1101   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
1102   // L2 Normalization always performs the reduction along the depth axis
1103   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
1104   // choosing normalization parameters as below
1105
1106   const auto &ifm_shape = _ctx.at(ifm_index).shape();
1107   // TODO Support optional constant dimension that normalization would be performed on
1108   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
1109   int32_t radius =
1110       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
1111   float alpha = 1.0f;                            // In the implementation to make alpha_ become 1
1112   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
1113   float bias = 0.0f;                             // Don't offset the reduction.
1114
1115   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1116   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1117
1118   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
1119                                                                radius, alpha, beta, bias, false);
1120
1121   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1122       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1123
1124   _return_fn = asAclFunction(std::move(fn));
1125 }
1126
1127 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
1128 {
1129   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
1130   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
1131
1132   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
1133   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
1134   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
1135
1136   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1137   auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
1138
1139   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1140   auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
1141   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1142
1143   auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
1144       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
1145       output_tensor->handle(), hits_tensor->handle());
1146
1147   _return_fn = asAclFunction(std::move(fn));
1148 }
1149
1150 void KernelGenerator::visit(const ir::operation::PReLU &node)
1151 {
1152   const auto ofm_index{node.getOutputs().at(0)};
1153   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
1154   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
1155
1156   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1157   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1158   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
1159
1160   auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
1161       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
1162
1163   _return_fn = asAclFunction(std::move(fn));
1164 }
1165
1166 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1167 {
1168   const auto ofm_index{node.getOutputs().at(0)};
1169   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1170   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1171
1172   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
1173   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
1174   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);
1175
1176   const auto stride = node.param().stride;
1177
1178   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1179          (node.param().padding.type == ir::PaddingType::VALID));
1180   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1181                                       ker_shape.W, ker_shape.H);
1182   uint32_t invalid_horizontal = 0;
1183   uint32_t invalid_vertical = 0;
1184   if (node.param().padding.type == ir::PaddingType::VALID)
1185   {
1186     invalid_horizontal =
1187         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1188     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1189   }
1190
1191   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1192   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1193   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1194
1195   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1196
1197   auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
1198       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
1199       ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
1200       invalid_vertical);
1201
1202   _return_fn = asAclFunction(std::move(fn));
1203 }
1204
1205 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1206 {
1207   const auto ofm_index{node.getOutputs().at(0)};
1208   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1209   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1210
1211   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1212   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1213   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1214
1215   auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
1216       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1217
1218   _return_fn = asAclFunction(std::move(fn));
1219 }
1220
1221 void KernelGenerator::visit(const ir::operation::TopKV2 &node)
1222 {
1223   const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
1224   const auto outputIndices_index{
1225       node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};
1226
1227   const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};
1228
1229   // Currently, we only support the vector input.
1230   assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1231          _ctx.at(inputData_index).shape().rank() == 2);
1232
1233   const auto k = node.param().k;
1234
1235   auto values_tensor = _tensor_reg->getAclTensor(outputValues_index);
1236   auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index);
1237   auto input_tensor = _tensor_reg->getAclTensor(inputData_index);
1238
1239   auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
1240       input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
1241
1242   _return_fn = asAclFunction(std::move(fn));
1243 }
1244
1245 void KernelGenerator::visit(const ir::operation::Gather &node)
1246 {
1247   const auto ofm_index{node.getOutputs().at(0)};
1248
1249   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
1250   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
1251
1252   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1253   const auto axis_raw = node.param().axis;
1254   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
1255   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
1256
1257   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1258   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1259   auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
1260
1261   // NOTE The frontend layout and backend layout must be the same for this operation.
1262   //      If they differ, an extra stage would be needed to permute the output tensor, which is
1263   //      inefficient even if it works. In that case it would be better to give these backend
1264   //      tensors the same layout as the frontend.
1265   //      There is one more thing to consider: this operation depends on the layout of the model.
1266   //      For example, if an NHWC model has this operation with output rank == 4, indices
1267   //      rank == 2 and axis == 2, it gathers along the W and C axes, but W and C are not
1268   //      contiguous in NCHW. So an NCHW backend cannot handle this case.
1269   const auto backend_layout = ofm_tensor->layout();
1270   UNUSED_RELEASE(backend_layout);
1271   assert(backend_layout == ifm_tensor->layout());
1272   assert(backend_layout == indices_tensor->layout());
1273   assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
1274
1275   // input is n-D, indices k-D, output is (n + k - 1)-D
1276   size_t n = ifm_rank;
1277   assert(n == ifm_tensor->num_dimensions());
1278   size_t k = _ctx.at(indices_index).shape().rank();
1279   assert(k == indices_tensor->num_dimensions());
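  // e.g. with hypothetical shapes: input [4, 3] (n = 2), indices [2, 5] (k = 2) and axis = 0
  // produce an output of shape [2, 5, 3], i.e. rank n + k - 1 = 3.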
1280
1281   // Disable applied dim_correction
1282   if (n != ifm_tensor->info()->num_dimensions())
1283   {
1284     // The highest dimension's value is 1 and dim_correction has been applied to the ifm tensor
1285     acl_common::disableDimCorrection(ifm_tensor);
1286   }
1287   if (k != indices_tensor->info()->num_dimensions())
1288   {
1289     // The highest dimension's value is 1 and dim_correction has been applied to the indices tensor
1290     acl_common::disableDimCorrection(indices_tensor);
1291   }
1292
1293   auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
1294       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
1295
1296   // Revert the dim_correction disabled above
1297   if (ifm_tensor->dimension(0) == 1)
1298   {
1299     acl_common::enableDimCorrection(ifm_tensor);
1300   }
1301   if (indices_tensor->dimension(0) == 1)
1302   {
1303     acl_common::enableDimCorrection(indices_tensor);
1304   }
1305
1306   _return_fn = asAclFunction(std::move(fn));
1307 }
1308
1309 void KernelGenerator::visit(const ir::operation::ArgMax &node)
1310 {
1311   const auto ofm_index{node.getOutputs().at(0)};
1312   const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
1313   const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)};
1314
1315   auto ifm_shape = _ctx.at(ifm_index).shape();
1316   auto ofm_shape = _ctx.at(ofm_index).shape();
1317
1318   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1319
1320   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1321   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1322   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1323   auto frontend_layout = _current_op_seq_layout;
1324   auto backend_layout = ifm_tensor->layout();
1325
1326   int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
1327   if (axis_value < 0)
1328   {
1329     axis_value += ifm_rank;
1330   }
1331
1332   auto acl_axis =
1333       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
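  // Illustrative (hypothetical) case: for a rank-4 NHWC input with axis_value == 3 (C),
  // ToARMComputeAxis reverses the dimension order, so acl_axis becomes 0 when the frontend and
  // backend layouts match; a layout mismatch additionally permutes the axis.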
1334
1335   auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
1336       ifm_tensor->handle(), acl_axis, ofm_tensor->handle(),
1337       ::arm_compute::ReductionOperation::ARG_IDX_MAX);
1338
1339   _return_fn = asAclFunction(std::move(fn));
1340 }
1341
1342 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
1343 {
1344   const auto ofm_index{node.getOutputs().at(0)};
1345   const auto ifm_index{
1346       node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
1347
1348   auto radius = node.param().radius;
1349   auto alpha = node.param().alpha;
1350   auto beta = node.param().beta;
1351   auto bias = node.param().bias;
1352
1353   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1354   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1355
1356   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1357       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
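  // The NNAPI/TFLite 'radius' counts neighbours on each side of a channel, so the ACL window size
  // is radius * 2 + 1 (e.g. a hypothetical radius of 2 gives a 5-channel cross-map window). The
  // trailing 'false' is the is_scaled flag, so alpha is used as-is rather than scaled by the
  // window size.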
1358
1359   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1360       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1361
1362   _return_fn = asAclFunction(std::move(fn));
1363 }
1364
1365 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
1366 {
1367   const auto output_index{node.getOutputs().at(0)};
1368   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
1369
1370   auto block_size = node.param().block_size;
1371   assert(block_size > 0);
1372
1373   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1374   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1375
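  // DepthToSpace moves depth values into spatial blocks; e.g. a hypothetical [1, 1, 1, 4] NHWC
  // input with block_size = 2 becomes a [1, 2, 2, 1] output.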
1376   auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
1377       input_tensor->handle(), output_tensor->handle(), block_size);
1378
1379   _return_fn = asAclFunction(std::move(fn));
1380 }
1381
1382 void KernelGenerator::visit(const ir::operation::Split &node)
1383 {
1384   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1385   const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
1386
1387   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1388   if (!_ctx.at(axis_index).isConstant())
1389   {
1390     throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend");
1391   }
1392
1393   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1394   std::vector<ir::OperandIndex> output_indexes;
1395   for (const auto &output : node.getOutputs())
1396     output_indexes.emplace_back(output);
1397
1398   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1399   std::vector<arm_compute::ICLTensor *> output_tensors;
1400   for (const auto &ofm_ind : output_indexes)
1401     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1402
1403   const auto frontend_layout = _current_op_seq_layout;
1404   const auto backend_layout = ifm_tensor->layout();
1405   auto axis = _ctx.at(axis_index).asScalar<int32_t>();
1406   if (axis < 0)
1407     axis += ifm_rank;
1408   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
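  // For illustration: a hypothetical axis of -1 on a rank-4 input is normalized to 3 above and
  // then mapped to the corresponding (reversed) ACL axis by ToARMComputeAxis.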
1409
1410   auto fn =
1411       acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
1412
1413   _return_fn = asAclFunction(std::move(fn));
1414 }
1415
1416 void KernelGenerator::visit(const ir::operation::SplitV &node)
1417 {
1418   const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
1419   const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
1420   const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};
1421
1422   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1423
1424   const size_t ifm_rank = _ctx.at(ifm_index).shape().rank();
1425   std::vector<ir::OperandIndex> output_indexes;
1426   for (const auto &output : node.getOutputs())
1427     output_indexes.emplace_back(output);
1428
1429   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1430   auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index);
1431
1432   std::vector<arm_compute::ICLTensor *> output_tensors;
1433   for (const auto &ofm_ind : output_indexes)
1434     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1435
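  // e.g. with hypothetical values: splitting an input of extent 10 along split_dim with
  // size_splits {3, 3, 4} yields three outputs of sizes 3, 3 and 4 along that dimension.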
1436   auto fn = std::make_unique<arm_compute::CLSplitVEx>();
1437   const auto &split_dim_op = _ctx.at(split_dim_index);
1438   if (split_dim_op.isConstant())
1439   {
1440     int32_t split_dim = split_dim_op.asScalar<int32_t>();
1441     uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
1442     const auto frontend_layout = _current_op_seq_layout;
1443     const auto backend_layout = ifm_tensor->layout();
1444
1445     if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
1446     {
1447       // The highest dimension's value is 1 and dim_correction has been applied to the ifm tensor
1448       acl_common::disableDimCorrection(ifm_tensor);
1449     }
1450
1451     split_dim_revised =
1452         acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
1453             .value();
1454     fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
1455                   output_tensors, node.param().num_splits);
1456
1457     if (ifm_tensor->dimension(0) == 1)
1458     {
1459       acl_common::enableDimCorrection(ifm_tensor);
1460     }
1461   }
1462   else
1463   {
1464     throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend");
1465   }
1466
1467   _return_fn = asAclFunction(std::move(fn));
1468 }
1469
1470 void KernelGenerator::visit(const ir::operation::Unpack &node)
1471 {
1472   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1473   auto axis{node.param().axis};
1474
1475   const auto input_rank = _ctx.at(input_index).shape().rank();
1476
1477   std::vector<ir::OperandIndex> output_indexes;
1478   for (const auto &output_index : node.getOutputs())
1479     output_indexes.emplace_back(output_index);
1480
1481   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1482   std::vector<arm_compute::ICLTensor *> outputs;
1483   for (const auto &output_index : output_indexes)
1484     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1485
1486   const auto frontend_layout = _current_op_seq_layout;
1487   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1488   if (axis < 0)
1489     axis += input_rank;
1490   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1491
1492   // Disable applied dim_correction
1493   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1494   {
1495     // The highest dimension's value is 1 and dim_correction has been applied to the input tensor
1496     acl_common::disableDimCorrection(input_tensor);
1497   }
1498
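  // e.g. unpacking a hypothetical [4, 3] input along axis 0 yields four outputs of shape [3].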
1499   auto fn =
1500       acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
1501
1502   // Revert the dim_correction disabled above
1503   if (input_tensor->dimension(0) == 1)
1504   {
1505     acl_common::enableDimCorrection(input_tensor);
1506   }
1507
1508   _return_fn = asAclFunction(std::move(fn));
1509 }
1510
1511 void KernelGenerator::visit(const ir::operation::Pad &node)
1512 {
1513   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
1514   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
1515   const auto output_index{node.getOutputs().at(0)};
1516   assert(_ctx.at(pad_index).data());
1517
1518   auto rank = _ctx.at(input_index).shape().rank();
1519   auto pad_base = _ctx.at(pad_index).data()->base();
1520
1521   auto input_type = _ctx.at(input_index).typeInfo();
1522   auto data_type = acl_common::asDataType(input_type.type());
1523   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
1524   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
1525
1526   auto input = _tensor_reg->getAclTensor(input_index)->handle();
1527   auto output = _tensor_reg->getAclTensor(output_index)->handle();
1528
1529   const auto frontend_layout = _current_op_seq_layout;
1530   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1531
1532   ::arm_compute::PaddingList padding_list;
1533   padding_list.resize(rank);
1534   for (int32_t n = 0; n < rank; ++n)
1535   {
1536     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
1537
1538     const auto axis =
1539         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
1540     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
1541   }
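  // e.g. for a hypothetical rank-2 input with pad data {1, 2, 3, 4}: dimension 0 is padded by
  // (1 before, 2 after) and dimension 1 by (3 before, 4 after), each pair stored at its swizzled
  // ACL axis.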
1542
1543   // Disable applied dim_correction
1544   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
1545   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1546   {
1547     // The highest dimension's value is 1 and dim_correction has been applied to the input tensor
1548     acl_common::disableDimCorrection(input_tensor);
1549   }
1550
1551   auto fn =
1552       acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
1553
1554   // NOTE Do not re-enable dim_correction for 4D tensors.
1555   // Doing so would produce mismatched results due to an incorrect offset_first_element in
1556   // ICLKernel::add_tensor_argument<3>().
1557   // dim_correction must stay disabled for kernels that slice 4D into 3D, because slicing the
1558   // arm_compute::Window can yield an incorrect offset_first_element when the tensor is 4D and
1559   // its highest dimension is 1.
1560   if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
1561   {
1562     acl_common::enableDimCorrection(input_tensor);
1563   }
1564
1565   _return_fn = asAclFunction(std::move(fn));
1566 }
1567
1568 void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
1569 {
1570   const auto ofm_index{node.getOutputs().at(0)};
1571   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
1572
1573   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1574   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1575
1576   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1577       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
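  // The trailing 0 is CLDepthConvertLayer's shift argument; a non-zero shift is only meaningful
  // for integer conversions, so it stays 0 for this float32 -> float16 cast.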
1578
1579   _return_fn = asAclFunction(std::move(fn));
1580 }
1581
1582 void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
1583 {
1584   const auto ofm_index{node.getOutputs().at(0)};
1585   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
1586
1587   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1588   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1589
1590   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1591       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1592
1593   _return_fn = asAclFunction(std::move(fn));
1594 }
1595
1596 void KernelGenerator::visit(const ir::operation::Reverse &node)
1597 {
1598   const auto ofm_index{node.getOutputs().at(0)};
1599   const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
1600   const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};
1601
1602   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1603   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1604   auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
1605
1606   // WORKAROUND: the acl_cl backend only allows the U32 type for axis;
1607   //             ConstantInitializer resolves the S32 type to U32.
1608   if (_ctx.at(axis_index).isConstant() &&
1609       (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
1610   {
1611     axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
1612   }
1613
1614   auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
1615       ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
1616
1617   _return_fn = asAclFunction(std::move(fn));
1618 }
1619
1620 } // namespace acl_cl
1621 } // namespace backend
1622 } // namespace onert