Imported Upstream version 1.9.0
[platform/core/ml/nnfw.git] / runtime / onert / backend / acl_cl / KernelGenerator.cc
1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "exec/FunctionSequence.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "AclKernelGen.h"
35
36 namespace onert
37 {
38 namespace backend
39 {
40 namespace acl_cl
41 {
42
43 using ::onert::backend::acl_common::asAclFunction;
44 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
45     ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
46
47 KernelGenerator::KernelGenerator(
48     const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
49     const std::shared_ptr<TensorBuilder> &tensor_builder,
50     const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
51     : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
52       _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN)
53 {
54   // DO NOTHING
55 }
56
57 void KernelGenerator::visit(const ir::OpSequence &op_seq)
58 {
59   // TODO Move this to IKernelGenerator
60   //      (all derivatives have the same implementation for this)
61   assert(!_return_fn_seq);
62   _return_fn_seq = std::make_unique<exec::FunctionSequence>();
63   _return_fn_seq->enableDynamicShapeInferer(false);
64
65   _current_op_seq_layout = op_seq.getLayout();
66   for (const auto &operation_idx : op_seq.operations())
67   {
68     const auto &node = _operations_ctx.at(operation_idx);
69     node.accept(*this);
70     _return_fn_seq->append(releaseFunction());
71   }
72 }
73
74 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
75 {
76   const auto ofm_index{node.getOutputs().at(0)};
77   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
78   const auto block_size_index{
79       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
80
81   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
82   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
83   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
84
85   assert(_ctx.at(block_size_index).data());
86
87   auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
88       ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
89
90   _return_fn = asAclFunction(std::move(fn));
91 }
92
93 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
94 {
95   const auto ofm_index{node.getOutputs().at(0)};
96   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
97   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
98
99   const auto activation = node.param().activation;
100
101   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
102   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
103   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
104
105   const auto act_info = acl_common::asActivationLayerInfo(activation);
106
107   std::unique_ptr<arm_compute::IFunction> fn;
108   switch (node.param().arithmetic_type)
109   {
110     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
111     {
112       fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
113           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
114           arm_compute::ConvertPolicy::SATURATE, act_info);
115       break;
116     }
117     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
118     {
119       fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
120           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
121           arm_compute::ConvertPolicy::SATURATE, act_info);
122       break;
123     }
124     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
125     {
126       fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
127           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
128           arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
129           act_info);
130       break;
131     }
132     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
133     {
134       fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
135           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
136       break;
137     }
138     default:
139       assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
140       break;
141   }
142
143   _return_fn = asAclFunction(std::move(fn));
144 }
145
146 void KernelGenerator::visit(const ir::operation::Conv2D &node)
147 {
148   using ir::operation::Conv2D;
149
150   const auto ofm_index{node.getOutputs().at(0)};
151   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
152   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
153   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
154
155   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
156   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
157   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
158   const auto &ker_shape = _ctx.at(ker_index).shape();
159   const auto ker_height = ker_shape.dim(1);
160   const auto ker_width = ker_shape.dim(2);
161
162   const auto stride = node.param().stride;
163   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
164                                             ker_width, ker_height);
165   const auto activation = node.param().activation;
166
167   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
168   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
169   auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
170   auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
171
172   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
173   const auto act_info = acl_common::asActivationLayerInfo(activation);
174
175   auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
176       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
177       ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
178       ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
179
180   _return_fn = asAclFunction(std::move(fn));
181 }
182
183 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
184 {
185   using ir::operation::DepthwiseConv2D;
186
187   const auto ofm_index{node.getOutputs().at(0)};
188   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
189   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
190   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
191
192   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
193   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
194   // Kernel format is [1, kernel_height, kernel_width, depth_out].
195   const auto &ker_shape = _ctx.at(ker_index).shape();
196   const auto ker_height = ker_shape.dim(1);
197   const auto ker_width = ker_shape.dim(2);
198
199   const auto stride = node.param().stride;
200   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
201                                             ker_width, ker_height);
202   const auto multiplier = node.param().multiplier;
203   const auto activation = node.param().activation;
204
205   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
206   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
207   auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
208   auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
209
210   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
211   const auto act_info = acl_common::asActivationLayerInfo(activation);
212
213   {
214     auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
215         ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
216         conv_info, multiplier, act_info);
217
218     _return_fn = asAclFunction(std::move(fn));
219   }
220 }
221
222 void KernelGenerator::visit(const ir::operation::Concat &node)
223 {
224   const auto ofm_index{node.getOutputs().at(0)};
225
226   std::vector<ir::OperandIndex> input_indexes;
227
228   for (const auto &input : node.getInputs())
229     input_indexes.emplace_back(input);
230
231   const auto axis = node.param().axis;
232
233   // Concat elimination check
234   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
235   if (eliminated)
236   {
237     // If concat eliminated, return a NOP IFunction
238     VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
239     _return_fn = std::make_unique<exec::NopFunction>();
240     return;
241   }
242
243   auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
244   std::vector<::arm_compute::ICLTensor *> input_tensors;
245   for (auto &ifm_ind : input_indexes)
246     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
247
248   std::unique_ptr<::arm_compute::IFunction> fn;
249   if (input_indexes.size() < 2)
250   {
251     fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensors.at(0),
252                                                         output_tensor->handle());
253   }
254   else
255   {
256     const auto rank = _ctx.at(ofm_index).shape().rank();
257     const auto frontend_layout = _current_op_seq_layout;
258     const auto backend_layout = output_tensor->layout();
259     const auto fixed_axis =
260         acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
261     fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
262         input_tensors, output_tensor->handle(), fixed_axis);
263   }
264
265   _return_fn = asAclFunction(std::move(fn));
266 }
267
268 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
269 {
270   const auto output_index{node.getOutputs().at(0)};
271   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
272   const auto activation = node.param().activation;
273
274   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
275                                                 ::arm_compute::CLFullyConnectedReshapingLayer>(
276       node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout);
277   _return_fn = std::make_unique<exec::FunctionSequence>(
278       std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
279 }
280
281 void KernelGenerator::visit(const ir::operation::Reduce &node)
282 {
283   const auto output_index{node.getOutputs().at(0)};
284   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
285   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
286   const auto keep_dims{node.param().keep_dims};
287   const auto reduce_type = node.param().reduce_type;
288
289   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
290   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
291
292   // Convert to ACL axes taking into account negative values and possible duplicates.
293   const auto &axes = _ctx.at(axes_index);
294   const auto input_rank = _ctx.at(input_index).shape().rank();
295   const auto frontend_layout = _current_op_seq_layout;
296   const auto backend_layout = input_tensor->layout();
297
298   std::unique_ptr<arm_compute::IFunction> fn;
299   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
300   {
301     const auto acl_axes =
302         acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
303     fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
304                                                               keep_dims, output_tensor->handle());
305   }
306   else
307   {
308     const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
309
310     fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
311         _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
312         output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
313   }
314
315   _return_fn = asAclFunction(std::move(fn));
316 }
317
318 void KernelGenerator::visit(const ir::operation::Reshape &node)
319 {
320   const auto output_index{node.getOutputs().at(0)};
321   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
322
323   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
324   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
325
326   // NOTE This operation must not be changed the layout from frontend to backend
327   //      So, PermutationOperationPass makes layouts of frontend and backend the same.
328   const auto frontend_layout = _current_op_seq_layout;
329   const auto backend_layout = output_tensor->layout();
330   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
331          frontend_layout == backend_layout);
332   UNUSED_RELEASE(frontend_layout);
333   UNUSED_RELEASE(backend_layout);
334
335   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
336                                                                    output_tensor->handle());
337
338   _return_fn = asAclFunction(std::move(fn));
339 }
340
341 void KernelGenerator::visit(const ir::operation::Squeeze &node)
342 {
343   // Squeeze is identical to reshape except that it has an optional dimensions input.
344   // In addition, optional dims_index is ignored since output tensor already has squeezed shape
345   // by freezer and toco
346   // TODO Support multi-layout for frontend and backend
347   const auto output_index{node.getOutputs().at(0)};
348   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
349   const auto dims{node.param().dims};
350   const auto ndim{node.param().ndim};
351   (void)dims;
352   (void)ndim;
353
354   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
355   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
356   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
357                                                                    output_tensor->handle());
358   _return_fn = asAclFunction(std::move(fn));
359 }
360
361 void KernelGenerator::visit(const ir::operation::Softmax &node)
362 {
363   const auto output_index{node.getOutputs().at(0)};
364   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
365
366   const auto beta = node.param().beta;
367
368   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
369   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
370
371   auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
372       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
373       output_tensor->handle(), beta);
374
375   _return_fn = asAclFunction(std::move(fn));
376 }
377
378 void KernelGenerator::visit(const ir::operation::Slice &node)
379 {
380   const auto output_index{node.getOutputs().at(0)};
381   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
382   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
383   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
384
385   auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
386   auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
387   const auto frontend_layout = _current_op_seq_layout;
388   const auto backend_layout = inputData_tensor->layout();
389
390   // Set initializers for indices data such as order of inputData
391   int input_rank = _ctx.at(input_index).shape().rank();
392   std::vector<int32_t> starts;
393   std::vector<int32_t> ends;
394   starts.resize(input_rank, 0);
395   ends.resize(input_rank, 0);
396   {
397     assert(_ctx.at(begins_index).data());
398     assert(_ctx.at(sizes_index).data());
399     auto beginData_base = _ctx.at(begins_index).data()->base();
400     auto sizeData_base = _ctx.at(sizes_index).data()->base();
401     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
402     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
403
404     using ir::DataType;
405
406     UNUSED_RELEASE(beginData_size);
407     UNUSED_RELEASE(sizeData_size);
408
409     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
410     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
411     assert(beginData_size == input_rank);
412     assert(sizeData_size == input_rank);
413
414     assert(beginData_base != nullptr);
415     for (int n = 0; n < input_rank; ++n)
416     {
417       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
418                                                                  backend_layout)
419                       .value();
420
421       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
422       starts[axis] = begin_value;
423
424       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
425       ends[axis] = begin_value + size_value;
426     }
427   }
428
429   ::arm_compute::Coordinates starts_set;
430   ::arm_compute::Coordinates ends_set;
431
432   for (size_t i = 0; i < starts.size(); ++i)
433   {
434     starts_set.set(i, starts[i]);
435     ends_set.set(i, ends[i]);
436   }
437
438   auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
439       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
440
441   _return_fn = asAclFunction(std::move(fn));
442 }
443
444 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
445 {
446   const auto output_index{node.getOutputs().at(0)};
447   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
448   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
449   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
450   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
451
452   auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
453   auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
454   const auto frontend_layout = _current_op_seq_layout;
455   const auto backend_layout = inputData_tensor->layout();
456
457   // Set initializers for indices data such as order of inputData
458   int input_rank = _ctx.at(input_index).shape().rank();
459   std::vector<int32_t> starts;
460   std::vector<int32_t> ends;
461   std::vector<int32_t> strides;
462   starts.resize(input_rank, 0);
463   ends.resize(input_rank, 0);
464   strides.resize(input_rank, 0);
465   {
466     assert(_ctx.at(starts_index).data());
467     assert(_ctx.at(ends_index).data());
468     assert(_ctx.at(strides_index).data());
469     auto startData_base = _ctx.at(starts_index).data()->base();
470     auto endData_base = _ctx.at(ends_index).data()->base();
471     auto stridesData_base = _ctx.at(strides_index).data()->base();
472     const int startData_size = _ctx.at(starts_index).shape().num_elements();
473     const int endData_size = _ctx.at(ends_index).shape().num_elements();
474     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
475
476     using ir::DataType;
477
478     UNUSED_RELEASE(startData_size);
479     UNUSED_RELEASE(endData_size);
480     UNUSED_RELEASE(stridesData_size);
481
482     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
483     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
484     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
485     assert(startData_size == input_rank);
486     assert(endData_size == input_rank);
487     assert(stridesData_size == input_rank);
488
489     assert(startData_base != nullptr);
490     for (int n = 0; n < input_rank; ++n)
491     {
492       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
493                                                                  backend_layout)
494                       .value();
495
496       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
497       starts[axis] = start_value;
498
499       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
500       ends[axis] = end_value;
501
502       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
503       strides[axis] = strides_value;
504     }
505   }
506
507   // Set mask bits such as order of inputData
508   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
509                                                            frontend_layout, backend_layout);
510   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
511                                                          frontend_layout, backend_layout);
512   const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
513       node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
514
515   ::arm_compute::Coordinates starts_set;
516   ::arm_compute::Coordinates ends_set;
517   ::arm_compute::BiStrides strides_set;
518
519   for (size_t i = 0; i < starts.size(); ++i)
520   {
521     starts_set.set(i, starts[i]);
522     ends_set.set(i, ends[i]);
523     strides_set.set(i, strides[i]);
524   }
525
526   auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
527       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
528       begin_mask, end_mask, shrink_axis_mask);
529
530   _return_fn = asAclFunction(std::move(fn));
531 }
532
533 void KernelGenerator::visit(const ir::operation::Transpose &node)
534 {
535   const auto ofm_idx{node.getOutputs().at(0)};
536   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
537   const auto &perm{node.param().perm};
538
539   const auto rank = _ctx.at(ifm_idx).shape().rank();
540
541   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
542   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
543   const auto frontend_layout = _current_op_seq_layout;
544   const auto backend_layout = ifm_tensor->layout();
545
546   std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
547   // Reversed
548   auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
549       rank, pv, frontend_layout, backend_layout);
550
551   auto fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
552                                                                 ofm_tensor->handle(), backend_pv);
553
554   _return_fn = asAclFunction(std::move(fn));
555 }
556
557 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
558 {
559   const auto ofm_index{node.getOutputs().at(0)};
560   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
561
562   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
563   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
564
565   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
566       node.param().op_type, node.param().alpha, node.param().beta);
567
568   auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
569       ifm_tensor->handle(), ofm_tensor->handle(), act_info);
570
571   _return_fn = asAclFunction(std::move(fn));
572 }
573
574 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
575 {
576   const auto output_index{node.getOutputs().at(0)};
577   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
578   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
579
580   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
581   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
582   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
583
584   std::unique_ptr<arm_compute::IFunction> fn;
585   switch (node.param().op_type)
586   {
587     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
588     {
589       fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
590           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
591           arm_compute::BinaryLogicalOperation::AND);
592       break;
593     }
594     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
595     {
596       fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
597           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
598       break;
599     }
600     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
601     {
602       fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
603           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
604       break;
605     }
606     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
607     {
608       fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
609           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
610       break;
611     }
612     default:
613     {
614       std::string err_msg("acl_cl KernelGenerator : " + node.name() +
615                           "is not elementwise-binary operations");
616       assert(false && err_msg.c_str());
617       break;
618     }
619   }
620
621   _return_fn = asAclFunction(std::move(fn));
622 }
623
624 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
625 {
626   const auto output_index{node.getOutputs().at(0)};
627   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
628
629   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
630   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
631
632   std::unique_ptr<arm_compute::IFunction> fn;
633   switch (node.param().op_type)
634   {
635     case ir::operation::ElementwiseUnary::Type::ABS:
636     {
637       const ::arm_compute::ActivationLayerInfo act_info{
638           ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
639
640       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
641           input_tensor->handle(), output_tensor->handle(), act_info);
642       break;
643     }
644     case ir::operation::ElementwiseUnary::Type::CAST:
645     {
646       if (input_tensor->data_type() == output_tensor->data_type())
647       {
648         fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
649                                                             output_tensor->handle());
650         ;
651       }
652       else
653       {
654         // TODO Support converting float to int32 as round down
655         fn = acl_common::generateLayer<arm_compute::CLCast>(
656             input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
657       }
658       break;
659     }
660     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
661     {
662       fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
663                                                                          output_tensor->handle());
664       break;
665     }
666     case ir::operation::ElementwiseUnary::Type::EXP:
667     {
668       fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
669                                                               output_tensor->handle());
670       break;
671     }
672     case ir::operation::ElementwiseUnary::Type::FLOOR:
673     {
674       fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
675                                                            output_tensor->handle());
676       break;
677     }
678     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
679     {
680       fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
681                                                                 output_tensor->handle());
682       break;
683     }
684     case ir::operation::ElementwiseUnary::Type::NEG:
685     {
686       fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
687                                                          output_tensor->handle());
688       break;
689     }
690     case ir::operation::ElementwiseUnary::Type::RSQRT:
691     {
692       fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
693                                                                 output_tensor->handle());
694       break;
695     }
696     case ir::operation::ElementwiseUnary::Type::SQRT:
697     {
698       const ::arm_compute::ActivationLayerInfo act_info{
699           ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
700
701       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
702           input_tensor->handle(), output_tensor->handle(), act_info);
703       break;
704     }
705     default:
706     {
707       throw std::runtime_error("acl_cl KernelGenerator : " + node.name() + "is not supported yet");
708       break;
709     }
710   }
711
712   auto acl_fn = asAclFunction(std::move(fn));
713
714   _return_fn = std::move(acl_fn);
715 }
716
717 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
718 {
719   const auto output_index{node.getOutputs().at(0)};
720   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
721
722   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
723   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
724
725   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
726                                                                    output_tensor->handle());
727
728   _return_fn = asAclFunction(std::move(fn));
729 }
730
731 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
732 {
733   const auto ofm_index{node.getOutputs().at(0)};
734   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
735   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
736   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
737
738   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
739   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
740   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get();
741   auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get();
742   auto epsilon = node.param().epsilon;
743   auto activation = node.param().activation;
744
745   auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
746       ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
747       epsilon);
748
749   _return_fn = std::make_unique<exec::FunctionSequence>(
750       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
751 }
752
753 void KernelGenerator::visit(const ir::operation::LSTM &node)
754 {
755   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
756                                          ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
757 }
758
759 void KernelGenerator::visit(const ir::operation::Comparison &node)
760 {
761   const auto output_index{node.getOutputs().at(0)};
762   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
763   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
764
765   const auto comparison_type = node.param().comparison_type;
766
767   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
768   auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get();
769   auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get();
770
771   auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
772       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
773       (arm_compute::ComparisonOperation)comparison_type);
774
775   _return_fn = asAclFunction(std::move(fn));
776 }
777
778 void KernelGenerator::visit(const ir::operation::Pack &node)
779 {
780   const auto output_index{node.getOutputs().at(0)};
781   auto axis{node.param().axis};
782
783   const auto output_rank = _ctx.at(output_index).shape().rank();
784
785   std::vector<ir::OperandIndex> input_indexes;
786   for (const auto &input_index : node.getInputs())
787     input_indexes.emplace_back(input_index);
788
789   auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
790   std::vector<arm_compute::ICLTensor *> inputs;
791   for (const auto &input_index : input_indexes)
792     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
793
794   const auto frontend_layout = _current_op_seq_layout;
795   const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout();
796
797   if (axis < 0)
798     axis += output_rank;
799   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
800
801   // Disable applied dim_correction
802   std::vector<arm_compute::TensorShape> orig_inputs_acl_tensor_shapes;
803   for (const auto &input_index : input_indexes)
804   {
805     size_t input_rank = _ctx.at(input_index).shape().rank();
806     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
807     orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape());
808     assert(input_rank == input_tensor->num_dimensions());
809     if (input_rank != input_tensor->info()->num_dimensions())
810     {
811       // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
812       input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
813           _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
814     }
815   }
816
817   auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);
818
819   // Revert disabling applied dim_correction
820   assert(inputs.size() == orig_inputs_acl_tensor_shapes.size());
821   for (size_t i = 0; i < inputs.size(); ++i)
822   {
823     inputs.at(i)->info()->set_tensor_shape(orig_inputs_acl_tensor_shapes.at(i));
824   }
825
826   _return_fn = asAclFunction(std::move(fn));
827 }
828
829 void KernelGenerator::visit(const ir::operation::Pool2D &node)
830 {
831   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
832       node, _ctx, _tensor_reg, _current_op_seq_layout,
833       acl_common::convertPoolType(node.param().op_type));
834
835   const auto ofm_index{node.getOutputs().at(0)};
836   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
837   const auto activation = node.param().activation;
838   _return_fn = std::make_unique<exec::FunctionSequence>(
839       asAclFunction(std::move(raw_fn)),
840       ActivationBuilder::generate(activation, ofm_tensor->handle()));
841 }
842
843 void KernelGenerator::visit(const ir::operation::Permute &node)
844 {
845   const auto ofm_idx{node.getOutputs().at(0)};
846   const auto ifm_idx{node.getInputs().at(0)};
847   const auto permute_type = node.getPermuteType();
848   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
849   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
850   const auto rank = _ctx.at(ofm_idx).shape().rank();
851   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
852
853   std::unique_ptr<::arm_compute::IFunction> fn;
854   arm_compute::PermutationVector pv;
855   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
856   {
857     // WHCN -> CWHN
858     pv = arm_compute::PermutationVector{2, 0, 1};
859
860     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
861                                                            ofm_tensor->handle(), pv);
862   }
863   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
864   {
865     // CWHN -> WHCN
866     pv = arm_compute::PermutationVector{1, 2, 0};
867
868     fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
869                                                              ofm_tensor->handle(), pv);
870   }
871   else
872   {
873     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
874   }
875
876   _return_fn = asAclFunction(std::move(fn));
877 }
878
879 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
880 {
881   const auto ofm_index{node.getOutputs().at(0)};
882
883   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
884
885   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
886   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
887
888   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
889       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
890       ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
891       ::arm_compute::SamplingPolicy::TOP_LEFT);
892
893   _return_fn = asAclFunction(std::move(fn));
894 }
895
896 void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
897 {
898   const auto ofm_index{node.getOutputs().at(0)};
899
900   const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};
901
902   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
903   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
904
905   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
906       ifm_tensor->handle(), ofm_tensor->handle(),
907       ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
908       ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
909
910   _return_fn = asAclFunction(std::move(fn));
911 }
912
913 void KernelGenerator::visit(const ir::operation::RNN &node)
914 {
915   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
916   const auto hidden_state_out_index{
917       node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
918
919   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
920   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
921   const auto recurrent_weights_index{
922       node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
923   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
924   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
925
926   const auto activation = node.param().activation;
927
928   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
929   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get();
930
931   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
932   auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get();
933   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get();
934   auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
935   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get();
936   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
937
938   auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
939       hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
940   _return_fn = asAclFunction(std::move(copy_layer));
941
942   auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
943       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
944       weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
945       hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
946   _return_fn = asAclFunction(std::move(fn));
947 }
948
949 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
950 {
951   const auto ofm_index{node.getOutputs().at(0)};
952   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
953   const auto block_size_index{
954       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
955   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
956
957   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
958   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
959   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
960   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get();
961
962   assert(_ctx.at(block_size_index).data());
963   assert(_ctx.at(paddings_index).data());
964
965   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
966       ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
967       ofm_tensor->handle());
968
969   _return_fn = asAclFunction(std::move(fn));
970 }
971
972 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
973 {
974   const auto ofm_index{node.getOutputs().at(0)};
975   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
976
977   auto block_size = node.param().block_size;
978
979   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
980   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
981
982   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
983       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
984
985   _return_fn = asAclFunction(std::move(fn));
986 }
987
988 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
989 {
990   const auto output_index{node.getOutputs().at(0)};
991   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
992   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
993
994   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
995   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
996   auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
997
998   auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
999       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
1000
1001   _return_fn = asAclFunction(std::move(fn));
1002 }
1003
1004 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
1005 {
1006   const auto ofm_index{node.getOutputs().at(0)};
1007   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
1008
1009   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
1010   // L2 Normalization always performs the reduction along the depth axis
1011   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
1012   // choosing normalization parameters as below
1013
1014   const auto &ifm_shape = _ctx.at(ifm_index).shape();
1015   // TODO Support optional constant dimension that normalization would be performed on
1016   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
1017   int32_t radius =
1018       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
1019   float alpha = 1.0f;                            // In the implementation to make alpha_ become 1
1020   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
1021   float bias = 0.0f;                             // Don't offset the reduction.
1022
1023   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1024   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1025
1026   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
1027                                                                radius, alpha, beta, bias, false);
1028
1029   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1030       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1031
1032   _return_fn = asAclFunction(std::move(fn));
1033 }
1034
1035 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
1036 {
1037   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
1038   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
1039
1040   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
1041   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
1042   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
1043
1044   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
1045   auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get();
1046
1047   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
1048   auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get();
1049   auto values_tensor = _tensor_reg->getAclTensor(values_index).get();
1050
1051   auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
1052       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
1053       output_tensor->handle(), hits_tensor->handle());
1054
1055   _return_fn = asAclFunction(std::move(fn));
1056 }
1057
1058 void KernelGenerator::visit(const ir::operation::PReLU &node)
1059 {
1060   const auto ofm_index{node.getOutputs().at(0)};
1061   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
1062   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
1063
1064   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1065   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1066   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get();
1067
1068   auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
1069       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
1070
1071   _return_fn = asAclFunction(std::move(fn));
1072 }
1073
1074 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1075 {
1076   const auto ofm_index{node.getOutputs().at(0)};
1077   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1078   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1079
1080   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
1081   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
1082   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);
1083
1084   const auto stride = node.param().stride;
1085
1086   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1087          (node.param().padding.type == ir::PaddingType::VALID));
1088   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1089                                       ker_shape.W, ker_shape.H);
1090   uint32_t invalid_horizontal = 0;
1091   uint32_t invalid_vertical = 0;
1092   if (node.param().padding.type == ir::PaddingType::VALID)
1093   {
1094     invalid_horizontal =
1095         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1096     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1097   }
1098
1099   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1100   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1101   auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
1102
1103   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1104
1105   auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
1106       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
1107       ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
1108       invalid_vertical);
1109
1110   _return_fn = asAclFunction(std::move(fn));
1111 }
1112
1113 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1114 {
1115   const auto ofm_index{node.getOutputs().at(0)};
1116   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1117   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1118
1119   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1120   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
1121   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();
1122
1123   auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
1124       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1125
1126   _return_fn = asAclFunction(std::move(fn));
1127 }
1128
1129 void KernelGenerator::visit(const ir::operation::TopKV2 &node)
1130 {
1131   const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
1132   const auto outputIndices_index{
1133       node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};
1134
1135   const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};
1136
1137   // Currently, we only support the vector input.
1138   assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1139          _ctx.at(inputData_index).shape().rank() == 2);
1140
1141   const auto k = node.param().k;
1142
1143   auto values_tensor = _tensor_reg->getAclTensor(outputValues_index).get();
1144   auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index).get();
1145   auto input_tensor = _tensor_reg->getAclTensor(inputData_index).get();
1146
1147   auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
1148       input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
1149
1150   _return_fn = asAclFunction(std::move(fn));
1151 }
1152
1153 void KernelGenerator::visit(const ir::operation::Gather &node)
1154 {
1155   const auto ofm_index{node.getOutputs().at(0)};
1156
1157   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
1158   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
1159
1160   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1161   const auto axis_raw = node.param().axis;
1162   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
1163   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
1164
1165   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1166   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1167   auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get();
1168
1169   // NOTE The frontend layout and backend layout must be the same for this operation.
1170   //      If not the same, we have to add a stage(?) to perform permutation of output tensor. It
1171   //      is not not efficient even if it works well. If so, it would be better to set the
1172   //      layout of these backend tensors to the same layout.
1173   //      There is also one thing we have to think about. This operation depends on the layout of
1174   //      a model. For example, if a model in NHWC has this operation as output rank == 4, indices
1175   //      rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
1176   //      and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
1177   const auto backend_layout = ofm_tensor->layout();
1178   UNUSED_RELEASE(backend_layout);
1179   assert(backend_layout == ifm_tensor->layout());
1180   assert(backend_layout == indices_tensor->layout());
1181   assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
1182
1183   // input is n-D, indices k-D, output is (n + k - 1)-D
1184   size_t n = ifm_rank;
1185   assert(n == ifm_tensor->num_dimensions());
1186   size_t k = _ctx.at(indices_index).shape().rank();
1187   assert(k == indices_tensor->num_dimensions());
1188
1189   // Disable applied dim_correction
1190   const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape();
1191   if (n != ifm_tensor->info()->num_dimensions())
1192   {
1193     // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
1194     const auto ifm = _ctx.at(ifm_index);
1195     ifm_tensor->info()->set_tensor_shape(
1196         acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
1197   }
1198   const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape();
1199   if (k != indices_tensor->info()->num_dimensions())
1200   {
1201     // This means that high dimension's value is 1 and indices tensor is applied dim_correction
1202     const auto indices = _ctx.at(indices_index);
1203     indices_tensor->info()->set_tensor_shape(
1204         acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
1205   }
1206
1207   auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
1208       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
1209
1210   // Revert disabling applied dim_correction
1211   ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
1212   indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
1213
1214   _return_fn = asAclFunction(std::move(fn));
1215 }
1216
1217 void KernelGenerator::visit(const ir::operation::ArgMax &node)
1218 {
1219   const auto ofm_index{node.getOutputs().at(0)};
1220   const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
1221
1222   auto ifm_shape = _ctx.at(ifm_index).shape();
1223   auto ofm_shape = _ctx.at(ofm_index).shape();
1224
1225   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1226
1227   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1228   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1229   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1230   auto frontend_layout = _current_op_seq_layout;
1231   auto backend_layout = ifm_tensor->layout();
1232
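  // A negative axis counts from the back of the input, so wrap it into [0, ifm_rank) before
  // converting it to the corresponding ACL axis for the given layouts.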
1233   int axis_value = node.param().axis;
1234   if (axis_value < 0)
1235   {
1236     axis_value += ifm_rank;
1237   }
1238
1239   auto acl_axis =
1240       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
1241
1242   auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayer>(
1243       ifm_tensor->handle(), acl_axis, ofm_tensor->handle(),
1244       ::arm_compute::ReductionOperation::ARG_IDX_MAX);
1245
1246   _return_fn = asAclFunction(std::move(fn));
1247 }
1248
1249 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
1250 {
1251   const auto ofm_index{node.getOutputs().at(0)};
1252   const auto ifm_index{
1253       node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
1254
1255   auto radius = node.param().radius;
1256   auto alpha = node.param().alpha;
1257   auto beta = node.param().beta;
1258   auto bias = node.param().bias;
1259
1260   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1261   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1262
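  // The NN API describes LRN with a one-sided radius, while ACL's NormalizationLayerInfo takes
  // the full window size, hence 2 * radius + 1 (e.g. radius == 2 -> norm_size == 5). The
  // trailing 'false' (is_scaled) leaves alpha unscaled by the window size.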
1263   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1264       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
1265
1266   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1267       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1268
1269   _return_fn = asAclFunction(std::move(fn));
1270 }
1271
1272 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
1273 {
1274   const auto output_index{node.getOutputs().at(0)};
1275   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
1276
1277   auto block_size = node.param().block_size;
1278   assert(block_size > 0);
1279
1280   auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
1281   auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
1282
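  // CLDepthToSpaceLayer rearranges data from the depth dimension into block_size x block_size
  // spatial blocks.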
1283   auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
1284       input_tensor->handle(), output_tensor->handle(), block_size);
1285
1286   _return_fn = asAclFunction(std::move(fn));
1287 }
1288
1289 void KernelGenerator::visit(const ir::operation::Split &node)
1290 {
1291   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1292
1293   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1294
1295   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1296   std::vector<ir::OperandIndex> output_indexes;
1297   for (const auto &output : node.getOutputs())
1298     output_indexes.emplace_back(output);
1299
1300   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1301   std::vector<arm_compute::ICLTensor *> output_tensors;
1302   for (const auto &ofm_ind : output_indexes)
1303     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle());
1304
1305   const auto frontend_layout = _current_op_seq_layout;
1306   const auto backend_layout = ifm_tensor->layout();
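  // Wrap a negative split axis into [0, ifm_rank) and remap it to the backend (ACL) axis order.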
1307   auto axis = node.param().axis;
1308   if (axis < 0)
1309     axis += ifm_rank;
1310   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1311
1312   auto fn =
1313       acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
1314
1315   _return_fn = asAclFunction(std::move(fn));
1316 }
1317
1318 void KernelGenerator::visit(const ir::operation::Unpack &node)
1319 {
1320   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1321   auto axis{node.param().axis};
1322
1323   const auto input_rank = _ctx.at(input_index).shape().rank();
1324
1325   std::vector<ir::OperandIndex> output_indexes;
1326   for (const auto &output_index : node.getOutputs())
1327     output_indexes.emplace_back(output_index);
1328
1329   auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
1330   std::vector<arm_compute::ICLTensor *> outputs;
1331   for (const auto &output_index : output_indexes)
1332     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1333
1334   const auto frontend_layout = _current_op_seq_layout;
1335   const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
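  // As with Split, wrap a negative axis and remap it to the ACL axis numbering.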
1336   if (axis < 0)
1337     axis += input_rank;
1338   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1339
1340   // Disable applied dim_correction
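  // Restore the full-rank shape of every output whose trailing size-1 dimensions were removed,
  // so that CLUnstack sees the declared output ranks.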
1341   std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
1342   for (const auto &output_index : output_indexes)
1343   {
1344     size_t output_rank = _ctx.at(output_index).shape().rank();
1345     const auto &output_tensor = _tensor_reg->getAclTensor(output_index);
1346     orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
1347     assert(output_rank == output_tensor->num_dimensions());
1348     if (output_rank != output_tensor->info()->num_dimensions())
1349     {
1350       // The trailing dimensions are 1 and dim_correction has been applied to the output tensor
1351       output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
1352           _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
1353     }
1354   }
1355
1356   auto fn = acl_common::generateLayer<arm_compute::CLUnstack>(input, outputs, axis);
1357
1358   _return_fn = asAclFunction(std::move(fn));
1359 }
1360
1361 void KernelGenerator::visit(const ir::operation::Pad &node)
1362 {
1363   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
1364   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
1365   const auto output_index{node.getOutputs().at(0)};
1366   assert(_ctx.at(pad_index).data());
1367
1368   auto rank = _ctx.at(input_index).shape().rank();
1369   auto pad_base = _ctx.at(pad_index).data()->base();
1370
1371   auto input_type = _ctx.at(input_index).typeInfo();
1372   auto data_type = acl_common::asDataType(input_type.type());
1373   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
1374   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
1375
1376   auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
1377   auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
1378
1379   const auto frontend_layout = _current_op_seq_layout;
1380   const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
1381
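  // The PAD operand holds one {front, back} int32 pair per dimension in frontend axis order,
  // e.g. for a rank-2 input it is flattened as [d0_front, d0_back, d1_front, d1_back];
  // remap each pair to the corresponding backend (ACL) axis.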
1382   ::arm_compute::PaddingList padding_list;
1383   padding_list.resize(rank);
1384   for (int32_t n = 0; n < rank; ++n)
1385   {
1386     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
1387
1388     const auto axis =
1389         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
1390     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
1391   }
1392
1393   // Disable applied dim_correction
1394   size_t input_rank = _ctx.at(input_index).shape().rank();
1395   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
1396   assert(input_rank == input_tensor->num_dimensions());
1397   if (input_rank != input_tensor->info()->num_dimensions())
1398   {
1399     // The trailing dimensions are 1 and dim_correction has been applied to the input tensor
1400     input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
1401         _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
1402   }
1403
1404   auto fn =
1405       acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
1406
1407   // Do not revert the disabled dim_correction here: CLPadKernel has a CL kernel for
1408   // 4-dimensional tensors, and reverting would produce a mismatch in the result.
1409
1410   _return_fn = asAclFunction(std::move(fn));
1411 }
1412
1413 void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
1414 {
1415   const auto ofm_index{node.getOutputs().at(0)};
1416   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
1417
1418   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1419   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1420
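  // The cast is implemented with CLDepthConvertLayer; the shift argument is 0 because no
  // fixed-point shift is involved in a float-to-float conversion.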
1421   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1422       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1423
1424   _return_fn = asAclFunction(std::move(fn));
1425 }
1426
1427 void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
1428 {
1429   const auto ofm_index{node.getOutputs().at(0)};
1430   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
1431
1432   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
1433   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
1434
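  // The reverse cast uses the same CLDepthConvertLayer; widening FP16 to FP32 is exact, so the
  // SATURATE policy never takes effect.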
1435   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1436       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1437
1438   _return_fn = asAclFunction(std::move(fn));
1439 }
1440
1441 } // namespace acl_cl
1442 } // namespace backend
1443 } // namespace onert