1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
21
22 #include <AclActivationBuilder.h>
23 #include <AclFunction.h>
24 #include <Convert.h>
25 #include <Swizzle.h>
26
27 #include "ir/Index.h"
28 #include "ir/DataType.h"
29 #include "ir/InternalType.h"
30 #include "exec/NopFunction.h"
31 #include "exec/FunctionSequence.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "AclKernelGen.h"
35
36 namespace onert
37 {
38 namespace backend
39 {
40 namespace acl_cl
41 {
42
43 using ::onert::backend::acl_common::asAclFunction;
44 using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
45     ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
46
47 KernelGenerator::KernelGenerator(
48     const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
49     const std::shared_ptr<TensorBuilder> &tensor_builder,
50     const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
51     : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
52       _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN)
53 {
54   // DO NOTHING
55 }
56
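// Each visit() below lowers one IR operation into an ACL CL function (arm_compute::IFunction)
// and stores it in _return_fn; visit(OpSequence) collects these into the returned FunctionSequence.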
57 void KernelGenerator::visit(const ir::OpSequence &op_seq)
58 {
59   // TODO Move this to IKernelGenerator
60   //      (all derivatives have the same implementation for this)
61   assert(!_return_fn_seq);
62   _return_fn_seq = std::make_unique<exec::FunctionSequence>();
63   _return_fn_seq->enableDynamicShapeInferer(false);
64
65   _current_layout = op_seq.getLayout();
66   for (const auto &operation_idx : op_seq.operations())
67   {
68     const auto &node = _operations_ctx.at(operation_idx);
69     node.accept(*this);
70     _return_fn_seq->append(releaseFunction());
71   }
72 }
73
74 void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
75 {
76   const auto ofm_index{node.getOutputs().at(0)};
77   const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
78   const auto block_size_index{
79       node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
80
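  // The NN API form of BatchToSpaceND has only two inputs; a third input, if present, carries the
  // crops, which this backend supports only when they are constant zeros.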
81   const auto NNApiInputs = 2;
82   if (node.getInputs().size() != NNApiInputs)
83   {
84     const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
85     if (!_ctx.at(crops_index).isConstant())
86     {
87       throw std::runtime_error("Non-constant crops NYI for acl_cl backend BatchToSpaceND");
88     }
89
90     auto crops = _ctx.at(crops_index).asVector<int32_t>();
91     for (auto crop : crops)
92     {
93       if (crop != 0)
94       {
95         throw std::runtime_error("Non-zero crops NYI for acl_cl backend BatchToSpaceND");
96       }
97     }
98   }
99
100   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
101   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
102   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
103
104   assert(_ctx.at(block_size_index).data());
105
106   auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
107       ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
108
109   _return_fn = asAclFunction(std::move(fn));
110 }
111
112 void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
113 {
114   const auto ofm_index{node.getOutputs().at(0)};
115   const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
116   const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};
117
118   const auto activation = node.param().activation;
119
120   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
121   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
122   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
123
124   const auto act_info = acl_common::asActivationLayerInfo(activation);
125
126   std::unique_ptr<arm_compute::IFunction> fn;
127   switch (node.param().arithmetic_type)
128   {
129     case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
130     {
131       fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
132           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
133           arm_compute::ConvertPolicy::SATURATE, act_info);
134       break;
135     }
136     case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
137     {
138       fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
139           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
140           arm_compute::ConvertPolicy::SATURATE, act_info);
141       break;
142     }
143     case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
144     {
145       fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
146           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
147           arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
148           act_info);
149       break;
150     }
151     case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
152     {
153       fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
154           lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
155       break;
156     }
157     default:
158       assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
159       break;
160   }
161
162   _return_fn = asAclFunction(std::move(fn));
163 }
164
165 void KernelGenerator::visit(const ir::operation::Conv2D &node)
166 {
167   using ir::operation::Conv2D;
168
169   const auto ofm_index{node.getOutputs().at(0)};
170   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
171   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
172   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
173
174   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
175   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
176   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
177   const auto &ker_shape = _ctx.at(ker_index).shape();
178   const auto ker_height = ker_shape.dim(1);
179   const auto ker_width = ker_shape.dim(2);
180
181   const auto stride = node.param().stride;
182   const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
183                                             ker_width, ker_height);
184   const auto activation = node.param().activation;
185
186   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
187   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
188   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
189   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
190
191   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
192   const auto act_info = acl_common::asActivationLayerInfo(activation);
193
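  // The default WeightsInfo() (weights not pre-reshaped) and an explicit 1x1 dilation are passed.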
194   auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
195       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
196       ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
197       ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
198
199   _return_fn = asAclFunction(std::move(fn));
200 }
201
202 void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
203 {
204   using ir::operation::DepthwiseConv2D;
205
206   const auto ofm_index{node.getOutputs().at(0)};
207   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
208   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
209   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
210
211   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
212   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
213   // Kernel format is [1, kernel_height, kernel_width, depth_out].
214   const auto &ker_shape = _ctx.at(ker_index).shape();
215   const auto ker_height = ker_shape.dim(1);
216   const auto ker_width = ker_shape.dim(2);
217
218   const auto stride = node.param().stride;
219   const auto dilation = node.param().dilation;
220   const auto padding =
221       ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
222                            ker_height, dilation.width_factor, dilation.height_factor);
223   const auto multiplier = node.param().multiplier;
224   const auto activation = node.param().activation;
225
226   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
227   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
228   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
229   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
230
231   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
232   const auto act_info = acl_common::asActivationLayerInfo(activation);
233   const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
234
235   auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
236       ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
237       conv_info, multiplier, act_info, dilation_info);
238
239   _return_fn = asAclFunction(std::move(fn));
240 }
241
242 void KernelGenerator::visit(const ir::operation::Concat &node)
243 {
244   const auto ofm_index{node.getOutputs().at(0)};
245
246   std::vector<ir::OperandIndex> input_indexes;
247
248   for (const auto &input : node.getInputs())
249     input_indexes.emplace_back(input);
250
251   const auto axis = node.param().axis;
252
253   // Concat elimination check
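  // If every input is registered as a sub-tensor of the output, each input already aliases its
  // region of the output buffer, so no copy or concatenation kernel is needed.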
254   bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
255   if (eliminated)
256   {
257     // If the concat is eliminated, return a NOP IFunction
258     VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
259     _return_fn = std::make_unique<exec::NopFunction>();
260     return;
261   }
262
263   auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
264   std::vector<::arm_compute::ICLTensor *> input_tensors;
265   for (auto &ifm_ind : input_indexes)
266     input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
267
268   std::unique_ptr<::arm_compute::IFunction> fn;
269   if (input_indexes.size() < 2)
270   {
271     fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensors.at(0),
272                                                         output_tensor->handle());
273   }
274   else
275   {
276     const auto rank = _ctx.at(ofm_index).shape().rank();
277     const auto frontend_layout = _current_layout;
278     const auto backend_layout = output_tensor->layout();
279     const auto fixed_axis =
280         acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
281     fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
282         input_tensors, output_tensor->handle(), fixed_axis);
283   }
284
285   _return_fn = asAclFunction(std::move(fn));
286 }
287
288 void KernelGenerator::visit(const ir::operation::FullyConnected &node)
289 {
290   const auto output_index{node.getOutputs().at(0)};
291   auto output_tensor = _tensor_reg->getAclTensor(output_index);
292   const auto activation = node.param().activation;
293   if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
294     throw std::runtime_error(
295         "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");
296
297   auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
298                                                 ::arm_compute::CLFullyConnectedReshapingLayer>(
299       node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
300   _return_fn = std::make_unique<exec::FunctionSequence>(
301       std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
302 }
303
304 void KernelGenerator::visit(const ir::operation::Reduce &node)
305 {
306   const auto output_index{node.getOutputs().at(0)};
307   const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
308   const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
309   const auto keep_dims{node.param().keep_dims};
310   const auto reduce_type = node.param().reduce_type;
311
312   auto output_tensor = _tensor_reg->getAclTensor(output_index);
313   auto input_tensor = _tensor_reg->getAclTensor(input_index);
314
315   // Convert to ACL axes taking into account negative values and possible duplicates.
316   const auto &axes = _ctx.at(axes_index);
317   const auto input_rank = _ctx.at(input_index).shape().rank();
318   const auto frontend_layout = _current_layout;
319   const auto backend_layout = input_tensor->layout();
320
321   std::unique_ptr<arm_compute::IFunction> fn;
322   if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
323   {
324     const auto acl_axes =
325         acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
326     fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
327                                                               keep_dims, output_tensor->handle());
328   }
329   else
330   {
331     const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
332
333     fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
334         _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
335         output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
336   }
337
338   _return_fn = asAclFunction(std::move(fn));
339 }
340
341 void KernelGenerator::visit(const ir::operation::Reshape &node)
342 {
343   const auto output_index{node.getOutputs().at(0)};
344   const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
345
346   auto output_tensor = _tensor_reg->getAclTensor(output_index);
347   auto input_tensor = _tensor_reg->getAclTensor(input_index);
348
349   // NOTE This operation must not change the layout from frontend to backend,
350   //      so PermutationOperationPass makes the frontend and backend layouts the same.
351   const auto frontend_layout = _current_layout;
352   const auto backend_layout = output_tensor->layout();
353   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
354          frontend_layout == backend_layout);
355   UNUSED_RELEASE(frontend_layout);
356   UNUSED_RELEASE(backend_layout);
357
358   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
359                                                                    output_tensor->handle());
360
361   _return_fn = asAclFunction(std::move(fn));
362 }
363
364 void KernelGenerator::visit(const ir::operation::Squeeze &node)
365 {
366   // Squeeze is identical to reshape except that it has an optional dimensions input.
367   // The optional dims input is ignored here since the output tensor already has the squeezed
368   // shape produced by the frontend (e.g. freezer and toco).
369   // TODO Support multi-layout for frontend and backend
370   const auto output_index{node.getOutputs().at(0)};
371   const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
372   const auto dims{node.param().dims};
373   const auto ndim{node.param().ndim};
374   (void)dims;
375   (void)ndim;
376
377   auto output_tensor = _tensor_reg->getAclTensor(output_index);
378   auto input_tensor = _tensor_reg->getAclTensor(input_index);
379   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
380                                                                    output_tensor->handle());
381   _return_fn = asAclFunction(std::move(fn));
382 }
383
384 void KernelGenerator::visit(const ir::operation::Softmax &node)
385 {
386   const auto output_index{node.getOutputs().at(0)};
387   const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
388
389   const auto beta = node.param().beta;
390
391   auto output_tensor = _tensor_reg->getAclTensor(output_index);
392   auto input_tensor = _tensor_reg->getAclTensor(input_index);
393
394   auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
395       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
396       output_tensor->handle(), beta);
397
398   _return_fn = asAclFunction(std::move(fn));
399 }
400
401 void KernelGenerator::visit(const ir::operation::Slice &node)
402 {
403   const auto output_index{node.getOutputs().at(0)};
404   const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
405   const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
406   const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
407
408   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
409   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
410   const auto frontend_layout = _current_layout;
411   const auto backend_layout = inputData_tensor->layout();
412
413   // Set up the start/end coordinates, reordered to match the backend layout of inputData
414   int input_rank = _ctx.at(input_index).shape().rank();
415   std::vector<int32_t> starts;
416   std::vector<int32_t> ends;
417   starts.resize(input_rank, 0);
418   ends.resize(input_rank, 0);
419   {
420     assert(_ctx.at(begins_index).data());
421     assert(_ctx.at(sizes_index).data());
422     auto beginData_base = _ctx.at(begins_index).data()->base();
423     auto sizeData_base = _ctx.at(sizes_index).data()->base();
424     const int beginData_size = _ctx.at(begins_index).shape().num_elements();
425     const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();
426
427     using ir::DataType;
428
429     UNUSED_RELEASE(beginData_size);
430     UNUSED_RELEASE(sizeData_size);
431
432     assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
433     assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
434     assert(beginData_size == input_rank);
435     assert(sizeData_size == input_rank);
436
437     assert(beginData_base != nullptr);
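    // Remap each frontend axis to the backend (ACL) axis and convert the (begin, size) pair into
    // start/end coordinates (end = begin + size, i.e. the end is exclusive).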
438     for (int n = 0; n < input_rank; ++n)
439     {
440       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
441                                                                  backend_layout)
442                       .value();
443
444       int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
445       starts[axis] = begin_value;
446
447       int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
448       ends[axis] = begin_value + size_value;
449     }
450   }
451
452   ::arm_compute::Coordinates starts_set;
453   ::arm_compute::Coordinates ends_set;
454
455   for (size_t i = 0; i < starts.size(); ++i)
456   {
457     starts_set.set(i, starts[i]);
458     ends_set.set(i, ends[i]);
459   }
460
461   auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
462       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
463
464   _return_fn = asAclFunction(std::move(fn));
465 }
466
467 void KernelGenerator::visit(const ir::operation::StridedSlice &node)
468 {
469   const auto output_index{node.getOutputs().at(0)};
470   const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
471   const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
472   const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
473   const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
474
475   auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
476   auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
477   const auto frontend_layout = _current_layout;
478   const auto backend_layout = inputData_tensor->layout();
479
480   // Set up the start/end/stride values, reordered to match the backend layout of inputData
481   int input_rank = _ctx.at(input_index).shape().rank();
482   std::vector<int32_t> starts;
483   std::vector<int32_t> ends;
484   std::vector<int32_t> strides;
485   starts.resize(input_rank, 0);
486   ends.resize(input_rank, 0);
487   strides.resize(input_rank, 0);
488   {
489     assert(_ctx.at(starts_index).data());
490     assert(_ctx.at(ends_index).data());
491     assert(_ctx.at(strides_index).data());
492     auto startData_base = _ctx.at(starts_index).data()->base();
493     auto endData_base = _ctx.at(ends_index).data()->base();
494     auto stridesData_base = _ctx.at(strides_index).data()->base();
495     const int startData_size = _ctx.at(starts_index).shape().num_elements();
496     const int endData_size = _ctx.at(ends_index).shape().num_elements();
497     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
498
499     using ir::DataType;
500
501     UNUSED_RELEASE(startData_size);
502     UNUSED_RELEASE(endData_size);
503     UNUSED_RELEASE(stridesData_size);
504
505     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
506     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
507     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
508     assert(startData_size == input_rank);
509     assert(endData_size == input_rank);
510     assert(stridesData_size == input_rank);
511
512     assert(startData_base != nullptr);
513     for (int n = 0; n < input_rank; ++n)
514     {
515       auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
516                                                                  backend_layout)
517                       .value();
518
519       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
520       starts[axis] = start_value;
521
522       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
523       ends[axis] = end_value;
524
525       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
526       strides[axis] = strides_value;
527     }
528   }
529
530   // Reorder the mask bits to match the backend layout of inputData
531   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
532                                                            frontend_layout, backend_layout);
533   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
534                                                          frontend_layout, backend_layout);
535   const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
536       node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
537
538   ::arm_compute::Coordinates starts_set;
539   ::arm_compute::Coordinates ends_set;
540   ::arm_compute::BiStrides strides_set;
541
542   for (size_t i = 0; i < starts.size(); ++i)
543   {
544     starts_set.set(i, starts[i]);
545     ends_set.set(i, ends[i]);
546     strides_set.set(i, strides[i]);
547   }
548
549   // Disable applied dim_correction
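  // (With trailing dimensions of 1, the ACL tensor info reports a smaller rank than the IR shape;
  //  disabling dim_correction restores the full rank for kernel configuration.)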
550   if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
551   {
552     // The higher dimensions are 1 and dim_correction has been applied to the input tensor
553     acl_common::disableDimCorrection(inputData_tensor);
554   }
555
556   auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
557       inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
558       begin_mask, end_mask, shrink_axis_mask);
559
560   // Revert disabling applied dim_correction
561   if (inputData_tensor->dimension(0) == 1)
562   {
563     acl_common::enableDimCorrection(inputData_tensor);
564   }
565
566   _return_fn = asAclFunction(std::move(fn));
567 }
568
569 void KernelGenerator::visit(const ir::operation::Transpose &node)
570 {
571   const auto ofm_idx{node.getOutputs().at(0)};
572   const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
573   const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};
574
575   const auto rank = _ctx.at(ifm_idx).shape().rank();
576
577   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
578   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
579   const auto frontend_layout = _current_layout;
580   const auto backend_layout = ifm_tensor->layout();
581
582   const auto &perms = _ctx.at(perm_idx);
583   std::vector<int32_t> pv;
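  // An empty permutation operand selects the default transpose: reverse all dimensions.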
584   if (perms.shape() == ir::Shape{0})
585   {
586     pv.resize(rank);
587     std::iota(pv.begin(), pv.end(), 0);
588     std::reverse(pv.begin(), pv.end());
589   }
590   else
591   {
592     pv = _ctx.at(perm_idx).asVector<int32_t>();
593   }
594
595   std::unique_ptr<arm_compute::IFunction> fn;
596   if (rank == 1)
597   {
598     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
599   }
600   else if (rank == 2)
601   {
602     assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
603     fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(),
604                                                              ofm_tensor->handle());
605   }
606   else
607   {
608     auto backend_pv =
609         acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
610
611     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
612                                                            ofm_tensor->handle(), backend_pv);
613   }
614
615   _return_fn = asAclFunction(std::move(fn));
616 }
617
618 void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
619 {
620   const auto ofm_index{node.getOutputs().at(0)};
621   const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};
622
623   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
624   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
625
626   const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
627       node.param().op_type, node.param().alpha, node.param().beta);
628
629   auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
630       ifm_tensor->handle(), ofm_tensor->handle(), act_info);
631
632   _return_fn = asAclFunction(std::move(fn));
633 }
634
635 void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
636 {
637   const auto output_index{node.getOutputs().at(0)};
638   const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
639   const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};
640
641   auto output_tensor = _tensor_reg->getAclTensor(output_index);
642   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
643   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
644
645   std::unique_ptr<arm_compute::IFunction> fn;
646   switch (node.param().op_type)
647   {
648     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
649     {
650       fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
651           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
652           arm_compute::BinaryLogicalOperation::AND);
653       break;
654     }
655     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
656     {
657       fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
658           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
659       break;
660     }
661     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
662     {
663       fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
664           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
665       break;
666     }
667     case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
668     {
669       fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
670           lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
671       break;
672     }
673     default:
674     {
675       std::string err_msg("acl_cl KernelGenerator : " + node.name() +
676                           " is not an elementwise-binary operation");
677       assert(false && err_msg.c_str());
678       break;
679     }
680   }
681
682   _return_fn = asAclFunction(std::move(fn));
683 }
684
685 void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
686 {
687   const auto output_index{node.getOutputs().at(0)};
688   const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};
689
690   auto output_tensor = _tensor_reg->getAclTensor(output_index);
691   auto input_tensor = _tensor_reg->getAclTensor(input_index);
692
693   std::unique_ptr<arm_compute::IFunction> fn;
694   switch (node.param().op_type)
695   {
696     case ir::operation::ElementwiseUnary::Type::ABS:
697     {
698       const ::arm_compute::ActivationLayerInfo act_info{
699           ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
700
701       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
702           input_tensor->handle(), output_tensor->handle(), act_info);
703       break;
704     }
705     case ir::operation::ElementwiseUnary::Type::CAST:
706     {
707       if (input_tensor->data_type() == output_tensor->data_type())
708       {
709         fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
710                                                             output_tensor->handle());
711       }
712       else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
713       {
714         fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(),
715                                                                 output_tensor->handle());
716       }
717       else
718       {
719         // TODO Support converting float to int32 as round down
720         fn = acl_common::generateLayer<arm_compute::CLCast>(
721             input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
722       }
723       break;
724     }
725     case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
726     {
727       fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
728                                                                          output_tensor->handle());
729       break;
730     }
731     case ir::operation::ElementwiseUnary::Type::EXP:
732     {
733       fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
734                                                               output_tensor->handle());
735       break;
736     }
737     case ir::operation::ElementwiseUnary::Type::FLOOR:
738     {
739       fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
740                                                            output_tensor->handle());
741       break;
742     }
743     case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
744     {
745       fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
746                                                                 output_tensor->handle());
747       break;
748     }
749     case ir::operation::ElementwiseUnary::Type::NEG:
750     {
751       fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
752                                                          output_tensor->handle());
753       break;
754     }
755     case ir::operation::ElementwiseUnary::Type::RSQRT:
756     {
757       fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
758                                                                 output_tensor->handle());
759       break;
760     }
761     case ir::operation::ElementwiseUnary::Type::SQRT:
762     {
763       const ::arm_compute::ActivationLayerInfo act_info{
764           ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
765
766       fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
767           input_tensor->handle(), output_tensor->handle(), act_info);
768       break;
769     }
770     default:
771     {
772       throw std::runtime_error("acl_cl KernelGenerator : " + node.name() + " is not supported yet");
773       break;
774     }
775   }
776
777   auto acl_fn = asAclFunction(std::move(fn));
778
779   _return_fn = std::move(acl_fn);
780 }
781
782 void KernelGenerator::visit(const ir::operation::ExpandDims &node)
783 {
784   const auto output_index{node.getOutputs().at(0)};
785   const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
786
787   auto output_tensor = _tensor_reg->getAclTensor(output_index);
788   auto input_tensor = _tensor_reg->getAclTensor(input_index);
789
790   auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
791                                                                    output_tensor->handle());
792
793   _return_fn = asAclFunction(std::move(fn));
794 }
795
796 void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
797 {
798   const auto ofm_index{node.getOutputs().at(0)};
799   const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
800   const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
801   const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
802
803   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
804   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
805   auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
806   auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
807   auto epsilon = node.param().epsilon;
808   auto activation = node.param().activation;
809
810   auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
811       ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
812       epsilon);
813
814   _return_fn = std::make_unique<exec::FunctionSequence>(
815       asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
816 }
817
818 void KernelGenerator::visit(const ir::operation::LSTM &node)
819 {
820   _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
821                                          ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
822 }
823
824 void KernelGenerator::visit(const ir::operation::Comparison &node)
825 {
826   const auto output_index{node.getOutputs().at(0)};
827   const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
828   const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
829
830   const auto comparison_type = node.param().comparison_type;
831
832   auto output_tensor = _tensor_reg->getAclTensor(output_index);
833   auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
834   auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
835
836   auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
837       input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
838       (arm_compute::ComparisonOperation)comparison_type);
839
840   _return_fn = asAclFunction(std::move(fn));
841 }
842
843 void KernelGenerator::visit(const ir::operation::OneHot &node)
844 {
845   const auto output_idx{node.getOutputs().at(0)};
846   const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
847   const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
848   const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
849   const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
850   const auto depth = _ctx.at(depth_idx).asScalar<int32_t>();
851   assert(depth > 0);
852
853   auto output_tensor = _tensor_reg->getAclTensor(output_idx);
854   auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
855   auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
856
857   const size_t output_rank = _ctx.at(output_idx).shape().rank();
858   const auto frontend_layout = _current_layout;
859   const auto backend_layout = output_tensor->layout();
860   int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
861   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
862
863   if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
864   {
865     // The higher dimensions are 1 and dim_correction has been applied to output_tensor
866     acl_common::disableDimCorrection(output_tensor);
867   }
868
869   std::unique_ptr<::arm_compute::IFunction> fn;
870   const auto &offvalue = _ctx.at(offvalue_idx);
871   if (offvalue.isConstant())
872   {
873     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
874         indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
875         acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
876   }
877   else
878   {
879     auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
880     fn = acl_common::generateLayer<arm_compute::CLOneHot>(
881         indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
882         output_tensor->handle(), static_cast<uint32_t>(depth), axis);
883   }
884
885   if (output_tensor->dimension(0) == 1)
886   {
887     acl_common::enableDimCorrection(output_tensor);
888   }
889
890   _return_fn = asAclFunction(std::move(fn));
891 }
892
893 void KernelGenerator::visit(const ir::operation::Pack &node)
894 {
895   const auto output_index{node.getOutputs().at(0)};
896   auto axis{node.param().axis};
897
898   const auto output_rank = _ctx.at(output_index).shape().rank();
899
900   std::vector<ir::OperandIndex> input_indexes;
901   for (const auto &input_index : node.getInputs())
902     input_indexes.emplace_back(input_index);
903
904   auto output = _tensor_reg->getAclTensor(output_index)->handle();
905   std::vector<arm_compute::ICLTensor *> inputs;
906   for (const auto &input_index : input_indexes)
907     inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());
908
909   const auto frontend_layout = _current_layout;
910   const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();
911
912   if (axis < 0)
913     axis += output_rank;
914   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
915
916   // Disable applied dim_correction
917   for (const auto &input_index : input_indexes)
918   {
919     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
920     if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
921     {
922       // The higher dimensions are 1 and dim_correction has been applied to the input tensor
923       acl_common::disableDimCorrection(input_tensor);
924     }
925   }
926
927   auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);
928
929   // Revert disabling applied dim_correction
930   for (const auto &input_index : input_indexes)
931   {
932     const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
933     if (input_tensor->dimension(0) == 1)
934     {
935       acl_common::enableDimCorrection(input_tensor);
936     }
937   }
938
939   _return_fn = asAclFunction(std::move(fn));
940 }
941
942 void KernelGenerator::visit(const ir::operation::Pool2D &node)
943 {
944   auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
945       node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
946
947   const auto ofm_index{node.getOutputs().at(0)};
948   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
949   const auto activation = node.param().activation;
950   _return_fn = std::make_unique<exec::FunctionSequence>(
951       asAclFunction(std::move(raw_fn)),
952       ActivationBuilder::generate(activation, ofm_tensor->handle()));
953 }
954
955 void KernelGenerator::visit(const ir::operation::Permute &node)
956 {
957   const auto ofm_idx{node.getOutputs().at(0)};
958   const auto ifm_idx{node.getInputs().at(0)};
959   const auto permute_type = node.getPermuteType();
960   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
961   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
962   const auto rank = _ctx.at(ofm_idx).shape().rank();
963   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
964
965   std::unique_ptr<::arm_compute::IFunction> fn;
966   arm_compute::PermutationVector pv;
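  // The PermutationVector is given in ACL coordinate order (innermost dimension first), so an NHWC
  // tensor reads as CWHN and an NCHW tensor as WHCN in the comments below.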
967   if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
968   {
969     // WHCN -> CWHN
970     pv = arm_compute::PermutationVector{2, 0, 1};
971
972     fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
973                                                            ofm_tensor->handle(), pv);
974   }
975   else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
976   {
977     // CWHN -> WHCN
978     pv = arm_compute::PermutationVector{1, 2, 0};
979
980     fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
981                                                              ofm_tensor->handle(), pv);
982   }
983   else
984   {
985     fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
986   }
987
988   _return_fn = asAclFunction(std::move(fn));
989 }
990
991 void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
992 {
993   const auto ofm_index{node.getOutputs().at(0)};
994   const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
995
996   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
997   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
998
999   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
1000       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
1001       ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
1002       ::arm_compute::SamplingPolicy::TOP_LEFT);
1003
1004   _return_fn = asAclFunction(std::move(fn));
1005 }
1006
1007 void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
1008 {
1009   const auto ofm_index{node.getOutputs().at(0)};
1010   const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};
1011
1012   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1013   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1014
1015   auto fn = acl_common::generateLayer<arm_compute::CLScale>(
1016       ifm_tensor->handle(), ofm_tensor->handle(),
1017       ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
1018       ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
1019
1020   _return_fn = asAclFunction(std::move(fn));
1021 }
1022
1023 void KernelGenerator::visit(const ir::operation::RNN &node)
1024 {
1025   const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
1026   const auto hidden_state_out_index{
1027       node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
1028
1029   const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
1030   const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
1031   const auto recurrent_weights_index{
1032       node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
1033   const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
1034   const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
1035
1036   const auto activation = node.param().activation;
1037
1038   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1039   auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);
1040
1041   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1042   auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
1043   auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
1044   auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
1045   auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
1046   auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
1047
1048   auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
1049       hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
1050
1051   auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
1052       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
1053       weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
1054       hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
1055   _return_fn = std::make_unique<exec::FunctionSequence>(asAclFunction(std::move(copy_layer)),
1056                                                         asAclFunction(std::move(fn)));
1057 }
1058
1059 void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
1060 {
1061   const auto ofm_index{node.getOutputs().at(0)};
1062   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
1063   const auto block_size_index{
1064       node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
1065   const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
1066
1067   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1068   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1069   auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
1070   auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);
1071
1072   assert(_ctx.at(block_size_index).data());
1073   assert(_ctx.at(paddings_index).data());
1074
1075   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
1076       ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
1077       ofm_tensor->handle());
1078
1079   _return_fn = asAclFunction(std::move(fn));
1080 }
1081
1082 void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
1083 {
1084   const auto ofm_index{node.getOutputs().at(0)};
1085   const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
1086
1087   auto block_size = node.param().block_size;
1088
1089   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1090   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1091
1092   auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
1093       ifm_tensor->handle(), ofm_tensor->handle(), block_size);
1094
1095   _return_fn = asAclFunction(std::move(fn));
1096 }
1097
1098 void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
1099 {
1100   const auto output_index{node.getOutputs().at(0)};
1101   const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
1102   const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
1103
1104   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1105   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1106   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1107
1108   auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
1109       values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
1110
1111   _return_fn = asAclFunction(std::move(fn));
1112 }
1113
1114 void KernelGenerator::visit(const ir::operation::L2Normalization &node)
1115 {
1116   const auto ofm_index{node.getOutputs().at(0)};
1117   const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
1118
1119   // {CL|Neon}L2Normalization performs the reduction only along dimension 0
1120   // L2 Normalization always performs the reduction along the depth axis
1121   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
1122   // choosing normalization parameters as below
1123
1124   const auto &ifm_shape = _ctx.at(ifm_index).shape();
1125   // TODO Support an optional constant axis along which the normalization is performed
1126   const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
1127   int32_t radius =
1128       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
1129   float alpha = 1.0f;                            // Chosen so that alpha_ in the implementation becomes 1
1130   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
1131   float bias = 0.0f;                             // Don't offset the reduction.
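  // With these parameters, CROSS_MAP normalization computes
  //   out = in / (bias + alpha * sum(in^2))^beta = in / sqrt(sum(in^2))
  // over the whole depth, i.e. L2 normalization along the last axis.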
1132
1133   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1134   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1135
1136   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
1137                                                                radius, alpha, beta, bias, false);
1138
1139   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1140       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1141
1142   _return_fn = asAclFunction(std::move(fn));
1143 }
1144
1145 void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
1146 {
1147   const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
1148   const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};
1149
1150   const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
1151   const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
1152   const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
1153
1154   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1155   auto hits_tensor = _tensor_reg->getAclTensor(hits_index);
1156
1157   auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
1158   auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
1159   auto values_tensor = _tensor_reg->getAclTensor(values_index);
1160
1161   auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
1162       lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
1163       output_tensor->handle(), hits_tensor->handle());
1164
1165   _return_fn = asAclFunction(std::move(fn));
1166 }
1167
1168 void KernelGenerator::visit(const ir::operation::PReLU &node)
1169 {
1170   const auto ofm_index{node.getOutputs().at(0)};
1171   const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
1172   const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
1173
1174   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1175   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1176   auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
1177
1178   auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
1179       ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
1180
1181   _return_fn = asAclFunction(std::move(fn));
1182 }
1183
1184 void KernelGenerator::visit(const ir::operation::TransposeConv &node)
1185 {
1186   const auto ofm_index{node.getOutputs().at(0)};
1187   const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
1188   const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};
1189
1190   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
1191   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
1192   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);
1193
1194   const auto stride = node.param().stride;
1195
1196   assert((node.param().padding.type == ir::PaddingType::SAME) ||
1197          (node.param().padding.type == ir::PaddingType::VALID));
1198   auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1199                                       ker_shape.W, ker_shape.H);
1200   uint32_t invalid_horizontal = 0;
1201   uint32_t invalid_vertical = 0;
1202   if (node.param().padding.type == ir::PaddingType::VALID)
1203   {
1204     invalid_horizontal =
1205         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1206     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1207   }
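  // For VALID padding, invalid_horizontal/vertical = ofm - ((ifm - 1) * stride + ker): the
  // right/bottom output cells the transposed convolution cannot fill, forwarded to the ACL layer.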
1208
1209   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1210   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1211   auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
1212
1213   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1214
1215   auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
1216       _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
1217       ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
1218       invalid_vertical);
1219
1220   _return_fn = asAclFunction(std::move(fn));
1221 }
1222
1223 void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
1224 {
1225   const auto ofm_index{node.getOutputs().at(0)};
1226   const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
1227   const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
1228
1229   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1230   auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
1231   auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
1232
1233   auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
1234       lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
1235
1236   _return_fn = asAclFunction(std::move(fn));
1237 }
1238
1239 void KernelGenerator::visit(const ir::operation::TopKV2 &node)
1240 {
1241   const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
1242   const auto outputIndices_index{
1243       node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};
1244
1245   const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};
1246
1247   // Currently, only 1-D or 2-D input is supported.
1248   assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1249          _ctx.at(inputData_index).shape().rank() == 2);
1250
1251   const auto k = node.param().k;
1252
1253   auto values_tensor = _tensor_reg->getAclTensor(outputValues_index);
1254   auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index);
1255   auto input_tensor = _tensor_reg->getAclTensor(inputData_index);
1256
1257   auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
1258       input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
1259
1260   _return_fn = asAclFunction(std::move(fn));
1261 }
1262
1263 void KernelGenerator::visit(const ir::operation::Gather &node)
1264 {
1265   const auto ofm_index{node.getOutputs().at(0)};
1266
1267   const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
1268   const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
1269
1270   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1271   const auto axis_raw = node.param().axis;
1272   const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
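  // NOTE (assumption) ToARMComputeAxis maps a frontend axis to ACL's reversed dimension order
  //      (ACL dimension 0 is the innermost one); e.g. for ifm_rank == 4 and axis_value == 1 the
  //      resulting ACL axis would be 4 - 1 - 1 = 2, layout permutation aside.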
1273   const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
1274
1275   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1276   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1277   auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
1278
1279   // NOTE The frontend layout and backend layout must be the same for this operation.
1280   //      If they differ, we would have to add a stage to permute the output tensor, which
1281   //      would not be efficient even if it worked. In that case it would be better to set
1282   //      these backend tensors to the same layout.
1283   //      There is one more thing to consider: this operation depends on the layout of the
1284   //      model. For example, if an NHWC model has this operation with output rank == 4,
1285   //      indices rank == 2 and axis == 2, it should work over the W and C axes, but W and C
1286   //      are not contiguous in NCHW, so a backend running in NCHW cannot handle this case.
1287   const auto backend_layout = ofm_tensor->layout();
1288   UNUSED_RELEASE(backend_layout);
1289   assert(backend_layout == ifm_tensor->layout());
1290   assert(backend_layout == indices_tensor->layout());
1291   assert(ifm_rank < 4 || _current_layout == backend_layout);
1292
1293   // input is n-D, indices k-D, output is (n + k - 1)-D
1294   size_t n = ifm_rank;
1295   assert(n == ifm_tensor->num_dimensions());
1296   size_t k = _ctx.at(indices_index).shape().rank();
1297   assert(k == indices_tensor->num_dimensions());
1298
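  // NOTE (assumption) dim_correction in ACL trims trailing dimensions of size 1 from a
  //      TensorShape; when the IR rank differs from the ACL tensor's reported rank it is
  //      temporarily disabled below so CLGatherEx sees the full n-D / k-D shapes.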
1299   // Disable applied dim_correction
1300   if (n != ifm_tensor->info()->num_dimensions())
1301   {
1302     // This means the highest dimension's value is 1 and dim_correction has been applied to the ifm tensor
1303     acl_common::disableDimCorrection(ifm_tensor);
1304   }
1305   if (k != indices_tensor->info()->num_dimensions())
1306   {
1307     // This means the highest dimension's value is 1 and dim_correction has been applied to the indices tensor
1308     acl_common::disableDimCorrection(indices_tensor);
1309   }
1310
1311   auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
1312       ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
1313
1314   // Revert disabling applied dim_correction
1315   if (ifm_tensor->dimension(0) == 1)
1316   {
1317     acl_common::enableDimCorrection(ifm_tensor);
1318   }
1319   if (indices_tensor->dimension(0) == 1)
1320   {
1321     acl_common::enableDimCorrection(indices_tensor);
1322   }
1323
1324   _return_fn = asAclFunction(std::move(fn));
1325 }
1326
1327 void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
1328 {
1329   const auto ofm_index{node.getOutputs().at(0)};
1330   const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
1331   const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};
1332
1333   auto ifm_shape = _ctx.at(ifm_index).shape();
1334   auto ofm_shape = _ctx.at(ofm_index).shape();
1335
1336   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1337
1338   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1339   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1340   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1341   auto frontend_layout = _current_layout;
1342   auto backend_layout = ifm_tensor->layout();
1343
1344   int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
1345   if (axis_value < 0)
1346   {
1347     axis_value += ifm_rank;
1348   }
1349
1350   auto acl_axis =
1351       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
1352   auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
1353                                              : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
1354   auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
1355       ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);
1356
1357   _return_fn = asAclFunction(std::move(fn));
1358 }
1359
1360 void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
1361 {
1362   const auto ofm_index{node.getOutputs().at(0)};
1363   const auto ifm_index{
1364       node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
1365
1366   auto radius = node.param().radius;
1367   auto alpha = node.param().alpha;
1368   auto beta = node.param().beta;
1369   auto bias = node.param().bias;
1370
1371   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1372   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1373
1374   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1375       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
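  // NOTE (assumption) NormalizationLayerInfo takes the full window size rather than a radius,
  //      hence radius * 2 + 1 (e.g. radius == 2 -> norm_size == 5); the trailing 'false' keeps
  //      alpha unscaled by the normalization size, matching NN API LRN semantics.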
1376
1377   auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
1378       ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
1379
1380   _return_fn = asAclFunction(std::move(fn));
1381 }
1382
1383 void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
1384 {
1385   const auto output_index{node.getOutputs().at(0)};
1386   const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
1387
1388   auto block_size = node.param().block_size;
1389   assert(block_size > 0);
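  // NOTE (illustration) DepthToSpace rearranges depth into spatial blocks; with block_size == 2,
  //      an NHWC input of shape [N, H, W, 4 * C] would become [N, 2 * H, 2 * W, C].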
1390
1391   auto output_tensor = _tensor_reg->getAclTensor(output_index);
1392   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1393
1394   auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
1395       input_tensor->handle(), output_tensor->handle(), block_size);
1396
1397   _return_fn = asAclFunction(std::move(fn));
1398 }
1399
1400 void KernelGenerator::visit(const ir::operation::Split &node)
1401 {
1402   const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
1403   const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};
1404
1405   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1406   if (!_ctx.at(axis_index).isConstant())
1407   {
1408     throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend");
1409   }
1410
1411   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1412   std::vector<ir::OperandIndex> output_indexes;
1413   for (const auto &output : node.getOutputs())
1414     output_indexes.emplace_back(output);
1415
1416   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1417   std::vector<arm_compute::ICLTensor *> output_tensors;
1418   for (const auto &ofm_ind : output_indexes)
1419     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1420
1421   const auto frontend_layout = _current_layout;
1422   const auto backend_layout = ifm_tensor->layout();
1423   auto axis = _ctx.at(axis_index).asScalar<int32_t>();
1424   if (axis < 0)
1425     axis += ifm_rank;
1426   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1427
1428   auto fn =
1429       acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
1430
1431   _return_fn = asAclFunction(std::move(fn));
1432 }
1433
1434 void KernelGenerator::visit(const ir::operation::SplitV &node)
1435 {
1436   const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
1437   const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
1438   const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};
1439
1440   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1441
1442   const size_t ifm_rank = _ctx.at(ifm_index).shape().rank();
1443   std::vector<ir::OperandIndex> output_indexes;
1444   for (const auto &output : node.getOutputs())
1445     output_indexes.emplace_back(output);
1446
1447   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1448   auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index);
1449
1450   std::vector<arm_compute::ICLTensor *> output_tensors;
1451   for (const auto &ofm_ind : output_indexes)
1452     output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());
1453
1454   auto fn = std::make_unique<arm_compute::CLSplitVEx>();
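  // NOTE (assumption) The layer is constructed first and configured only in the constant
  //      split_dim branch below, because the configure() arguments depend on the resolved axis
  //      and on the dim_correction state; that is presumably why generateLayer is not used here.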
1455   const auto &split_dim_op = _ctx.at(split_dim_index);
1456   if (split_dim_op.isConstant())
1457   {
1458     int32_t split_dim = split_dim_op.asScalar<int32_t>();
1459     uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
1460     const auto frontend_layout = _current_layout;
1461     const auto backend_layout = ifm_tensor->layout();
1462
1463     if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
1464     {
1465       // This means the highest dimension's value is 1 and dim_correction has been applied to the ifm tensor
1466       acl_common::disableDimCorrection(ifm_tensor);
1467     }
1468
1469     split_dim_revised =
1470         acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
1471             .value();
1472     fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
1473                   output_tensors, node.param().num_splits);
1474
1475     if (ifm_tensor->dimension(0) == 1)
1476     {
1477       acl_common::enableDimCorrection(ifm_tensor);
1478     }
1479   }
1480   else
1481   {
1482     throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend");
1483   }
1484
1485   _return_fn = asAclFunction(std::move(fn));
1486 }
1487
1488 void KernelGenerator::visit(const ir::operation::Unpack &node)
1489 {
1490   const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
1491   auto axis{node.param().axis};
1492
1493   const auto input_rank = _ctx.at(input_index).shape().rank();
1494
1495   std::vector<ir::OperandIndex> output_indexes;
1496   for (const auto &output_index : node.getOutputs())
1497     output_indexes.emplace_back(output_index);
1498
1499   auto input_tensor = _tensor_reg->getAclTensor(input_index);
1500   std::vector<arm_compute::ICLTensor *> outputs;
1501   for (const auto &output_index : output_indexes)
1502     outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());
1503
1504   const auto frontend_layout = _current_layout;
1505   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1506   if (axis < 0)
1507     axis += input_rank;
1508   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1509
1510   // Disable applied dim_correction
1511   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1512   {
1513     // This means the highest dimension's value is 1 and dim_correction has been applied to the input tensor
1514     acl_common::disableDimCorrection(input_tensor);
1515   }
1516
1517   auto fn =
1518       acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
1519
1520   // Revert disabling applied dim_correction
1521   if (input_tensor->dimension(0) == 1)
1522   {
1523     acl_common::enableDimCorrection(input_tensor);
1524   }
1525
1526   _return_fn = asAclFunction(std::move(fn));
1527 }
1528
1529 void KernelGenerator::visit(const ir::operation::Pad &node)
1530 {
1531   const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
1532   const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
1533   const auto output_index{node.getOutputs().at(0)};
1534   assert(_ctx.at(pad_index).data());
1535
1536   auto rank = _ctx.at(input_index).shape().rank();
1537   auto pad_base = _ctx.at(pad_index).data()->base();
1538
1539   auto input_type = _ctx.at(input_index).typeInfo();
1540   auto data_type = acl_common::asDataType(input_type.type());
1541   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
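  // NOTE (assumption) PixelValue(0, data_type, quant_info) encodes the padding constant 0 in the
  //      input's type; for quantized types this should resolve to the zero point.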
1542   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
1543
1544   auto input = _tensor_reg->getAclTensor(input_index)->handle();
1545   auto output = _tensor_reg->getAclTensor(output_index)->handle();
1546
1547   const auto frontend_layout = _current_layout;
1548   const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
1549
1550   ::arm_compute::PaddingList padding_list;
1551   padding_list.resize(rank);
1552   for (int32_t n = 0; n < rank; ++n)
1553   {
1554     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
1555
1556     const auto axis =
1557         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
1558     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
1559   }
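  // NOTE (illustration) The PAD input is read as a [rank, 2] table of (before, after) amounts per
  //      frontend axis, and each frontend axis n is remapped to its ACL axis; e.g. with rank == 4
  //      and matching NHWC layouts, frontend axis 0 (N) would land in padding_list[3], assuming
  //      the reversed ACL dimension order.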
1560
1561   // Disable applied dim_correction
1562   const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
1563   if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
1564   {
1565     // This means the highest dimension's value is 1 and dim_correction has been applied to the input tensor
1566     acl_common::disableDimCorrection(input_tensor);
1567   }
1568
1569   auto fn =
1570       acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
1571
1572   // NOTE Do not re-enable dim_correction for 4D tensors.
1573   // Doing so would produce mismatched results due to an incorrect offset_first_element in
1574   // ICLKernel::add_tensor_argument<3>().
1575   // We must keep dim_correction disabled, and not revert it, for kernels that slice 4D into 3D,
1576   // because slicing the arm_compute::Window can cause an incorrect offset_first_element when the
1577   // tensor is 4D and its highest dimension is 1.
1578   if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
1579   {
1580     acl_common::enableDimCorrection(input_tensor);
1581   }
1582
1583   _return_fn = asAclFunction(std::move(fn));
1584 }
1585
1586 void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
1587 {
1588   const auto ofm_index{node.getOutputs().at(0)};
1589   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
1590
1591   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1592   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1593
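  // NOTE (assumption) CLDepthConvertLayer infers the conversion direction (here F32 -> F16) from
  //      the input/output tensor data types; SATURATE clamps out-of-range values and the shift of
  //      0 means no bit shift is applied.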
1594   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1595       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1596
1597   _return_fn = asAclFunction(std::move(fn));
1598 }
1599
1600 void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
1601 {
1602   const auto ofm_index{node.getOutputs().at(0)};
1603   const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
1604
1605   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1606   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1607
1608   auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
1609       ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
1610
1611   _return_fn = asAclFunction(std::move(fn));
1612 }
1613
1614 void KernelGenerator::visit(const ir::operation::Reverse &node)
1615 {
1616   const auto ofm_index{node.getOutputs().at(0)};
1617   const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
1618   const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};
1619
1620   auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
1621   auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
1622   auto axis_tensor = _tensor_reg->getAclTensor(axis_index);
1623
1624   // WORKAROUND: The acl_cl backend only allows the U32 type for axis.
1625   //             ConstantInitializer will resolve the S32 type to U32.
1626   if (_ctx.at(axis_index).isConstant() &&
1627       (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
1628   {
1629     axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
1630   }
1631
1632   auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
1633       ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
1634
1635   _return_fn = asAclFunction(std::move(fn));
1636 }
1637
1638 } // namespace acl_cl
1639 } // namespace backend
1640 } // namespace onert