08d77c81320f6912e438f9c9768382d88688f83b
[platform/core/ml/nnfw.git] / runtime / neurun / backend / acl_cl / KernelGenerator.cc
1 /*
2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #include "KernelGenerator.h"
18
19 #include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
21
22 #include <AclFunction.h>
23 #include <Convert.h>
24 #include <Swizzle.h>
25
26 #include "kernel/ConcatLayer.h"
27 #include "model/Index.h"
28 #include "model/DataType.h"
29 #include "model/InternalType.h"
30 #include "compiler/IExecutionBuilder.h"
31 #include "exec/NopFunction.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "util/Padding.h"
35
36 using ::neurun::compiler::IExecutionBuilder;
37
38 namespace neurun
39 {
40 namespace backend
41 {
42 namespace acl_cl
43 {
44
45 using ::neurun::backend::acl_common::asAclFunction;
46
47 //
48 // ActivationBuilder
49 //
50 class ActivationBuilder
51 {
52 public:
53   explicit ActivationBuilder(IExecutionBuilder &builder) : _builder(builder)
54   {
55     // DO NOTHING
56   }
57
58 private:
59   void appendReLU(::arm_compute::ICLTensor *ifm_alloc);
60   void appendReLU1(::arm_compute::ICLTensor *ifm_alloc);
61   void appendReLU6(::arm_compute::ICLTensor *ifm_alloc);
62
63 public:
64   void append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc);
65
66 private:
67   IExecutionBuilder &_builder;
68 };
69
70 void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc)
71 {
72   const ::arm_compute::ActivationLayerInfo act_info{
73       ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
74
75   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
76
77   fn->configure(ifm_alloc, nullptr, act_info);
78
79   auto acl_fn = asAclFunction(std::move(fn));
80
81   _builder.append(std::move(acl_fn));
82 }
83
84 void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc)
85 {
86   const ::arm_compute::ActivationLayerInfo act_info{
87       ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
88
89   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
90
91   fn->configure(ifm_alloc, nullptr, act_info);
92
93   auto acl_fn = asAclFunction(std::move(fn));
94
95   _builder.append(std::move(acl_fn));
96 }
97
98 void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc)
99 {
100   const ::arm_compute::ActivationLayerInfo act_info{
101       ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
102
103   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
104
105   fn->configure(ifm_alloc, nullptr, act_info);
106
107   auto acl_fn = asAclFunction(std::move(fn));
108
109   _builder.append(std::move(acl_fn));
110 }
111
112 void ActivationBuilder::append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc)
113 {
114   switch (code)
115   {
116     case model::Activation::NONE:
117     {
118       // DO NOTHING
119       break;
120     }
121     case model::Activation::RELU:
122     {
123       appendReLU(ifm_alloc);
124       break;
125     }
126     case model::Activation::RELU1:
127     {
128       appendReLU1(ifm_alloc);
129       break;
130     }
131     case model::Activation::RELU6:
132     {
133       appendReLU6(ifm_alloc);
134       break;
135     }
136     default:
137     {
138       throw std::runtime_error("Not supported, yet");
139     }
140   }
141 }
142
143 //
144 // KernelGenerator
145 //
146 KernelGenerator::KernelGenerator(const neurun::model::Operands &ctx,
147                                  const std::shared_ptr<TensorBuilder> &tensor_builder)
148     : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(ir::Layout::UNKNOWN)
149 {
150   // DO NOTHING
151 }
152
153 void KernelGenerator::visit(const model::Subgraph &subgraph)
154 {
155   _current_subg_layout = subgraph.getLayout();
156   for (const auto &e : subgraph.operations())
157   {
158     const auto &node = *(e.node);
159     _tensor_builder->preVisit(node);
160     node.accept(*this);
161     _tensor_builder->postVisit(node);
162   }
163 }
164
165 void KernelGenerator::visit(const model::operation::BatchToSpaceND &node)
166 {
167   const auto ofm_index{node.getOutputs().at(0)};
168   const auto ifm_index{node.getInputs().at(model::operation::BatchToSpaceND::Input::INPUT)};
169   const auto block_size_index{
170       node.getInputs().at(model::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
171
172   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
173   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
174   auto block_size_alloc = _tensor_builder->at(block_size_index).get();
175
176   assert(_ctx.at(block_size_index).isConstant());
177
178   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBatchToSpaceLayer>();
179
180   fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
181
182   auto acl_fn = asAclFunction(std::move(fn));
183
184   _execution_builder->append(std::move(acl_fn));
185 }
186
187 void KernelGenerator::visit(const model::operation::Cast &node)
188 {
189   const auto ofm_index{node.getOutputs().at(0)};
190   const auto ifm_index{node.getInputs().at(model::operation::Cast::Input::INPUT)};
191
192   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
193   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
194
195   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>();
196
197   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
198
199   auto acl_fn = asAclFunction(std::move(fn));
200
201   _execution_builder->append(std::move(acl_fn));
202 }
203
204 void KernelGenerator::visit(const model::operation::Conv2D &node)
205 {
206   using model::operation::Conv2D;
207
208   const auto ofm_index{node.getOutputs().at(0)};
209   const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
210   const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
211   const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
212
213   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
214   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
215   // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
216   const auto &ker_shape = _ctx.at(ker_index).shape();
217   const auto ker_height = ker_shape.dim(1);
218   const auto ker_width = ker_shape.dim(2);
219
220   const auto stride = node.param().stride;
221   const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
222                                                       stride, ker_width, ker_height);
223   const auto activation = node.param().activation;
224
225   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
226   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
227   auto ker_alloc = _tensor_builder->at(ker_index).get();
228   auto bias_alloc = _tensor_builder->at(bias_index).get();
229
230   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
231   const auto act_info = acl_common::asActivationLayerInfo(activation);
232
233   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>(
234       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
235
236   fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
237                 conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
238
239   _execution_builder->append(asAclFunction(std::move(fn)));
240 }
241
242 void KernelGenerator::visit(const model::operation::DepthwiseConv2D &node)
243 {
244   using model::operation::DepthwiseConv2D;
245
246   const auto ofm_index{node.getOutputs().at(0)};
247   const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
248   const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
249   const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
250
251   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
252   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
253   // Kernel format is [1, kernel_height, kernel_width, depth_out].
254   const auto &ker_shape = _ctx.at(ker_index).shape();
255   const auto ker_height = ker_shape.dim(1);
256   const auto ker_width = ker_shape.dim(2);
257
258   const auto stride = node.param().stride;
259   const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
260                                                       stride, ker_width, ker_height);
261   const auto multiplier = node.param().multiplier;
262   const auto activation = node.param().activation;
263
264   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
265   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
266   auto ker_alloc = _tensor_builder->at(ker_index).get();
267   auto bias_alloc = _tensor_builder->at(bias_index).get();
268
269   const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
270   const auto act_info = acl_common::asActivationLayerInfo(activation);
271
272   if (ker_height == 3 && ker_width == 3)
273   {
274     auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer3x3>(
275         _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
276
277     fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
278                   ofm_alloc->handle(), conv_info, multiplier, act_info);
279
280     _execution_builder->append(asAclFunction(std::move(fn)));
281   }
282   else
283   {
284     auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();
285
286     fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
287                   ofm_alloc->handle(), conv_info, multiplier, act_info);
288
289     _execution_builder->append(asAclFunction(std::move(fn)));
290   }
291 }
292
293 void KernelGenerator::visit(const model::operation::MaxPool2D &node)
294 {
295   const auto ofm_index{node.getOutputs().at(0)};
296   const auto ifm_index{node.getInputs().at(model::operation::MaxPool2D::Input::INPUT)};
297
298   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
299   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
300
301   const auto kh = node.param().kh;
302   const auto kw = node.param().kw;
303   const auto stride = node.param().stride;
304   const auto padding =
305       neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
306   const auto activation = node.param().activation;
307
308   VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
309   VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
310   VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
311   VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
312   VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
313   VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
314   VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
315   VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
316   VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
317   VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
318   VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
319   VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
320
321   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
322   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
323
324   ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
325                                        ::arm_compute::Size2D{kw, kh},
326                                        acl_common::asPadStrideInfo(padding, stride)};
327
328   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
329
330   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
331
332   auto acl_fn = asAclFunction(std::move(fn));
333
334   _execution_builder->append((std::move(acl_fn)));
335
336   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
337 }
338
339 void KernelGenerator::visit(const model::operation::AvgPool2D &node)
340 {
341   const auto ofm_index{node.getOutputs().at(0)};
342   const auto ifm_index{node.getInputs().at(model::operation::AvgPool2D::Input::INPUT)};
343
344   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
345   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
346
347   const auto kh = node.param().kh;
348   const auto kw = node.param().kw;
349   const auto stride = node.param().stride;
350   const auto padding =
351       neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
352   const auto activation = node.param().activation;
353
354   VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
355   VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
356   VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
357   VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
358   VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
359   VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
360   VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
361   VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
362   VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
363   VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
364   VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
365   VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
366
367   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
368   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
369
370   ::arm_compute::PoolingLayerInfo info{
371       ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
372       acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
373
374   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
375
376   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
377
378   auto acl_fn = asAclFunction(std::move(fn));
379
380   _execution_builder->append((std::move(acl_fn)));
381
382   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
383 }
384
385 void KernelGenerator::visit(const model::operation::Concat &node)
386 {
387   const auto ofm_index{node.getOutputs().at(0)};
388
389   std::vector<model::OperandIndex> input_indexes;
390
391   for (const auto &input : node.getInputs())
392     input_indexes.emplace_back(input);
393
394   const auto axis = node.param().axis;
395
396   // If tensor allocator allocate as subtensor
397   bool canEliminate = true;
398   for (auto &ifm_ind : input_indexes)
399   {
400     if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind))
401     {
402       canEliminate = false;
403       break;
404     }
405   }
406   if (canEliminate)
407   {
408     // If concat eliminated, return a NOP IFunction
409     _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>());
410     return;
411   }
412
413   auto output_alloc = _tensor_builder->at(ofm_index).get();
414
415   std::vector<operand::ICLTensor *> input_allocs;
416   for (auto &ifm_ind : input_indexes)
417     input_allocs.emplace_back(_tensor_builder->at(ifm_ind).get());
418
419   auto fn = nnfw::cpp14::make_unique<::neurun::backend::acl_cl::kernel::ConcatLayer>();
420
421   const auto rank = _ctx.at(ofm_index).shape().rank();
422   const auto frontend_layout = _current_subg_layout;
423   const auto backend_layout = output_alloc->layout();
424   const auto fixed_axis =
425       acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
426
427   fn->configure(input_allocs, fixed_axis, output_alloc);
428
429   auto acl_fn = asAclFunction(std::move(fn));
430
431   _execution_builder->append(std::move(acl_fn));
432 }
433
434 void KernelGenerator::visit(const model::operation::FullyConnected &node)
435 {
436   using model::operation::FullyConnected;
437
438   const auto output_index{node.getOutputs().at(0)};
439   const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
440   const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
441   const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
442
443   const auto input_rank = _ctx.at(input_index).shape().rank();
444   // TODO Currently we are not handling where the case is that the input's rank is 3.
445   // The handling should be added in the future.
446   assert(input_rank != 3);
447
448   const auto output_size = _ctx.at(output_index).shape().dim(1);
449   UNUSED_RELEASE(output_size);
450   assert(_ctx.at(bias_index).shape().dim(0) == output_size);
451   assert(_ctx.at(weight_index).shape().dim(0) == output_size);
452   const auto batch_size = _ctx.at(output_index).shape().dim(0);
453   const auto input_size = _ctx.at(weight_index).shape().dim(1);
454
455   // Check for reshaping input's shape into rank-2
456   bool needs_reshape = false;
457   neurun::model::Shape reshape(2);
458   if (input_rank == 4)
459   {
460     const auto feature_size = _ctx.at(input_index).shape().num_elements();
461
462     UNUSED_RELEASE(feature_size);
463     assert(batch_size >= 0 && input_size >= 0);
464     assert(feature_size == static_cast<uint64_t>(batch_size) * static_cast<uint64_t>(input_size));
465
466     // for reshaping
467     needs_reshape = true;
468     reshape.dim(0) = batch_size; /* H */
469     reshape.dim(1) = input_size; /* W */
470   }
471
472   const auto activation = node.param().activation;
473
474   auto output_alloc = _tensor_builder->at(output_index).get();
475   const auto input_alloc = _tensor_builder->at(input_index).get();
476   const auto weight_alloc = _tensor_builder->at(weight_index).get();
477   const auto bias_alloc = _tensor_builder->at(bias_index).get();
478   const auto frontend_layout = _current_subg_layout;
479   const auto acl_layout = output_alloc->handle()->info()->data_layout();
480
481   auto fn = nnfw::cpp14::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
482       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
483
484   fn->configure(
485       input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
486       needs_reshape,
487       ::neurun::backend::acl_common::asTensorShape(
488           reshape, frontend_layout, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout)));
489
490   auto acl_fn = asAclFunction(std::move(fn));
491
492   _execution_builder->append(std::move(acl_fn));
493
494   ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle());
495 }
496
497 void KernelGenerator::visit(const model::operation::Mul &node)
498 {
499   const auto ofm_index{node.getOutputs().at(0)};
500   const auto lhs_index{node.getInputs().at(model::operation::Mul::Input::LHS)};
501   const auto rhs_index{node.getInputs().at(model::operation::Mul::Input::RHS)};
502
503   const auto activation = node.param().activation;
504
505   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
506   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
507   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
508
509   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>();
510
511   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
512                 arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
513
514   auto acl_fn = asAclFunction(std::move(fn));
515
516   _execution_builder->append(std::move(acl_fn));
517
518   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
519 }
520
521 void KernelGenerator::visit(const model::operation::ReduceSum &node)
522 {
523   const auto output_index{node.getOutputs().at(0)};
524   const auto input_index{node.getInputs().at(model::operation::ReduceSum::Input::INPUT)};
525   const auto &axes{node.param().axes};
526
527   auto output_alloc = _tensor_builder->at(output_index).get();
528   auto input_alloc = _tensor_builder->at(input_index).get();
529   const auto frontend_layout = _current_subg_layout;
530   const auto backend_layout = input_alloc->layout();
531
532   // Convert to ACL axes taking into account negative values and possible duplicates.
533   std::set<std::uint32_t> acl_axes;
534   const int input_rank = _ctx.at(input_index).shape().rank();
535   for (int axis : axes)
536   {
537     if (axis < 0)
538       axis += input_rank;
539     acl_axes.insert(
540         acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value());
541   }
542
543   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
544
545   fn->configure(input_alloc->handle(), output_alloc->handle(), acl_axes,
546                 ::arm_compute::ReduceOperation::SUM);
547
548   auto acl_fn = asAclFunction(std::move(fn));
549
550   _execution_builder->append(std::move(acl_fn));
551 }
552
553 void KernelGenerator::visit(const model::operation::Reshape &node)
554 {
555   const auto output_index{node.getOutputs().at(0)};
556   const auto input_index{node.getInputs().at(model::operation::Reshape::Input::INPUT)};
557
558   auto output_alloc = _tensor_builder->at(output_index).get();
559   auto input_alloc = _tensor_builder->at(input_index).get();
560
561   // NOTE This operation must not be changed the layout from frontend to backend
562   //      So, PermutationOperationPass makes layouts of frontend and backend the same.
563   const auto frontend_layout = _current_subg_layout;
564   const auto backend_layout = output_alloc->layout();
565   assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
566          frontend_layout == backend_layout);
567   UNUSED_RELEASE(frontend_layout);
568   UNUSED_RELEASE(backend_layout);
569
570   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReshapeLayer>();
571
572   fn->configure(input_alloc->handle(), output_alloc->handle());
573
574   auto acl_fn = asAclFunction(std::move(fn));
575
576   _execution_builder->append(std::move(acl_fn));
577 }
578
579 void KernelGenerator::visit(const model::operation::Squeeze &node)
580 {
581   // Squeeze is identical to reshape except that it has an optional dimensions input.
582   // In addition, optional dims_index is ignored since output tensor already has squeezed shape
583   // by freezer and toco
584   // TODO Support multi-layout for frontend and backend
585   const auto output_index{node.getOutputs().at(0)};
586   const auto input_index{node.getInputs().at(model::operation::Squeeze::Input::INPUT)};
587   const auto dims{node.param().dims};
588   const auto ndim{node.param().ndim};
589   (void)dims;
590   (void)ndim;
591
592   auto output_alloc = _tensor_builder->at(output_index).get();
593   auto input_alloc = _tensor_builder->at(input_index).get();
594   auto fn = nnfw::cpp14::make_unique<arm_compute::CLReshapeLayer>();
595   fn->configure(input_alloc->handle(), output_alloc->handle());
596   auto acl_fn = asAclFunction(std::move(fn));
597   _execution_builder->append(std::move(acl_fn));
598 }
599
600 void KernelGenerator::visit(const model::operation::Tanh &node)
601 {
602   const auto output_index{node.getOutputs().at(0)};
603   const auto input_index{node.getInputs().at(model::operation::Tanh::Input::INPUT)};
604
605   auto output_alloc = _tensor_builder->at(output_index).get();
606   auto input_alloc = _tensor_builder->at(input_index).get();
607
608   auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();
609
610   const ::arm_compute::ActivationLayerInfo act_info{
611       ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
612
613   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
614
615   auto acl_fn = asAclFunction(std::move(fn));
616
617   _execution_builder->append(std::move(acl_fn));
618 }
619
620 void KernelGenerator::visit(const model::operation::Softmax &node)
621 {
622   const auto output_index{node.getOutputs().at(0)};
623   const auto input_index{node.getInputs().at(model::operation::Softmax::Input::INPUT)};
624
625   const auto beta = node.param().beta;
626
627   auto output_alloc = _tensor_builder->at(output_index).get();
628   auto input_alloc = _tensor_builder->at(input_index).get();
629
630   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>(
631       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
632
633   fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
634
635   auto acl_fn = asAclFunction(std::move(fn));
636
637   _execution_builder->append(std::move(acl_fn));
638 }
639
640 void KernelGenerator::visit(const model::operation::StridedSlice &node)
641 {
642   const auto output_index{node.getOutputs().at(0)};
643   const auto input_index{node.getInputs().at(model::operation::StridedSlice::Input::INPUT)};
644   const auto starts_index{node.getInputs().at(model::operation::StridedSlice::Input::STARTS)};
645   const auto ends_index{node.getInputs().at(model::operation::StridedSlice::Input::ENDS)};
646   const auto strides_index{node.getInputs().at(model::operation::StridedSlice::Input::STRIDES)};
647
648   auto outputData_alloc = _tensor_builder->at(output_index).get();
649   auto inputData_alloc = _tensor_builder->at(input_index).get();
650   const auto frontend_layout = _current_subg_layout;
651   const auto backend_layout = inputData_alloc->layout();
652
653   // Set initializers for indices data such as order of inputData
654   int input_rank = _ctx.at(input_index).shape().rank();
655   std::vector<int32_t> starts;
656   std::vector<int32_t> ends;
657   std::vector<int32_t> strides;
658   starts.resize(input_rank, 0);
659   ends.resize(input_rank, 0);
660   strides.resize(input_rank, 0);
661   {
662     auto startData_base = _ctx.at(starts_index).data().base();
663     auto endData_base = _ctx.at(ends_index).data().base();
664     auto stridesData_base = _ctx.at(strides_index).data().base();
665     const int startData_size = _ctx.at(starts_index).shape().num_elements();
666     const int endData_size = _ctx.at(ends_index).shape().num_elements();
667     const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
668
669     using neurun::model::DataType;
670
671     UNUSED_RELEASE(startData_size);
672     UNUSED_RELEASE(endData_size);
673     UNUSED_RELEASE(stridesData_size);
674
675     assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
676     assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
677     assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
678     assert(startData_size == input_rank);
679     assert(endData_size == input_rank);
680     assert(stridesData_size == input_rank);
681
682     assert(startData_base != nullptr);
683     for (int n = 0; n < input_rank; ++n)
684     {
685       auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
686                                                                   backend_layout)
687                       .value();
688
689       int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
690       starts[axis] = start_value;
691
692       int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
693       ends[axis] = end_value;
694
695       int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
696       strides[axis] = strides_value;
697     }
698   }
699
700   // Set mask bits such as order of inputData
701   const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
702                                                            frontend_layout, backend_layout);
703   const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
704                                                          frontend_layout, backend_layout);
705   const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
706       node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
707
708   ::arm_compute::Coordinates starts_set;
709   ::arm_compute::Coordinates ends_set;
710   ::arm_compute::BiStrides strides_set;
711
712   for (size_t i = 0; i < starts.size(); ++i)
713   {
714     starts_set.set(i, starts[i]);
715     ends_set.set(i, ends[i]);
716     strides_set.set(i, strides[i]);
717   }
718
719   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>();
720
721   fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
722                 strides_set, begin_mask, end_mask, shrink_axis_mask);
723
724   auto acl_fn = asAclFunction(std::move(fn));
725
726   _execution_builder->append(std::move(acl_fn));
727 }
728
729 void KernelGenerator::visit(const model::operation::Transpose &node)
730 {
731   const auto ofm_idx{node.getOutputs().at(0)};
732   const auto ifm_idx{node.getInputs().at(model::operation::Transpose::Input::INPUT)};
733   const auto &perm{node.param().perm};
734
735   const auto rank = _ctx.at(ifm_idx).shape().rank();
736
737   auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
738   auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
739   const auto frontend_layout = _current_subg_layout;
740   const auto backend_layout = ifm_alloc->layout();
741
742   std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
743   // Reversed
744   auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector(
745       rank, pv, frontend_layout, backend_layout);
746
747   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();
748
749   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
750
751   auto acl_fn = asAclFunction(std::move(fn));
752
753   _execution_builder->append(std::move(acl_fn));
754 }
755
756 void KernelGenerator::visit(const model::operation::Add &node)
757 {
758   const auto ofm_index{node.getOutputs().at(0)};
759   const auto lhs_index{node.getInputs().at(model::operation::Add::Input::LHS)};
760   const auto rhs_index{node.getInputs().at(model::operation::Add::Input::RHS)};
761
762   const auto activation = node.param().activation;
763
764   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
765   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
766   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
767
768   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>();
769
770   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
771                 arm_compute::ConvertPolicy::SATURATE);
772
773   auto acl_fn = asAclFunction(std::move(fn));
774
775   _execution_builder->append(std::move(acl_fn));
776
777   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
778 }
779
780 void KernelGenerator::visit(const model::operation::Sub &node)
781 {
782   const auto ofm_index{node.getOutputs().at(0)};
783   const auto lhs_index{node.getInputs().at(model::operation::Sub::Input::LHS)};
784   const auto rhs_index{node.getInputs().at(model::operation::Sub::Input::RHS)};
785
786   const auto activation = node.param().activation;
787
788   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
789   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
790   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
791
792   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>();
793
794   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
795                 arm_compute::ConvertPolicy::SATURATE);
796
797   auto acl_fn = asAclFunction(std::move(fn));
798
799   _execution_builder->append(std::move(acl_fn));
800
801   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
802 }
803
804 void KernelGenerator::visit(const model::operation::Div &node)
805 {
806   const auto ofm_index{node.getOutputs().at(0)};
807   const auto lhs_index{node.getInputs().at(model::operation::Div::Input::LHS)};
808   const auto rhs_index{node.getInputs().at(model::operation::Div::Input::RHS)};
809
810   const auto activation = node.param().activation;
811
812   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
813   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
814   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
815
816   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>();
817
818   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
819
820   auto acl_fn = asAclFunction(std::move(fn));
821
822   _execution_builder->append(std::move(acl_fn));
823
824   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
825 }
826
827 void KernelGenerator::visit(const model::operation::Exp &node)
828 {
829   const auto output_index{node.getOutputs().at(0)};
830   const auto input_index{node.getInputs().at(model::operation::Exp::Input::INPUT)};
831
832   auto output_alloc = _tensor_builder->at(output_index).get();
833   auto input_alloc = _tensor_builder->at(input_index).get();
834
835   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>();
836
837   fn->configure(input_alloc->handle(), output_alloc->handle());
838
839   auto acl_fn = asAclFunction(std::move(fn));
840
841   _execution_builder->append(std::move(acl_fn));
842 }
843
844 void KernelGenerator::visit(const model::operation::InstanceNorm &node)
845 {
846   const auto ofm_index{node.getOutputs().at(0)};
847   const auto ifm_index{node.getInputs().at(model::operation::InstanceNorm::Input::INPUT)};
848   const auto gamma_index{node.getInputs().at(model::operation::InstanceNorm::Input::GAMMA)};
849   const auto beta_index{node.getInputs().at(model::operation::InstanceNorm::Input::BETA)};
850
851   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
852   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
853   auto gamma_alloc = _tensor_builder->at(gamma_index).get();
854   auto beta_alloc = _tensor_builder->at(beta_index).get();
855   auto epsilon = node.param().epsilon;
856   auto activation = node.param().activation;
857
858   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();
859
860   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
861                 beta_alloc->handle(), epsilon);
862
863   auto acl_fn = asAclFunction(std::move(fn));
864
865   _execution_builder->append(std::move(acl_fn));
866
867   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
868 }
869
870 void KernelGenerator::visit(const model::operation::Logistic &node)
871 {
872   const auto ofm_index{node.getOutputs().at(0)};
873   const auto ifm_index{node.getInputs().at(model::operation::Logistic::Input::INPUT)};
874
875   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
876   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
877
878   const ::arm_compute::ActivationLayerInfo act_info{
879       ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
880
881   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
882
883   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
884
885   auto acl_fn = asAclFunction(std::move(fn));
886
887   _execution_builder->append(std::move(acl_fn));
888 }
889
890 void KernelGenerator::visit(const model::operation::LogicalAnd &node)
891 {
892   const auto output_index{node.getOutputs().at(0)};
893   const auto input0_index{node.getInputs().at(model::operation::LogicalAnd::Input::INPUT0)};
894   const auto input1_index{node.getInputs().at(model::operation::LogicalAnd::Input::INPUT1)};
895
896   auto output_alloc = _tensor_builder->at(output_index).get();
897   auto input0_alloc = _tensor_builder->at(input0_index).get();
898   auto input1_alloc = _tensor_builder->at(input1_index).get();
899
900   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>();
901
902   fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
903                 ::arm_compute::BinaryLogicalOperation::AND);
904
905   auto acl_fn = asAclFunction(std::move(fn));
906
907   _execution_builder->append(std::move(acl_fn));
908 }
909
910 void KernelGenerator::visit(const model::operation::LSTM &node)
911 {
912   // TODO Support dynamic rnn
913   // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
914   const auto scratch_buffer_index{
915       node.getOutputs().at(model::operation::LSTM::Output::SCRATCH_BUFFER)};
916   const auto output_state_out_index{
917       node.getOutputs().at(model::operation::LSTM::Output::OUTPUT_STATE_OUT)};
918   const auto cell_state_out_index{
919       node.getOutputs().at(model::operation::LSTM::Output::CELL_STATE_OUT)};
920   const auto output_index{node.getOutputs().at(model::operation::LSTM::Output::OUTPUT)};
921
922   const auto input_index{node.getInputs().at(model::operation::LSTM::Input::INPUT)};
923   const auto input_to_input_weights_index{
924       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
925   const auto input_to_forget_weights_index{
926       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
927   const auto input_to_cell_weights_index{
928       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
929   const auto input_to_output_weights_index{
930       node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
931   const auto recurrent_to_input_weights_index{
932       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
933   const auto recurrent_to_forget_weights_index{
934       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
935   const auto recurrent_to_cell_weights_index{
936       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
937   const auto recurrent_to_output_weights_index{
938       node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
939   const auto cell_to_input_weights_index{
940       node.getInputs().at(model::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
941   const auto cell_to_forget_weights_index{
942       node.getInputs().at(model::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
943   const auto cell_to_output_weights_index{
944       node.getInputs().at(model::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
945   const auto input_gate_bias_index{
946       node.getInputs().at(model::operation::LSTM::Input::INPUT_GATE_BIAS)};
947   const auto forget_gate_bias_index{
948       node.getInputs().at(model::operation::LSTM::Input::FORGET_GATE_BIAS)};
949   const auto cell_bias_index{node.getInputs().at(model::operation::LSTM::Input::CELL_BIAS)};
950   const auto output_gate_bias_index{
951       node.getInputs().at(model::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
952   const auto projection_weights_index{
953       node.getInputs().at(model::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
954   const auto projection_bias_index{
955       node.getInputs().at(model::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
956   const auto output_state_in_index{
957       node.getInputs().at(model::operation::LSTM::Input::OUTPUT_STATE_IN)};
958   const auto cell_state_in_index{node.getInputs().at(model::operation::LSTM::Input::CELL_STATE_IN)};
959   const auto cell_threshold = node.param().cell_threshold;
960   const auto projection_threshold = node.param().projection_threshold;
961
962   bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
963                                     _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
964   bool has_recurrent_to_input_weights =
965       _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
966       _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
967   bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
968   bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
969   bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
970                                 _ctx.at(projection_weights_index).shape().dim(1) != 0;
971   bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
972
973   // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
974   // true: no CIFG
975   // false: CIFG
976   // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
977   bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
978
979   // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
980   // But the cell_to_input_weights does not exist in regular CIFG although peephole.
981   // true: peephole
982   // false: no peephole
983   bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
984
985   // NOTE Although the projection weights has data the projection bias may not have data.
986   bool has_projection_param = has_projection_weights;
987
988   const auto activation = node.param().activation;
989   const auto cell_clip = cell_threshold;
990   const auto projection_clip = projection_threshold;
991   assert(cell_clip >= 0.f && projection_clip >= 0.f);
992
993   auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
994   auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
995   auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
996   auto output_alloc = _tensor_builder->at(output_index).get();
997
998   auto input_alloc = _tensor_builder->at(input_index).get();
999
1000   auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
1001   auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
1002   auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
1003   auto recurrent_to_forget_weights_alloc =
1004       _tensor_builder->at(recurrent_to_forget_weights_index).get();
1005   auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
1006   auto recurrent_to_output_weights_alloc =
1007       _tensor_builder->at(recurrent_to_output_weights_index).get();
1008
1009   auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
1010   auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
1011   auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
1012   auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
1013   auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
1014
1015   auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);
1016
1017   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>();
1018
1019   ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
1020   if (has_cifg_param)
1021   {
1022     auto input_to_input_weights_alloc =
1023         _tensor_builder->at(input_to_input_weights_index).get(); // optional
1024     auto recurrent_to_input_weights_alloc =
1025         _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
1026     auto cell_to_input_weights_handle =
1027         has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
1028                            : nullptr; // optional (non-cifg && peephole)
1029     auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
1030     lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
1031                                 recurrent_to_input_weights_alloc->handle(),
1032                                 cell_to_input_weights_handle, input_gate_bias_alloc->handle());
1033   }
1034   if (has_peephole_param)
1035   {
1036     auto cell_to_forget_weights_alloc =
1037         _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
1038     auto cell_to_output_weights_alloc =
1039         _tensor_builder->at(cell_to_output_weights_index).get(); // optional
1040     lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
1041                                     cell_to_output_weights_alloc->handle());
1042   }
1043   if (has_projection_param)
1044   {
1045     auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
1046     auto projection_bias_handle = has_projection_bias
1047                                       ? _tensor_builder->at(projection_bias_index).get()->handle()
1048                                       : nullptr; // optional
1049     lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
1050   }
1051
1052   fn->configure(
1053       input_alloc->handle(), input_to_forget_weights_alloc->handle(),
1054       input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
1055       recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
1056       recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
1057       cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
1058       cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
1059       output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
1060       lstm_params, act_info, cell_clip, projection_clip);
1061
1062   auto acl_fn = asAclFunction(std::move(fn));
1063
1064   _execution_builder->append(std::move(acl_fn));
1065 }
1066
1067 void KernelGenerator::visit(const model::operation::ReduceMax &node)
1068 {
1069   const auto output_index{node.getOutputs().at(0)};
1070   const auto input_index{node.getInputs().at(model::operation::ReduceMax::Input::INPUT)};
1071   const auto &axes{node.param().axes};
1072
1073   auto ofm_alloc = _tensor_builder->at(output_index).get();
1074   auto ifm_alloc = _tensor_builder->at(input_index).get();
1075   const auto frontend_layout = _current_subg_layout;
1076   const auto backend_layout = ifm_alloc->layout();
1077
1078   // Convert to ACL axes taking into account negative values and possible duplicates.
1079   std::set<std::uint32_t> acl_axes;
1080   const int ifm_rank = _ctx.at(input_index).shape().rank();
1081   for (int axis : axes)
1082   {
1083     if (axis < 0)
1084       axis += ifm_rank;
1085     acl_axes.insert(
1086         acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1087   }
1088
1089   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
1090
1091   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes,
1092                 arm_compute::ReduceOperation::MAX);
1093
1094   auto acl_fn = asAclFunction(std::move(fn));
1095
1096   _execution_builder->append(std::move(acl_fn));
1097 }
1098
1099 void KernelGenerator::visit(const model::operation::Comparison &node)
1100 {
1101   const auto output_index{node.getOutputs().at(0)};
1102   const auto input0_index{node.getInputs().at(model::operation::Comparison::Input::INPUT0)};
1103   const auto input1_index{node.getInputs().at(model::operation::Comparison::Input::INPUT1)};
1104
1105   const auto comparison_type = node.param().comparison_type;
1106
1107   auto output_alloc = _tensor_builder->at(output_index).get();
1108   auto input0_alloc = _tensor_builder->at(input0_index).get();
1109   auto input1_alloc = _tensor_builder->at(input1_index).get();
1110
1111   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLComparison>();
1112
1113   fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
1114                 (arm_compute::ComparisonOperation)comparison_type);
1115
1116   auto acl_fn = asAclFunction(std::move(fn));
1117
1118   _execution_builder->append(std::move(acl_fn));
1119 }
1120
1121 void KernelGenerator::visit(const model::operation::Pack &node)
1122 {
1123   const auto output_index{node.getOutputs().at(0)};
1124   auto axis{node.param().axis};
1125
1126   const auto output_rank = _ctx.at(output_index).shape().rank();
1127
1128   std::vector<model::OperandIndex> input_indexes;
1129   for (const auto &input_index : node.getInputs())
1130     input_indexes.emplace_back(input_index);
1131
1132   auto output = _tensor_builder->at(output_index).get()->handle();
1133   std::vector<arm_compute::ICLTensor *> inputs;
1134   for (const auto &input_index : input_indexes)
1135     inputs.emplace_back(_tensor_builder->at(input_index)->handle());
1136
1137   const auto frontend_layout = _current_subg_layout;
1138   const auto backend_layout = _tensor_builder->at(output_index).get()->layout();
1139
1140   if (output_rank >= 4 && _current_subg_layout != backend_layout)
1141   {
1142     throw std::runtime_error("ACL CL : Pack does not support different layouts between frontend "
1143                              "and backend in ranks above 4");
1144   }
1145
1146   if (axis < 0)
1147     axis += output_rank;
1148   axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
1149
1150   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStackLayer>();
1151
1152   fn->configure(inputs, axis, output);
1153
1154   _execution_builder->append(asAclFunction(std::move(fn)));
1155 }
1156
1157 void KernelGenerator::visit(const model::operation::Permute &node)
1158 {
1159   const auto ofm_idx{node.getOutputs().at(0)};
1160   const auto ifm_idx{node.getInputs().at(0)};
1161   const auto permute_type = node.getPermuteType();
1162   auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
1163   auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
1164   const auto rank = _ctx.at(ofm_idx).shape().rank();
1165   assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
1166
1167   std::unique_ptr<::arm_compute::IFunction> fn;
1168   arm_compute::PermutationVector pv;
1169   if (permute_type == model::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
1170   {
1171     // WHCN -> CWHN
1172     pv = arm_compute::PermutationVector{2, 0, 1};
1173
1174     auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();
1175
1176     l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
1177
1178     fn = std::move(l);
1179   }
1180   else if (permute_type == model::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
1181   {
1182     // CWHN -> WHCN
1183     pv = arm_compute::PermutationVector{1, 2, 0};
1184
1185     auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();
1186
1187     l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
1188
1189     fn = std::move(l);
1190   }
1191   else
1192   {
1193     auto l = nnfw::cpp14::make_unique<::arm_compute::CLCopy>();
1194
1195     l->configure(ifm_alloc->handle(), ofm_alloc->handle());
1196
1197     fn = std::move(l);
1198   }
1199
1200   auto acl_fn = asAclFunction(std::move(fn));
1201
1202   _execution_builder->append(std::move(acl_fn));
1203 }
1204
1205 void KernelGenerator::visit(const model::operation::RSQRT &node)
1206 {
1207   const auto ofm_index{node.getOutputs().at(0)};
1208   const auto ifm_index{node.getInputs().at(model::operation::RSQRT::Input::INPUT)};
1209
1210   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1211   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1212
1213   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>();
1214
1215   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1216
1217   _execution_builder->append(asAclFunction(std::move(fn)));
1218 }
1219
1220 void KernelGenerator::visit(const model::operation::ReLU &node)
1221 {
1222   const auto output_index{node.getOutputs().at(0)};
1223   const auto input_index{node.getInputs().at(model::operation::ReLU::Input::INPUT)};
1224
1225   auto output_alloc = _tensor_builder->at(output_index).get();
1226   auto input_alloc = _tensor_builder->at(input_index).get();
1227
1228   auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();
1229
1230   const ::arm_compute::ActivationLayerInfo act_info{
1231       ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
1232
1233   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1234
1235   auto acl_fn = asAclFunction(std::move(fn));
1236
1237   _execution_builder->append(std::move(acl_fn));
1238 }
1239
1240 void KernelGenerator::visit(const model::operation::ResizeBilinear &node)
1241 {
1242   const auto ofm_index{node.getOutputs().at(0)};
1243
1244   const auto ifm_index{node.getInputs().at(model::operation::ResizeBilinear::Input::INPUT)};
1245
1246   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1247   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1248
1249   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLScale>();
1250
1251   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
1252                 ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
1253                 ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
1254
1255   auto acl_fn = asAclFunction(std::move(fn));
1256
1257   _execution_builder->append(std::move(acl_fn));
1258 }
1259
1260 void KernelGenerator::visit(const model::operation::ReLU1 &node)
1261 {
1262   const auto ofm_index{node.getOutputs().at(0)};
1263   const auto ifm_index{node.getInputs().at(model::operation::ReLU1::Input::INPUT)};
1264
1265   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1266   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1267
1268   const ::arm_compute::ActivationLayerInfo act_info{
1269       ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
1270
1271   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1272
1273   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
1274
1275   auto acl_fn = asAclFunction(std::move(fn));
1276
1277   _execution_builder->append(std::move(acl_fn));
1278 }
1279
1280 void KernelGenerator::visit(const model::operation::ReLU6 &node)
1281 {
1282   const auto ofm_index{node.getOutputs().at(0)};
1283   const auto ifm_index{node.getInputs().at(model::operation::ReLU6::Input::INPUT)};
1284
1285   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1286   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1287
1288   const ::arm_compute::ActivationLayerInfo act_info{
1289       ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
1290
1291   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1292
1293   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
1294
1295   auto acl_fn = asAclFunction(std::move(fn));
1296
1297   _execution_builder->append(std::move(acl_fn));
1298 }
1299
1300 void KernelGenerator::visit(const model::operation::RNN &node)
1301 {
1302   const auto output_index{node.getOutputs().at(model::operation::RNN::Output::OUTPUT)};
1303   const auto hidden_state_out_index{
1304       node.getOutputs().at(model::operation::RNN::Output::HIDDEN_STATE_OUT)};
1305
1306   const auto input_index{node.getInputs().at(model::operation::RNN::Input::INPUT)};
1307   const auto weights_index{node.getInputs().at(model::operation::RNN::Input::WEIGHTS)};
1308   const auto recurrent_weights_index{
1309       node.getInputs().at(model::operation::RNN::Input::RECURRENT_WEIGHTS)};
1310   const auto bias_index{node.getInputs().at(model::operation::RNN::Input::BIAS)};
1311   const auto hidden_state_in_index{
1312       node.getInputs().at(model::operation::RNN::Input::HIDDEN_STATE_IN)};
1313
1314   const auto activation = node.param().activation;
1315
1316   auto output_alloc = _tensor_builder->at(output_index).get();
1317   auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
1318
1319   auto input_alloc = _tensor_builder->at(input_index).get();
1320   auto weights_alloc = _tensor_builder->at(weights_index).get();
1321   auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
1322   auto bias_alloc = _tensor_builder->at(bias_index).get();
1323   auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
1324   auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);
1325
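  // Copy the previous hidden state into the hidden-state output tensor first; the RNN layer below
  // then uses that tensor as the recurrent state and updates it in place.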
1326   auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>();
1327   copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
1328   _execution_builder->append(asAclFunction(std::move(copy_layer)));
1329
1330   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>(
1331       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
1332   fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
1333                 bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
1334                 act_info);
1335   _execution_builder->append(asAclFunction(std::move(fn)));
1336 }
1337
1338 void KernelGenerator::visit(const model::operation::Floor &node)
1339 {
1340   const auto ofm_index{node.getOutputs().at(0)};
1341   const auto ifm_index{node.getInputs().at(model::operation::Floor::Input::INPUT)};
1342
1343   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1344   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1345
1346   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLFloor>();
1347
1348   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1349
1350   auto acl_fn = asAclFunction(std::move(fn));
1351
1352   _execution_builder->append(std::move(acl_fn));
1353 }
1354
1355 void KernelGenerator::visit(const model::operation::SpaceToBatchND &node)
1356 {
1357   const auto ofm_index{node.getOutputs().at(0)};
1358   const auto ifm_index{node.getInputs().at(model::operation::SpaceToBatchND::Input::INPUT)};
1359   const auto block_size_index{
1360       node.getInputs().at(model::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
1361   const auto paddings_index{node.getInputs().at(model::operation::SpaceToBatchND::Input::PADDINGS)};
1362
1363   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1364   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1365   auto block_size_alloc = _tensor_builder->at(block_size_index).get();
1366   auto paddings_alloc = _tensor_builder->at(paddings_index).get();
1367
1368   assert(_ctx.at(block_size_index).isConstant());
1369   assert(_ctx.at(paddings_index).isConstant());
1370
1371   std::unique_ptr<::arm_compute::IFunction> fn;
1372   if (_ctx.at(ofm_index).typeInfo().type() == model::DataType::QUANT8_ASYMM)
1373   {
1374     // NOTE CLSpaceToBatchLayer has a bug: it pads with value 0 even when the zero point of
1375     // QASYMM8 is not 0, so the EX kernel CLSpaceToBatchND is used instead for quantized tensors.
1376     auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchND>();
1377     l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
1378                  ofm_alloc->handle());
1379     fn = std::move(l);
1380   }
1381   else
1382   {
1383     auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchLayer>();
1384     l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
1385                  ofm_alloc->handle());
1386     fn = std::move(l);
1387   }
1388
1389   auto acl_fn = asAclFunction(std::move(fn));
1390
1391   _execution_builder->append(std::move(acl_fn));
1392 }
1393
1394 void KernelGenerator::visit(const model::operation::SpaceToDepth &node)
1395 {
1396   const auto ofm_index{node.getOutputs().at(0)};
1397   const auto ifm_index{node.getInputs().at(model::operation::SpaceToDepth::Input::INPUT)};
1398
1399   auto block_size = node.param().block_size;
1400
1401   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1402   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1403
1404   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>();
1405
1406   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
1407
1408   auto acl_fn = asAclFunction(std::move(fn));
1409
1410   _execution_builder->append(std::move(acl_fn));
1411 }
1412
1413 void KernelGenerator::visit(const model::operation::L2Pool2D &node)
1414 {
1415   const auto ofm_index{node.getOutputs().at(0)};
1416   const auto ifm_index{node.getInputs().at(model::operation::L2Pool2D::Input::INPUT)};
1417
1418   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
1419   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
1420
1421   uint32_t kw = node.param().kw;
1422   uint32_t kh = node.param().kh;
1423   const auto stride = node.param().stride;
1424   const auto padding =
1425       neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
1426   const auto activation = node.param().activation;
1427
1428   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1429   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1430
1431   ::arm_compute::PoolingLayerInfo info{
1432       ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
1433       ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)};
1434
1435   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
1436
1437   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
1438
1439   auto acl_fn = asAclFunction(std::move(fn));
1440
1441   _execution_builder->append(std::move(acl_fn));
1442
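  // The fused activation, if any, is appended as a separate in-place activation function.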
1443   ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
1444 }
1445
1446 void KernelGenerator::visit(const model::operation::EmbeddingLookup &node)
1447 {
1448   const auto output_index{node.getOutputs().at(0)};
1449   const auto lookups_index{node.getInputs().at(model::operation::EmbeddingLookup::Input::LOOKUPS)};
1450   const auto values_index{node.getInputs().at(model::operation::EmbeddingLookup::Input::VALUES)};
1451
1452   auto output_alloc = _tensor_builder->at(output_index).get();
1453   auto lookups_alloc = _tensor_builder->at(lookups_index).get();
1454   auto values_alloc = _tensor_builder->at(values_index).get();
1455
1456   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>();
1457
1458   fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
1459
1460   auto acl_fn = asAclFunction(std::move(fn));
1461
1462   _execution_builder->append(std::move(acl_fn));
1463 }
1464
1465 void KernelGenerator::visit(const model::operation::L2Normalization &node)
1466 {
1467   const auto ofm_index{node.getOutputs().at(0)};
1468   const auto ifm_index{node.getInputs().at(model::operation::L2Normalization::Input::INPUT)};
1469
1470   // {CL|Neon}L2Normalization performs the reduction only along dimension 0.
1471   // L2 Normalization always performs the reduction along the depth axis.
1472   // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
1473   // choosing the normalization parameters as below.
1474
1475   const auto &ifm_shape = _ctx.at(ifm_index).shape();
1476   // TODO Support an optional constant axis along which the normalization is performed
1477   const auto normalization_axis = ifm_shape.rank() - 1;
1478   int32_t radius =
1479       2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
1480   float alpha = 1.0f;                            // Chosen so that the implementation's internal alpha_ becomes 1
1481   float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
1482   float bias = 0.0f;                             // Don't offset the reduction.
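  // With bias = 0, alpha = 1 and beta = 0.5 (and is_scaled = false below), CROSS_MAP normalization
  // computes x / (bias + alpha * sum(x^2))^beta = x / sqrt(sum(x^2)) over the whole depth, i.e. an
  // L2 normalization along the depth axis.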
1483
1484   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1485   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1486
1487   const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
1488                                                                radius, alpha, beta, bias, false);
1489
1490   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>();
1491
1492   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
1493
1494   auto acl_fn = asAclFunction(std::move(fn));
1495
1496   _execution_builder->append(std::move(acl_fn));
1497 }
1498
1499 void KernelGenerator::visit(const model::operation::HashtableLookup &node)
1500 {
1501   const auto output_index{node.getOutputs().at(model::operation::HashtableLookup::Output::OUTPUT)};
1502   const auto hits_index{node.getOutputs().at(model::operation::HashtableLookup::Output::HITS)};
1503
1504   const auto lookups_index{node.getInputs().at(model::operation::HashtableLookup::Input::LOOKUPS)};
1505   const auto keys_index{node.getInputs().at(model::operation::HashtableLookup::Input::KEYS)};
1506   const auto values_index{node.getInputs().at(model::operation::HashtableLookup::Input::VALUES)};
1507
1508   auto output_alloc = _tensor_builder->at(output_index).get();
1509   auto hits_alloc = _tensor_builder->at(hits_index).get();
1510
1511   auto lookups_alloc = _tensor_builder->at(lookups_index).get();
1512   auto keys_alloc = _tensor_builder->at(keys_index).get();
1513   auto values_alloc = _tensor_builder->at(values_index).get();
1514
1515   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>();
1516
1517   fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
1518                 output_alloc->handle(), hits_alloc->handle());
1519
1520   auto acl_fn = asAclFunction(std::move(fn));
1521
1522   _execution_builder->append(std::move(acl_fn));
1523 }
1524
1525 void KernelGenerator::visit(const model::operation::PReLU &node)
1526 {
1527   const auto ofm_index{node.getOutputs().at(0)};
1528   const auto ifm_index{node.getInputs().at(model::operation::PReLU::Input::INPUT)};
1529   const auto alpha_index{node.getInputs().at(model::operation::PReLU::Input::ALPHA)};
1530
1531   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1532   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1533   auto alpha_alloc = _tensor_builder->at(alpha_index).get();
1534
1535   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>();
1536
1537   fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
1538
1539   auto acl_fn = asAclFunction(std::move(fn));
1540
1541   _execution_builder->append(std::move(acl_fn));
1542 }
1543
1544 void KernelGenerator::visit(const model::operation::TransposeConv &node)
1545 {
1546   const auto ofm_index{node.getOutputs().at(0)};
1547   const auto output_shape_index{
1548       node.getInputs().at(model::operation::TransposeConv::Input::OUTPUT_SHAPE)};
1549   const auto ker_index{node.getInputs().at(model::operation::TransposeConv::Input::KERNEL)};
1550   const auto ifm_index{node.getInputs().at(model::operation::TransposeConv::Input::INPUT)};
1551
1552   const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
1553   const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
1554   const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout);
1555
1556   const auto stride = node.param().stride;
1557
1558   assert((node.param().padding.type == model::PaddingType::SAME) ||
1559          (node.param().padding.type == model::PaddingType::VALID));
1560   auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1561                                                 ker_shape.W, ker_shape.H);
1562
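  // With VALID padding, the natural transposed-convolution output extent is
  // (ifm - 1) * stride + ker. Any surplus of the requested output shape over that extent is
  // passed to ACL as the invalid right/bottom border.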
1563   uint32_t invalid_horizontal = 0;
1564   uint32_t invalid_vertical = 0;
1565   if (node.param().padding.type == model::PaddingType::VALID)
1566   {
1567     invalid_horizontal =
1568         ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1569     invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1570   }
1571
1572   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1573   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1574   auto ker_alloc = _tensor_builder->at(ker_index).get();
1575
1576   const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1577
1578   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>(
1579       _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
1580
1581   fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
1582                 invalid_horizontal, invalid_vertical);
1583
1584   auto acl_fn = asAclFunction(std::move(fn));
1585
1586   _execution_builder->append(std::move(acl_fn));
1587 }
1588
1589 void KernelGenerator::visit(const model::operation::SQRT &node)
1590 {
1591   const auto output_index{node.getOutputs().at(0)};
1592   const auto input_index{node.getInputs().at(model::operation::SQRT::Input::INPUT)};
1593
1594   auto output_alloc = _tensor_builder->at(output_index).get();
1595   auto input_alloc = _tensor_builder->at(input_index).get();
1596
1597   const ::arm_compute::ActivationLayerInfo act_info{
1598       ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
1599
1600   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1601
1602   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1603
1604   auto acl_fn = asAclFunction(std::move(fn));
1605
1606   _execution_builder->append(std::move(acl_fn));
1607 }
1608
1609 void KernelGenerator::visit(const model::operation::LogicalOr &node)
1610 {
1611   const auto output_index{node.getOutputs().at(0)};
1612   const auto input0_index{node.getInputs().at(model::operation::LogicalOr::Input::INPUT0)};
1613   const auto input1_index{node.getInputs().at(model::operation::LogicalOr::Input::INPUT1)};
1614
1615   auto output_alloc = _tensor_builder->at(output_index).get();
1616   auto input0_alloc = _tensor_builder->at(input0_index).get();
1617   auto input1_alloc = _tensor_builder->at(input1_index).get();
1618
1619   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>();
1620
1621   fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
1622
1623   auto acl_fn = asAclFunction(std::move(fn));
1624
1625   _execution_builder->append(std::move(acl_fn));
1626 }
1627
1628 void KernelGenerator::visit(const model::operation::LogicalNot &node)
1629 {
1630   const auto output_index{node.getOutputs().at(0)};
1631   const auto input_index{node.getInputs().at(model::operation::LogicalNot::Input::INPUT)};
1632
1633   auto output_alloc = _tensor_builder->at(output_index).get();
1634   auto input_alloc = _tensor_builder->at(input_index).get();
1635
1636   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>();
1637
1638   fn->configure(input_alloc->handle(), output_alloc->handle());
1639
1640   auto acl_fn = asAclFunction(std::move(fn));
1641
1642   _execution_builder->append(std::move(acl_fn));
1643 }
1644
1645 void KernelGenerator::visit(const model::operation::SquaredDifference &node)
1646 {
1647   const auto ofm_index{node.getOutputs().at(0)};
1648   const auto lhs_index{node.getInputs().at(model::operation::SquaredDifference::Input::LHS)};
1649   const auto rhs_index{node.getInputs().at(model::operation::SquaredDifference::Input::RHS)};
1650
1651   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1652   auto lhs_alloc = _tensor_builder->at(lhs_index).get();
1653   auto rhs_alloc = _tensor_builder->at(rhs_index).get();
1654
1655   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>();
1656
1657   fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
1658
1659   auto acl_fn = asAclFunction(std::move(fn));
1660
1661   _execution_builder->append(std::move(acl_fn));
1662 }
1663
1664 void KernelGenerator::visit(const model::operation::TopKV2 &node)
1665 {
1666   const auto outputValues_index{
1667       node.getOutputs().at(model::operation::TopKV2::Output::OUTPUT_VALUES)};
1668   const auto outputIndices_index{
1669       node.getOutputs().at(model::operation::TopKV2::Output::OUTPUT_INDICES)};
1670
1671   const auto inputData_index{node.getInputs().at(model::operation::TopKV2::Input::INPUT)};
1672
1673   // Currently, only rank-1 or rank-2 (vector or batched-vector) input is supported.
1674   assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1675          _ctx.at(inputData_index).shape().rank() == 2);
1676
1677   const auto k = node.param().k;
1678
1679   auto values_alloc = _tensor_builder->at(outputValues_index).get();
1680   auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
1681   auto input_alloc = _tensor_builder->at(inputData_index).get();
1682
1683   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>();
1684
1685   fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());
1686
1687   auto acl_fn = asAclFunction(std::move(fn));
1688
1689   _execution_builder->append(std::move(acl_fn));
1690 }
1691
1692 void KernelGenerator::visit(const model::operation::Gather &node)
1693 {
1694   const auto ofm_index{node.getOutputs().at(0)};
1695
1696   const auto ifm_index{node.getInputs().at(model::operation::Gather::Input::INPUT)};
1697   const auto indices_index{node.getInputs().at(model::operation::Gather::Input::INDICES)};
1698
1699   const auto ifm_shape = _ctx.at(ifm_index).shape();
1700
1701   const auto axis_value = node.param().axis;
1702   const int axis =
1703       ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value();
1704
1705   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1706   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1707   auto indices_alloc = _tensor_builder->at(indices_index).get();
1708
1709   // NOTE The frontend layout and backend layout must be the same for this operation.
1710   //      If they differ, an extra stage would be needed to permute the output tensor, which is
1711   //      inefficient even if it works; it would be better to keep these backend tensors in the
1712   //      same layout.
1713   //      Another point to consider: this operation depends on the layout of the model. For
1714   //      example, if an NHWC model has this operation with output rank == 4, indices rank == 2
1715   //      and axis == 2, this operation should work on the axes W and C, but W and C are not
1716   //      contiguous in NCHW, so an NCHW backend cannot handle this case.
1717   const auto backend_layout = ofm_alloc->layout();
1718   UNUSED_RELEASE(backend_layout);
1719   assert(backend_layout == ifm_alloc->layout());
1720   assert(backend_layout == indices_alloc->layout());
1721   assert(ifm_shape.rank() < 4 || _current_subg_layout == backend_layout);
1722
1723   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLGatherEx>();
1724
1725   fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
1726
1727   auto acl_fn = asAclFunction(std::move(fn));
1728
1729   _execution_builder->append(std::move(acl_fn));
1730 }
1731
1732 void KernelGenerator::visit(const model::operation::Neg &node)
1733 {
1734   const auto ofm_index{node.getOutputs().at(0)};
1735   const auto ifm_index{node.getInputs().at(model::operation::Neg::Input::INPUT)};
1736
1737   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1738   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1739
1740   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNeg>();
1741
1742   fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1743
1744   auto acl_fn = asAclFunction(std::move(fn));
1745
1746   _execution_builder->append(std::move(acl_fn));
1747 }
1748
1749 void KernelGenerator::visit(const model::operation::Abs &node)
1750 {
1751   const auto output_index{node.getOutputs().at(0)};
1752   const auto input_index{node.getInputs().at(model::operation::Abs::Input::INPUT)};
1753
1754   auto output_alloc = _tensor_builder->at(output_index).get();
1755   auto input_alloc = _tensor_builder->at(input_index).get();
1756
1757   const ::arm_compute::ActivationLayerInfo act_info{
1758       ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
1759
1760   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1761
1762   fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1763
1764   auto acl_fn = asAclFunction(std::move(fn));
1765
1766   _execution_builder->append(std::move(acl_fn));
1767 }
1768
1769 void KernelGenerator::visit(const model::operation::ArgMax &node)
1770 {
1771   const auto ofm_index{node.getOutputs().at(0)};
1772   const auto ifm_index{node.getInputs().at(model::operation::ArgMax::Input::INPUT)};
1773
1774   auto ifm_shape = _ctx.at(ifm_index).shape();
1775   auto ofm_shape = _ctx.at(ofm_index).shape();
1776
1777   assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1778
1779   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1780   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1781   const auto ifm_rank = ifm_shape.rank();
1782   auto frontend_layout = _current_subg_layout;
1783   auto backend_layout = ifm_alloc->layout();
1784
1785   int axis_value = node.param().axis;
1786   if (axis_value < 0)
1787   {
1788     axis_value += ifm_rank;
1789   }
1790
1791   auto acl_axis =
1792       acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
1793
1794   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>();
1795
1796   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
1797                 ::arm_compute::ArgOperation::MAX);
1798
1799   auto acl_fn = asAclFunction(std::move(fn));
1800
1801   _execution_builder->append(std::move(acl_fn));
1802 }
1803
1804 void KernelGenerator::visit(const model::operation::Dequantize &node)
1805 {
1806   const auto output_index{node.getOutputs().at(0)};
1807   const auto input_index{node.getInputs().at(model::operation::Dequantize::Input::INPUT)};
1808
1809   auto output_alloc = _tensor_builder->at(output_index).get();
1810   auto input_alloc = _tensor_builder->at(input_index).get();
1811
1812   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>();
1813
1814   fn->configure(input_alloc->handle(), output_alloc->handle());
1815
1816   auto acl_fn = asAclFunction(std::move(fn));
1817
1818   _execution_builder->append(std::move(acl_fn));
1819 }
1820
1821 void KernelGenerator::visit(const model::operation::Mean &node)
1822 {
1823   const auto ofm_index{node.getOutputs().at(0)};
1824   const auto ifm_index{node.getInputs().at(model::operation::Mean::Input::INPUT)};
1825   const auto &axes{node.param().axes};
1826   const auto keep_dims{node.param().keep_dims};
1827
1828   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1829   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1830   const auto frontend_layout = _current_subg_layout;
1831   const auto backend_layout = ifm_alloc->layout();
1832
1833   // Convert to ACL axes taking into account negative values and possible duplicates.
1834   std::set<std::uint32_t> acl_axes;
1835   const int ifm_rank = _ctx.at(ifm_index).shape().rank();
1836   for (int axis : axes)
1837   {
1838     if (axis < 0)
1839       axis += ifm_rank;
1840     acl_axes.insert(
1841         acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1842   }
1843
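  // CLReduceMean expects the reduction axes as a Coordinates object, so flatten the deduplicated
  // axis set into one.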
1844   arm_compute::Coordinates reduce_axes;
1845   for (const auto axis : acl_axes)
1846   {
1847     reduce_axes.set(reduce_axes.num_dimensions(), axis);
1848   }
1849
1850   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceMean>();
1851
1852   fn->configure(ifm_alloc->handle(), reduce_axes, keep_dims, ofm_alloc->handle());
1853
1854   auto acl_fn = asAclFunction(std::move(fn));
1855
1856   _execution_builder->append(std::move(acl_fn));
1857 }
1858
1859 void KernelGenerator::visit(const model::operation::LocalResponseNormalization &node)
1860 {
1861   const auto ofm_index{node.getOutputs().at(0)};
1862   const auto ifm_index{
1863       node.getInputs().at(model::operation::LocalResponseNormalization::Input::INPUT)};
1864
1865   auto radius = node.param().radius;
1866   auto alpha = node.param().alpha;
1867   auto beta = node.param().beta;
1868   auto bias = node.param().bias;
1869
1870   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1871   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1872
1873   const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1874       ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
1875
1876   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>();
1877
1878   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
1879
1880   auto acl_fn = asAclFunction(std::move(fn));
1881
1882   _execution_builder->append(std::move(acl_fn));
1883 }
1884
1885 void KernelGenerator::visit(const model::operation::DepthToSpace &node)
1886 {
1887   const auto output_index{node.getOutputs().at(0)};
1888   const auto input_index{node.getInputs().at(model::operation::DepthToSpace::Input::INPUT)};
1889
1890   auto block_size = node.param().block_size;
1891   assert(block_size > 0);
1892
1893   auto output_alloc = _tensor_builder->at(output_index).get();
1894   auto input_alloc = _tensor_builder->at(input_index).get();
1895
1896   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>();
1897
1898   fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
1899
1900   auto acl_fn = asAclFunction(std::move(fn));
1901
1902   _execution_builder->append(std::move(acl_fn));
1903 }
1904
1905 void KernelGenerator::visit(const model::operation::ReduceMin &node)
1906 {
1907   const auto ofm_index{node.getOutputs().at(0)};
1908   const auto ifm_index{node.getInputs().at(model::operation::ReduceMin::Input::INPUT)};
1909   const auto &axes{node.param().axes};
1910
1911   auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1912   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1913   const auto frontend_layout = _current_subg_layout;
1914   const auto backend_layout = ifm_alloc->layout();
1915
1916   // Convert to ACL axes taking into account negative values and possible duplicates.
1917   std::set<std::uint32_t> acl_axes;
1918   const int ifm_rank = _ctx.at(ifm_index).shape().rank();
1919   for (int axis : axes)
1920   {
1921     if (axis < 0)
1922       axis += ifm_rank;
1923     acl_axes.insert(
1924         acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1925   }
1926
1927   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
1928
1929   fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes,
1930                 ::arm_compute::ReduceOperation::MIN);
1931
1932   auto acl_fn = asAclFunction(std::move(fn));
1933
1934   _execution_builder->append(std::move(acl_fn));
1935 }
1936
1937 void KernelGenerator::visit(const model::operation::Split &node)
1938 {
1939   const auto ifm_index{node.getInputs().at(model::operation::Split::Input::INPUT)};
1940
1941   assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1942
1943   const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1944   std::vector<model::OperandIndex> output_indexes;
1945   for (const auto &output : node.getOutputs())
1946     output_indexes.emplace_back(output);
1947
1948   auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1949   std::vector<arm_compute::ICLTensor *> output_allocs;
1950   for (const auto &ofm_ind : output_indexes)
1951     output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
1952
1953   const auto frontend_layout = _current_subg_layout;
1954   const auto backend_layout = ifm_alloc->layout();
1955   auto axis = node.param().axis;
1956   if (axis < 0)
1957     axis += ifm_rank;
1958   axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1959
1960   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSplit>();
1961
1962   fn->configure(ifm_alloc->handle(), output_allocs, axis);
1963
1964   _execution_builder->append(asAclFunction(std::move(fn)));
1965 }
1966
1967 void KernelGenerator::visit(const model::operation::Unpack &node)
1968 {
1969   const auto input_index{node.getInputs().at(model::operation::Unpack::Input::INPUT)};
1970   auto axis{node.param().axis};
1971
1972   const auto input_rank = _ctx.at(input_index).shape().rank();
1973
1974   std::vector<model::OperandIndex> output_indexes;
1975   for (const auto &output_index : node.getOutputs())
1976     output_indexes.emplace_back(output_index);
1977
1978   auto input = _tensor_builder->at(input_index).get()->handle();
1979   std::vector<arm_compute::ICLTensor *> outputs;
1980   for (const auto &output_index : output_indexes)
1981     outputs.emplace_back(_tensor_builder->at(output_index)->handle());
1982
1983   const auto frontend_layout = _current_subg_layout;
1984   const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
1985   if (axis < 0)
1986     axis += input_rank;
1987   axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1988
1989   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>();
1990
1991   fn->configure(input, outputs, axis);
1992
1993   _execution_builder->append(asAclFunction(std::move(fn)));
1994 }
1995
1996 void KernelGenerator::visit(const model::operation::Pad &node)
1997 {
1998   const auto input_index{node.getInputs().at(model::operation::Pad::Input::INPUT)};
1999   const auto pad_index{node.getInputs().at(model::operation::Pad::Input::PAD)};
2000   const auto output_index{node.getOutputs().at(0)};
2001   assert(_ctx.at(pad_index).isConstant());
2002
2003   auto rank = _ctx.at(pad_index).shape().dim(0);
2004   auto pad_base = _ctx.at(pad_index).data().base();
2005
2006   auto input_type = _ctx.at(input_index).typeInfo();
2007   auto data_type = acl_common::asDataType(input_type.type());
2008   auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
2009   const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
2010
2011   auto input = _tensor_builder->at(input_index).get()->handle();
2012   auto output = _tensor_builder->at(output_index).get()->handle();
2013
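  // The pad operand has shape [rank, 2]; row n holds the (before, after) padding amounts for
  // frontend axis n, which is remapped onto the corresponding backend/ACL axis below.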
2014   ::arm_compute::PaddingList padding_list;
2015   padding_list.resize(rank);
2016   for (int32_t n = 0; n < rank; ++n)
2017   {
2018     const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
2019
2020     const auto frontend_layout = _current_subg_layout;
2021     const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
2022     const auto axis =
2023         acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
2024     padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
2025   }
2026   auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>();
2027   fn->configure(input, output, padding_list, pixel_value);
2028
2029   _execution_builder->append(asAclFunction(std::move(fn)));
2030 }
2031
2032 } // namespace acl_cl
2033 } // namespace backend
2034 } // namespace neurun