2 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #include "KernelGenerator.h"
19 #include <arm_compute/runtime/CL/CLFunctions.h> // Include all ARM Compute CL functions
20 #include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions
22 #include <AclFunction.h>
26 #include "kernel/ConcatLayer.h"
27 #include "model/Index.h"
28 #include "model/DataType.h"
29 #include "model/InternalType.h"
30 #include "compiler/IExecutionBuilder.h"
31 #include "exec/NopFunction.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "util/Padding.h"
36 using ::neurun::compiler::IExecutionBuilder;
45 using ::neurun::backend::acl_common::asAclFunction;
50 class ActivationBuilder
53 explicit ActivationBuilder(IExecutionBuilder &builder) : _builder(builder)
59 void appendReLU(::arm_compute::ICLTensor *ifm_alloc);
60 void appendReLU1(::arm_compute::ICLTensor *ifm_alloc);
61 void appendReLU6(::arm_compute::ICLTensor *ifm_alloc);
64 void append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc);
67 IExecutionBuilder &_builder;
70 void ActivationBuilder::appendReLU(::arm_compute::ICLTensor *ifm_alloc)
72 const ::arm_compute::ActivationLayerInfo act_info{
73 ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
75 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
77 fn->configure(ifm_alloc, nullptr, act_info);
79 auto acl_fn = asAclFunction(std::move(fn));
81 _builder.append(std::move(acl_fn));
84 void ActivationBuilder::appendReLU1(::arm_compute::ICLTensor *ifm_alloc)
86 const ::arm_compute::ActivationLayerInfo act_info{
87 ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
89 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
91 fn->configure(ifm_alloc, nullptr, act_info);
93 auto acl_fn = asAclFunction(std::move(fn));
95 _builder.append(std::move(acl_fn));
98 void ActivationBuilder::appendReLU6(::arm_compute::ICLTensor *ifm_alloc)
100 const ::arm_compute::ActivationLayerInfo act_info{
101 ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
103 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
105 fn->configure(ifm_alloc, nullptr, act_info);
107 auto acl_fn = asAclFunction(std::move(fn));
109 _builder.append(std::move(acl_fn));
112 void ActivationBuilder::append(model::Activation code, ::arm_compute::ICLTensor *ifm_alloc)
116 case model::Activation::NONE:
121 case model::Activation::RELU:
123 appendReLU(ifm_alloc);
126 case model::Activation::RELU1:
128 appendReLU1(ifm_alloc);
131 case model::Activation::RELU6:
133 appendReLU6(ifm_alloc);
138 throw std::runtime_error("Not supported, yet");
146 KernelGenerator::KernelGenerator(const neurun::model::Operands &ctx,
147 const std::shared_ptr<TensorBuilder> &tensor_builder)
148 : _ctx(ctx), _tensor_builder(tensor_builder), _current_subg_layout(ir::Layout::UNKNOWN)
// Lowers one subgraph: records its frontend layout (used by every operation
// visitor below), then walks the operations in order, bracketing each node
// with the tensor builder's pre/post hooks.
// NOTE(review): the per-node dispatch (e.g. node.accept(*this)) between
// preVisit and postVisit is not visible in this chunk — confirm against the
// full file.
void KernelGenerator::visit(const model::Subgraph &subgraph)
_current_subg_layout = subgraph.getLayout();
for (const auto &e : subgraph.operations())
const auto &node = *(e.node);
_tensor_builder->preVisit(node);
_tensor_builder->postVisit(node);
165 void KernelGenerator::visit(const model::operation::BatchToSpaceND &node)
167 const auto ofm_index{node.getOutputs().at(0)};
168 const auto ifm_index{node.getInputs().at(model::operation::BatchToSpaceND::Input::INPUT)};
169 const auto block_size_index{
170 node.getInputs().at(model::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
172 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
173 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
174 auto block_size_alloc = _tensor_builder->at(block_size_index).get();
176 assert(_ctx.at(block_size_index).isConstant());
178 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBatchToSpaceLayer>();
180 fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
182 auto acl_fn = asAclFunction(std::move(fn));
184 _execution_builder->append(std::move(acl_fn));
187 void KernelGenerator::visit(const model::operation::Cast &node)
189 const auto ofm_index{node.getOutputs().at(0)};
190 const auto ifm_index{node.getInputs().at(model::operation::Cast::Input::INPUT)};
192 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
193 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
195 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>();
197 fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
199 auto acl_fn = asAclFunction(std::move(fn));
201 _execution_builder->append(std::move(acl_fn));
204 void KernelGenerator::visit(const model::operation::Conv2D &node)
206 using model::operation::Conv2D;
208 const auto ofm_index{node.getOutputs().at(0)};
209 const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
210 const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
211 const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
213 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
214 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
215 // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
216 const auto &ker_shape = _ctx.at(ker_index).shape();
217 const auto ker_height = ker_shape.dim(1);
218 const auto ker_width = ker_shape.dim(2);
220 const auto stride = node.param().stride;
221 const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
222 stride, ker_width, ker_height);
223 const auto activation = node.param().activation;
225 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
226 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
227 auto ker_alloc = _tensor_builder->at(ker_index).get();
228 auto bias_alloc = _tensor_builder->at(bias_index).get();
230 const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
231 const auto act_info = acl_common::asActivationLayerInfo(activation);
233 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLConvolutionLayer>(
234 _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
236 fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
237 conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
239 _execution_builder->append(asAclFunction(std::move(fn)));
242 void KernelGenerator::visit(const model::operation::DepthwiseConv2D &node)
244 using model::operation::DepthwiseConv2D;
246 const auto ofm_index{node.getOutputs().at(0)};
247 const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
248 const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
249 const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};
251 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
252 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
253 // Kernel format is [1, kernel_height, kernel_width, depth_out].
254 const auto &ker_shape = _ctx.at(ker_index).shape();
255 const auto ker_height = ker_shape.dim(1);
256 const auto ker_width = ker_shape.dim(2);
258 const auto stride = node.param().stride;
259 const auto padding = neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape,
260 stride, ker_width, ker_height);
261 const auto multiplier = node.param().multiplier;
262 const auto activation = node.param().activation;
264 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
265 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
266 auto ker_alloc = _tensor_builder->at(ker_index).get();
267 auto bias_alloc = _tensor_builder->at(bias_index).get();
269 const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
270 const auto act_info = acl_common::asActivationLayerInfo(activation);
272 if (ker_height == 3 && ker_width == 3)
274 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer3x3>(
275 _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
277 fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
278 ofm_alloc->handle(), conv_info, multiplier, act_info);
280 _execution_builder->append(asAclFunction(std::move(fn)));
284 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();
286 fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
287 ofm_alloc->handle(), conv_info, multiplier, act_info);
289 _execution_builder->append(asAclFunction(std::move(fn)));
293 void KernelGenerator::visit(const model::operation::MaxPool2D &node)
295 const auto ofm_index{node.getOutputs().at(0)};
296 const auto ifm_index{node.getInputs().at(model::operation::MaxPool2D::Input::INPUT)};
298 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
299 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
301 const auto kh = node.param().kh;
302 const auto kw = node.param().kw;
303 const auto stride = node.param().stride;
305 neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
306 const auto activation = node.param().activation;
308 VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
309 VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
310 VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
311 VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
312 VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
313 VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
314 VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
315 VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
316 VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
317 VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
318 VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
319 VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
321 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
322 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
324 ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
325 ::arm_compute::Size2D{kw, kh},
326 acl_common::asPadStrideInfo(padding, stride)};
328 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
330 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
332 auto acl_fn = asAclFunction(std::move(fn));
334 _execution_builder->append((std::move(acl_fn)));
336 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
339 void KernelGenerator::visit(const model::operation::AvgPool2D &node)
341 const auto ofm_index{node.getOutputs().at(0)};
342 const auto ifm_index{node.getInputs().at(model::operation::AvgPool2D::Input::INPUT)};
344 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
345 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
347 const auto kh = node.param().kh;
348 const auto kw = node.param().kw;
349 const auto stride = node.param().stride;
351 neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
352 const auto activation = node.param().activation;
354 VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
355 VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
356 VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
357 VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
358 VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
359 VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
360 VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
361 VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
362 VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
363 VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
364 VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
365 VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
367 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
368 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
370 ::arm_compute::PoolingLayerInfo info{
371 ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
372 acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
374 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
376 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
378 auto acl_fn = asAclFunction(std::move(fn));
380 _execution_builder->append((std::move(acl_fn)));
382 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
385 void KernelGenerator::visit(const model::operation::Concat &node)
387 const auto ofm_index{node.getOutputs().at(0)};
389 std::vector<model::OperandIndex> input_indexes;
391 for (const auto &input : node.getInputs())
392 input_indexes.emplace_back(input);
394 const auto axis = node.param().axis;
396 // If tensor allocator allocate as subtensor
397 bool canEliminate = true;
398 for (auto &ifm_ind : input_indexes)
400 if (!_tensor_builder->isSubTensorOf(ofm_index, ifm_ind))
402 canEliminate = false;
408 // If concat eliminated, return a NOP IFunction
409 _execution_builder->append(nnfw::cpp14::make_unique<exec::NopFunction>());
413 auto output_alloc = _tensor_builder->at(ofm_index).get();
415 std::vector<operand::ICLTensor *> input_allocs;
416 for (auto &ifm_ind : input_indexes)
417 input_allocs.emplace_back(_tensor_builder->at(ifm_ind).get());
419 auto fn = nnfw::cpp14::make_unique<::neurun::backend::acl_cl::kernel::ConcatLayer>();
421 const auto rank = _ctx.at(ofm_index).shape().rank();
422 const auto frontend_layout = _current_subg_layout;
423 const auto backend_layout = output_alloc->layout();
424 const auto fixed_axis =
425 acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
427 fn->configure(input_allocs, fixed_axis, output_alloc);
429 auto acl_fn = asAclFunction(std::move(fn));
431 _execution_builder->append(std::move(acl_fn));
// Lowers FullyConnected to CLFullyConnectedReshapingLayer, optionally
// flattening a higher-rank input into a rank-2 [batch, input_size] matrix,
// then appends the activation as a separate in-place kernel.
// NOTE(review): the guard around the reshape-computation section (presumably
// `if (input_rank == 4)`) and part of the fn->configure(...) argument list
// (presumably the `needs_reshape` flag) are not visible in this chunk —
// confirm against the full file.
void KernelGenerator::visit(const model::operation::FullyConnected &node)
using model::operation::FullyConnected;
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
const auto input_rank = _ctx.at(input_index).shape().rank();
// TODO Currently we are not handling where the case is that the input's rank is 3.
// The handling should be added in the future.
assert(input_rank != 3);
// Weight shape is [output_size, input_size]; bias is [output_size].
const auto output_size = _ctx.at(output_index).shape().dim(1);
UNUSED_RELEASE(output_size);
assert(_ctx.at(bias_index).shape().dim(0) == output_size);
assert(_ctx.at(weight_index).shape().dim(0) == output_size);
const auto batch_size = _ctx.at(output_index).shape().dim(0);
const auto input_size = _ctx.at(weight_index).shape().dim(1);
// Check for reshaping input's shape into rank-2
bool needs_reshape = false;
neurun::model::Shape reshape(2);
// Total element count must factor exactly into batch_size x input_size.
const auto feature_size = _ctx.at(input_index).shape().num_elements();
UNUSED_RELEASE(feature_size);
assert(batch_size >= 0 && input_size >= 0);
assert(feature_size == static_cast<uint64_t>(batch_size) * static_cast<uint64_t>(input_size));
needs_reshape = true;
reshape.dim(0) = batch_size; /* H */
reshape.dim(1) = input_size; /* W */
const auto activation = node.param().activation;
auto output_alloc = _tensor_builder->at(output_index).get();
const auto input_alloc = _tensor_builder->at(input_index).get();
const auto weight_alloc = _tensor_builder->at(weight_index).get();
const auto bias_alloc = _tensor_builder->at(bias_index).get();
const auto frontend_layout = _current_subg_layout;
const auto acl_layout = output_alloc->handle()->info()->data_layout();
auto fn = nnfw::cpp14::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
// Arguments below belong to fn->configure(...) — the call head is elided here.
input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
::neurun::backend::acl_common::asTensorShape(
    reshape, frontend_layout, ::neurun::backend::acl_common::asRuntimeLayout(acl_layout)));
auto acl_fn = asAclFunction(std::move(fn));
_execution_builder->append(std::move(acl_fn));
// Activation is not fused; append it in-place on the output.
ActivationBuilder{*_execution_builder}.append(activation, output_alloc->handle());
497 void KernelGenerator::visit(const model::operation::Mul &node)
499 const auto ofm_index{node.getOutputs().at(0)};
500 const auto lhs_index{node.getInputs().at(model::operation::Mul::Input::LHS)};
501 const auto rhs_index{node.getInputs().at(model::operation::Mul::Input::RHS)};
503 const auto activation = node.param().activation;
505 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
506 auto lhs_alloc = _tensor_builder->at(lhs_index).get();
507 auto rhs_alloc = _tensor_builder->at(rhs_index).get();
509 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPixelWiseMultiplication>();
511 fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
512 arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
514 auto acl_fn = asAclFunction(std::move(fn));
516 _execution_builder->append(std::move(acl_fn));
518 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
521 void KernelGenerator::visit(const model::operation::ReduceSum &node)
523 const auto output_index{node.getOutputs().at(0)};
524 const auto input_index{node.getInputs().at(model::operation::ReduceSum::Input::INPUT)};
525 const auto &axes{node.param().axes};
527 auto output_alloc = _tensor_builder->at(output_index).get();
528 auto input_alloc = _tensor_builder->at(input_index).get();
529 const auto frontend_layout = _current_subg_layout;
530 const auto backend_layout = input_alloc->layout();
532 // Convert to ACL axes taking into account negative values and possible duplicates.
533 std::set<std::uint32_t> acl_axes;
534 const int input_rank = _ctx.at(input_index).shape().rank();
535 for (int axis : axes)
540 acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value());
543 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
545 fn->configure(input_alloc->handle(), output_alloc->handle(), acl_axes,
546 ::arm_compute::ReduceOperation::SUM);
548 auto acl_fn = asAclFunction(std::move(fn));
550 _execution_builder->append(std::move(acl_fn));
553 void KernelGenerator::visit(const model::operation::Reshape &node)
555 const auto output_index{node.getOutputs().at(0)};
556 const auto input_index{node.getInputs().at(model::operation::Reshape::Input::INPUT)};
558 auto output_alloc = _tensor_builder->at(output_index).get();
559 auto input_alloc = _tensor_builder->at(input_index).get();
561 // NOTE This operation must not be changed the layout from frontend to backend
562 // So, PermutationOperationPass makes layouts of frontend and backend the same.
563 const auto frontend_layout = _current_subg_layout;
564 const auto backend_layout = output_alloc->layout();
565 assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
566 frontend_layout == backend_layout);
567 UNUSED_RELEASE(frontend_layout);
568 UNUSED_RELEASE(backend_layout);
570 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReshapeLayer>();
572 fn->configure(input_alloc->handle(), output_alloc->handle());
574 auto acl_fn = asAclFunction(std::move(fn));
576 _execution_builder->append(std::move(acl_fn));
579 void KernelGenerator::visit(const model::operation::Squeeze &node)
581 // Squeeze is identical to reshape except that it has an optional dimensions input.
582 // In addition, optional dims_index is ignored since output tensor already has squeezed shape
583 // by freezer and toco
584 // TODO Support multi-layout for frontend and backend
585 const auto output_index{node.getOutputs().at(0)};
586 const auto input_index{node.getInputs().at(model::operation::Squeeze::Input::INPUT)};
587 const auto dims{node.param().dims};
588 const auto ndim{node.param().ndim};
592 auto output_alloc = _tensor_builder->at(output_index).get();
593 auto input_alloc = _tensor_builder->at(input_index).get();
594 auto fn = nnfw::cpp14::make_unique<arm_compute::CLReshapeLayer>();
595 fn->configure(input_alloc->handle(), output_alloc->handle());
596 auto acl_fn = asAclFunction(std::move(fn));
597 _execution_builder->append(std::move(acl_fn));
600 void KernelGenerator::visit(const model::operation::Tanh &node)
602 const auto output_index{node.getOutputs().at(0)};
603 const auto input_index{node.getInputs().at(model::operation::Tanh::Input::INPUT)};
605 auto output_alloc = _tensor_builder->at(output_index).get();
606 auto input_alloc = _tensor_builder->at(input_index).get();
608 auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();
610 const ::arm_compute::ActivationLayerInfo act_info{
611 ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
613 fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
615 auto acl_fn = asAclFunction(std::move(fn));
617 _execution_builder->append(std::move(acl_fn));
620 void KernelGenerator::visit(const model::operation::Softmax &node)
622 const auto output_index{node.getOutputs().at(0)};
623 const auto input_index{node.getInputs().at(model::operation::Softmax::Input::INPUT)};
625 const auto beta = node.param().beta;
627 auto output_alloc = _tensor_builder->at(output_index).get();
628 auto input_alloc = _tensor_builder->at(input_index).get();
630 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSoftmaxLayer>(
631 _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
633 fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
635 auto acl_fn = asAclFunction(std::move(fn));
637 _execution_builder->append(std::move(acl_fn));
640 void KernelGenerator::visit(const model::operation::StridedSlice &node)
642 const auto output_index{node.getOutputs().at(0)};
643 const auto input_index{node.getInputs().at(model::operation::StridedSlice::Input::INPUT)};
644 const auto starts_index{node.getInputs().at(model::operation::StridedSlice::Input::STARTS)};
645 const auto ends_index{node.getInputs().at(model::operation::StridedSlice::Input::ENDS)};
646 const auto strides_index{node.getInputs().at(model::operation::StridedSlice::Input::STRIDES)};
648 auto outputData_alloc = _tensor_builder->at(output_index).get();
649 auto inputData_alloc = _tensor_builder->at(input_index).get();
650 const auto frontend_layout = _current_subg_layout;
651 const auto backend_layout = inputData_alloc->layout();
653 // Set initializers for indices data such as order of inputData
654 int input_rank = _ctx.at(input_index).shape().rank();
655 std::vector<int32_t> starts;
656 std::vector<int32_t> ends;
657 std::vector<int32_t> strides;
658 starts.resize(input_rank, 0);
659 ends.resize(input_rank, 0);
660 strides.resize(input_rank, 0);
662 auto startData_base = _ctx.at(starts_index).data().base();
663 auto endData_base = _ctx.at(ends_index).data().base();
664 auto stridesData_base = _ctx.at(strides_index).data().base();
665 const int startData_size = _ctx.at(starts_index).shape().num_elements();
666 const int endData_size = _ctx.at(ends_index).shape().num_elements();
667 const int stridesData_size = _ctx.at(strides_index).shape().num_elements();
669 using neurun::model::DataType;
671 UNUSED_RELEASE(startData_size);
672 UNUSED_RELEASE(endData_size);
673 UNUSED_RELEASE(stridesData_size);
675 assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
676 assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
677 assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
678 assert(startData_size == input_rank);
679 assert(endData_size == input_rank);
680 assert(stridesData_size == input_rank);
682 assert(startData_base != nullptr);
683 for (int n = 0; n < input_rank; ++n)
685 auto axis = ::neurun::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
689 int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
690 starts[axis] = start_value;
692 int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
693 ends[axis] = end_value;
695 int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
696 strides[axis] = strides_value;
700 // Set mask bits such as order of inputData
701 const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
702 frontend_layout, backend_layout);
703 const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
704 frontend_layout, backend_layout);
705 const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
706 node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
708 ::arm_compute::Coordinates starts_set;
709 ::arm_compute::Coordinates ends_set;
710 ::arm_compute::BiStrides strides_set;
712 for (size_t i = 0; i < starts.size(); ++i)
714 starts_set.set(i, starts[i]);
715 ends_set.set(i, ends[i]);
716 strides_set.set(i, strides[i]);
719 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStridedSlice>();
721 fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
722 strides_set, begin_mask, end_mask, shrink_axis_mask);
724 auto acl_fn = asAclFunction(std::move(fn));
726 _execution_builder->append(std::move(acl_fn));
729 void KernelGenerator::visit(const model::operation::Transpose &node)
731 const auto ofm_idx{node.getOutputs().at(0)};
732 const auto ifm_idx{node.getInputs().at(model::operation::Transpose::Input::INPUT)};
733 const auto &perm{node.param().perm};
735 const auto rank = _ctx.at(ifm_idx).shape().rank();
737 auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
738 auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
739 const auto frontend_layout = _current_subg_layout;
740 const auto backend_layout = ifm_alloc->layout();
742 std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
744 auto backend_pv = ::neurun::backend::acl_common::getARMComputePermutationVector(
745 rank, pv, frontend_layout, backend_layout);
747 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();
749 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
751 auto acl_fn = asAclFunction(std::move(fn));
753 _execution_builder->append(std::move(acl_fn));
756 void KernelGenerator::visit(const model::operation::Add &node)
758 const auto ofm_index{node.getOutputs().at(0)};
759 const auto lhs_index{node.getInputs().at(model::operation::Add::Input::LHS)};
760 const auto rhs_index{node.getInputs().at(model::operation::Add::Input::RHS)};
762 const auto activation = node.param().activation;
764 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
765 auto lhs_alloc = _tensor_builder->at(lhs_index).get();
766 auto rhs_alloc = _tensor_builder->at(rhs_index).get();
768 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticAddition>();
770 fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
771 arm_compute::ConvertPolicy::SATURATE);
773 auto acl_fn = asAclFunction(std::move(fn));
775 _execution_builder->append(std::move(acl_fn));
777 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
780 void KernelGenerator::visit(const model::operation::Sub &node)
782 const auto ofm_index{node.getOutputs().at(0)};
783 const auto lhs_index{node.getInputs().at(model::operation::Sub::Input::LHS)};
784 const auto rhs_index{node.getInputs().at(model::operation::Sub::Input::RHS)};
786 const auto activation = node.param().activation;
788 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
789 auto lhs_alloc = _tensor_builder->at(lhs_index).get();
790 auto rhs_alloc = _tensor_builder->at(rhs_index).get();
792 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticSubtraction>();
794 fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
795 arm_compute::ConvertPolicy::SATURATE);
797 auto acl_fn = asAclFunction(std::move(fn));
799 _execution_builder->append(std::move(acl_fn));
801 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
804 void KernelGenerator::visit(const model::operation::Div &node)
806 const auto ofm_index{node.getOutputs().at(0)};
807 const auto lhs_index{node.getInputs().at(model::operation::Div::Input::LHS)};
808 const auto rhs_index{node.getInputs().at(model::operation::Div::Input::RHS)};
810 const auto activation = node.param().activation;
812 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
813 auto lhs_alloc = _tensor_builder->at(lhs_index).get();
814 auto rhs_alloc = _tensor_builder->at(rhs_index).get();
816 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArithmeticDivision>();
818 fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
820 auto acl_fn = asAclFunction(std::move(fn));
822 _execution_builder->append(std::move(acl_fn));
824 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
827 void KernelGenerator::visit(const model::operation::Exp &node)
829 const auto output_index{node.getOutputs().at(0)};
830 const auto input_index{node.getInputs().at(model::operation::Exp::Input::INPUT)};
832 auto output_alloc = _tensor_builder->at(output_index).get();
833 auto input_alloc = _tensor_builder->at(input_index).get();
835 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLExpLayer>();
837 fn->configure(input_alloc->handle(), output_alloc->handle());
839 auto acl_fn = asAclFunction(std::move(fn));
841 _execution_builder->append(std::move(acl_fn));
844 void KernelGenerator::visit(const model::operation::InstanceNorm &node)
846 const auto ofm_index{node.getOutputs().at(0)};
847 const auto ifm_index{node.getInputs().at(model::operation::InstanceNorm::Input::INPUT)};
848 const auto gamma_index{node.getInputs().at(model::operation::InstanceNorm::Input::GAMMA)};
849 const auto beta_index{node.getInputs().at(model::operation::InstanceNorm::Input::BETA)};
851 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
852 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
853 auto gamma_alloc = _tensor_builder->at(gamma_index).get();
854 auto beta_alloc = _tensor_builder->at(beta_index).get();
855 auto epsilon = node.param().epsilon;
856 auto activation = node.param().activation;
858 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();
860 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
861 beta_alloc->handle(), epsilon);
863 auto acl_fn = asAclFunction(std::move(fn));
865 _execution_builder->append(std::move(acl_fn));
867 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
870 void KernelGenerator::visit(const model::operation::Logistic &node)
872 const auto ofm_index{node.getOutputs().at(0)};
873 const auto ifm_index{node.getInputs().at(model::operation::Logistic::Input::INPUT)};
875 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
876 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
878 const ::arm_compute::ActivationLayerInfo act_info{
879 ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
881 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
883 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
885 auto acl_fn = asAclFunction(std::move(fn));
887 _execution_builder->append(std::move(acl_fn));
890 void KernelGenerator::visit(const model::operation::LogicalAnd &node)
892 const auto output_index{node.getOutputs().at(0)};
893 const auto input0_index{node.getInputs().at(model::operation::LogicalAnd::Input::INPUT0)};
894 const auto input1_index{node.getInputs().at(model::operation::LogicalAnd::Input::INPUT1)};
896 auto output_alloc = _tensor_builder->at(output_index).get();
897 auto input0_alloc = _tensor_builder->at(input0_index).get();
898 auto input1_alloc = _tensor_builder->at(input1_index).get();
900 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBinaryLogicalOp>();
902 fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
903 ::arm_compute::BinaryLogicalOperation::AND);
905 auto acl_fn = asAclFunction(std::move(fn));
907 _execution_builder->append(std::move(acl_fn));
void KernelGenerator::visit(const model::operation::LSTM &node)
  // Lowers a full LSTM cell (optionally CIFG / peephole / projection) onto
  // arm_compute::CLLSTMLayer. Optional operands are detected below by testing
  // their shapes for zero-sized dimensions (the has_* flags).
  // TODO Support dynamic rnn
  // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.

  // Output operand indices.
  const auto scratch_buffer_index{
      node.getOutputs().at(model::operation::LSTM::Output::SCRATCH_BUFFER)};
  const auto output_state_out_index{
      node.getOutputs().at(model::operation::LSTM::Output::OUTPUT_STATE_OUT)};
  const auto cell_state_out_index{
      node.getOutputs().at(model::operation::LSTM::Output::CELL_STATE_OUT)};
  const auto output_index{node.getOutputs().at(model::operation::LSTM::Output::OUTPUT)};

  // Input operand indices; "optional" marks operands that may be absent
  // depending on the CIFG / peephole / projection configuration.
  const auto input_index{node.getInputs().at(model::operation::LSTM::Input::INPUT)};
  const auto input_to_input_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
  const auto input_to_forget_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
  const auto input_to_cell_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
  const auto input_to_output_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
  const auto recurrent_to_input_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
  const auto recurrent_to_forget_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
  const auto recurrent_to_cell_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
  const auto recurrent_to_output_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
  const auto cell_to_input_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
  const auto cell_to_forget_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
  const auto cell_to_output_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
  const auto input_gate_bias_index{
      node.getInputs().at(model::operation::LSTM::Input::INPUT_GATE_BIAS)};
  const auto forget_gate_bias_index{
      node.getInputs().at(model::operation::LSTM::Input::FORGET_GATE_BIAS)};
  const auto cell_bias_index{node.getInputs().at(model::operation::LSTM::Input::CELL_BIAS)};
  const auto output_gate_bias_index{
      node.getInputs().at(model::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
  const auto projection_weights_index{
      node.getInputs().at(model::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
  const auto projection_bias_index{
      node.getInputs().at(model::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
  const auto output_state_in_index{
      node.getInputs().at(model::operation::LSTM::Input::OUTPUT_STATE_IN)};
  const auto cell_state_in_index{node.getInputs().at(model::operation::LSTM::Input::CELL_STATE_IN)};
  const auto cell_threshold = node.param().cell_threshold;
  const auto projection_threshold = node.param().projection_threshold;

  // Presence checks: an optional operand with a zero-sized dimension is
  // treated as absent.
  bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
                                    _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
  bool has_recurrent_to_input_weights =
      _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
      _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
  bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
  bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
  bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
                                _ctx.at(projection_weights_index).shape().dim(1) != 0;
  // int -> bool: a non-zero first dimension means the optional bias carries data.
  bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);

  // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
  // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
  bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;

  // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
  // But the cell_to_input_weights does not exist in regular CIFG although peephole.
  // false: no peephole
  bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;

  // NOTE Although the projection weights has data the projection bias may not have data.
  bool has_projection_param = has_projection_weights;

  const auto activation = node.param().activation;
  // Clipping thresholds must be non-negative (asserted below).
  const auto cell_clip = cell_threshold;
  const auto projection_clip = projection_threshold;
  assert(cell_clip >= 0.f && projection_clip >= 0.f);

  auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
  auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
  auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
  auto output_alloc = _tensor_builder->at(output_index).get();

  auto input_alloc = _tensor_builder->at(input_index).get();

  auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
  auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
  auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
  auto recurrent_to_forget_weights_alloc =
      _tensor_builder->at(recurrent_to_forget_weights_index).get();
  auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
  auto recurrent_to_output_weights_alloc =
      _tensor_builder->at(recurrent_to_output_weights_index).get();

  auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
  auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
  auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
  auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
  auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();

  auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLLSTMLayer>();

  ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
  // Non-CIFG input-gate tensors; cell_to_input is only present with peephole
  // (hence the ternary below).
  auto input_to_input_weights_alloc =
      _tensor_builder->at(input_to_input_weights_index).get(); // optional
  auto recurrent_to_input_weights_alloc =
      _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
  auto cell_to_input_weights_handle =
      has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
                         : nullptr; // optional (non-cifg && peephole)
  auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
  lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
                              recurrent_to_input_weights_alloc->handle(),
                              cell_to_input_weights_handle, input_gate_bias_alloc->handle());
  if (has_peephole_param)
    auto cell_to_forget_weights_alloc =
        _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
    auto cell_to_output_weights_alloc =
        _tensor_builder->at(cell_to_output_weights_index).get(); // optional
    lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
                                    cell_to_output_weights_alloc->handle());
  if (has_projection_param)
    auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
    // Projection may have weights without a bias (see has_projection_bias above).
    auto projection_bias_handle = has_projection_bias
                                      ? _tensor_builder->at(projection_bias_index).get()->handle()
                                      : nullptr; // optional
    lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);

  // Configure the fused LSTM kernel with all mandatory tensors, the optional
  // parameter bundle, the activation, and both clipping thresholds.
      input_alloc->handle(), input_to_forget_weights_alloc->handle(),
      input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
      recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
      recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
      cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
      cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
      output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
      lstm_params, act_info, cell_clip, projection_clip);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
void KernelGenerator::visit(const model::operation::ReduceMax &node)
  // Reduce-max over the axes in the node parameters, lowered to the EX kernel
  // CLReduceOperation with ReduceOperation::MAX.
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::ReduceMax::Input::INPUT)};
  const auto &axes{node.param().axes};

  auto ofm_alloc = _tensor_builder->at(output_index).get();
  auto ifm_alloc = _tensor_builder->at(input_index).get();
  // Frontend and backend layouts may differ, so axis indices must be remapped.
  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = ifm_alloc->layout();

  // Convert to ACL axes taking into account negative values and possible duplicates.
  // (std::set deduplicates; ToARMComputeAxis remaps to ACL axis numbering.)
  std::set<std::uint32_t> acl_axes;
  const int ifm_rank = _ctx.at(input_index).shape().rank();
  for (int axis : axes)
        acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes,
                arm_compute::ReduceOperation::MAX);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
1099 void KernelGenerator::visit(const model::operation::Comparison &node)
1101 const auto output_index{node.getOutputs().at(0)};
1102 const auto input0_index{node.getInputs().at(model::operation::Comparison::Input::INPUT0)};
1103 const auto input1_index{node.getInputs().at(model::operation::Comparison::Input::INPUT1)};
1105 const auto comparison_type = node.param().comparison_type;
1107 auto output_alloc = _tensor_builder->at(output_index).get();
1108 auto input0_alloc = _tensor_builder->at(input0_index).get();
1109 auto input1_alloc = _tensor_builder->at(input1_index).get();
1111 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLComparison>();
1113 fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
1114 (arm_compute::ComparisonOperation)comparison_type);
1116 auto acl_fn = asAclFunction(std::move(fn));
1118 _execution_builder->append(std::move(acl_fn));
void KernelGenerator::visit(const model::operation::Pack &node)
  // Pack stacks all input tensors into one higher-rank output along `axis`,
  // lowered to CLStackLayer.
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<model::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_builder->at(output_index).get()->handle();
  std::vector<arm_compute::ICLTensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_builder->at(input_index)->handle());

  const auto frontend_layout = _current_subg_layout;
  const auto backend_layout = _tensor_builder->at(output_index).get()->layout();

  // Axis remapping between differing layouts is not supported for rank >= 4.
  if (output_rank >= 4 && _current_subg_layout != backend_layout)
    throw std::runtime_error("ACL CL : Pack does not support different layouts between frontend "
                             "and backend in ranks above 4");

  // Normalize a negative axis, then remap it into ACL axis numbering.
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLStackLayer>();

  fn->configure(inputs, axis, output);

  _execution_builder->append(asAclFunction(std::move(fn)));
void KernelGenerator::visit(const model::operation::Permute &node)
  // Layout conversion: rank-4 NCHW<->NHWC is lowered to CLPermute with a fixed
  // permutation vector; every other case falls back to a plain CLCopy.
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
  auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  // Permute never changes rank.
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == model::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
    // Permutation vector used for the NCHW -> NHWC case.
    pv = arm_compute::PermutationVector{2, 0, 1};

    auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
  else if (permute_type == model::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
    // Permutation vector used for the NHWC -> NCHW case.
    pv = arm_compute::PermutationVector{1, 2, 0};

    auto l = nnfw::cpp14::make_unique<::arm_compute::CLPermute>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);

    // Fallback (same layout or unsupported rank): element-wise copy.
    auto l = nnfw::cpp14::make_unique<::arm_compute::CLCopy>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
1205 void KernelGenerator::visit(const model::operation::RSQRT &node)
1207 const auto ofm_index{node.getOutputs().at(0)};
1208 const auto ifm_index{node.getInputs().at(model::operation::RSQRT::Input::INPUT)};
1210 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1211 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1213 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRsqrtLayer>();
1215 fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1217 _execution_builder->append(asAclFunction(std::move(fn)));
1220 void KernelGenerator::visit(const model::operation::ReLU &node)
1222 const auto output_index{node.getOutputs().at(0)};
1223 const auto input_index{node.getInputs().at(model::operation::ReLU::Input::INPUT)};
1225 auto output_alloc = _tensor_builder->at(output_index).get();
1226 auto input_alloc = _tensor_builder->at(input_index).get();
1228 auto fn = nnfw::cpp14::make_unique<arm_compute::CLActivationLayer>();
1230 const ::arm_compute::ActivationLayerInfo act_info{
1231 ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
1233 fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1235 auto acl_fn = asAclFunction(std::move(fn));
1237 _execution_builder->append(std::move(acl_fn));
1240 void KernelGenerator::visit(const model::operation::ResizeBilinear &node)
1242 const auto ofm_index{node.getOutputs().at(0)};
1244 const auto ifm_index{node.getInputs().at(model::operation::ResizeBilinear::Input::INPUT)};
1246 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1247 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1249 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLScale>();
1251 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
1252 ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
1253 ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
1255 auto acl_fn = asAclFunction(std::move(fn));
1257 _execution_builder->append(std::move(acl_fn));
1260 void KernelGenerator::visit(const model::operation::ReLU1 &node)
1262 const auto ofm_index{node.getOutputs().at(0)};
1263 const auto ifm_index{node.getInputs().at(model::operation::ReLU1::Input::INPUT)};
1265 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1266 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1268 const ::arm_compute::ActivationLayerInfo act_info{
1269 ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
1271 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1273 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
1275 auto acl_fn = asAclFunction(std::move(fn));
1277 _execution_builder->append(std::move(acl_fn));
1280 void KernelGenerator::visit(const model::operation::ReLU6 &node)
1282 const auto ofm_index{node.getOutputs().at(0)};
1283 const auto ifm_index{node.getInputs().at(model::operation::ReLU6::Input::INPUT)};
1285 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1286 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1288 const ::arm_compute::ActivationLayerInfo act_info{
1289 ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
1291 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1293 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
1295 auto acl_fn = asAclFunction(std::move(fn));
1297 _execution_builder->append(std::move(acl_fn));
void KernelGenerator::visit(const model::operation::RNN &node)
  // Basic RNN cell lowered to the EX kernel CLRNNLayerEx, preceded by a copy
  // of the incoming hidden state into the hidden-state output tensor.
  const auto output_index{node.getOutputs().at(model::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
      node.getOutputs().at(model::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(model::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(model::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
      node.getInputs().at(model::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(model::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{
      node.getInputs().at(model::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();

  auto input_alloc = _tensor_builder->at(input_index).get();
  auto weights_alloc = _tensor_builder->at(weights_index).get();
  auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
  auto bias_alloc = _tensor_builder->at(bias_index).get();
  auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
  auto act_info = ::neurun::backend::acl_common::asActivationLayerInfo(activation);

  // Seed hidden_state_out with hidden_state_in; the RNN kernel below is
  // configured with hidden_state_out as its state tensor.
  auto copy_layer = nnfw::cpp14::make_unique<::arm_compute::CLCopy>();
  copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
  _execution_builder->append(asAclFunction(std::move(copy_layer)));

  // The EX variant is constructed with the backend's internal buffer manager.
  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLRNNLayerEx>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
  fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
                bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
  _execution_builder->append(asAclFunction(std::move(fn)));
1338 void KernelGenerator::visit(const model::operation::Floor &node)
1340 const auto ofm_index{node.getOutputs().at(0)};
1341 const auto ifm_index{node.getInputs().at(model::operation::Floor::Input::INPUT)};
1343 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1344 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1346 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLFloor>();
1348 fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1350 auto acl_fn = asAclFunction(std::move(fn));
1352 _execution_builder->append(std::move(acl_fn));
void KernelGenerator::visit(const model::operation::SpaceToBatchND &node)
  // SpaceToBatchND with compile-time constant block sizes and paddings.
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(model::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(model::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto block_size_alloc = _tensor_builder->at(block_size_index).get();
  auto paddings_alloc = _tensor_builder->at(paddings_index).get();

  // Only constant block-size / paddings operands are supported.
  assert(_ctx.at(block_size_index).isConstant());
  assert(_ctx.at(paddings_index).isConstant());

  std::unique_ptr<::arm_compute::IFunction> fn;
  // QASYMM8 outputs take the EX kernel to work around the bug described below.
  if (_ctx.at(ofm_index).typeInfo().type() == model::DataType::QUANT8_ASYMM)
    // NOTE CLSpaceToBatchLayer has a bug that padding's values are 0 even when zero point of
    // QASYMM8 is not 0.
    auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchND>();
    l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
                 ofm_alloc->handle());
    // Other data types use the stock CLSpaceToBatchLayer.
    auto l = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToBatchLayer>();
    l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
                 ofm_alloc->handle());

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
1394 void KernelGenerator::visit(const model::operation::SpaceToDepth &node)
1396 const auto ofm_index{node.getOutputs().at(0)};
1397 const auto ifm_index{node.getInputs().at(model::operation::SpaceToDepth::Input::INPUT)};
1399 auto block_size = node.param().block_size;
1401 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1402 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1404 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSpaceToDepth>();
1406 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
1408 auto acl_fn = asAclFunction(std::move(fn));
1410 _execution_builder->append(std::move(acl_fn));
1413 void KernelGenerator::visit(const model::operation::L2Pool2D &node)
1415 const auto ofm_index{node.getOutputs().at(0)};
1416 const auto ifm_index{node.getInputs().at(model::operation::L2Pool2D::Input::INPUT)};
1418 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
1419 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
1421 uint32_t kw = node.param().kw;
1422 uint32_t kh = node.param().kh;
1423 const auto stride = node.param().stride;
1424 const auto padding =
1425 neurun::util::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
1426 const auto activation = node.param().activation;
1428 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1429 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1431 ::arm_compute::PoolingLayerInfo info{
1432 ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
1433 ::neurun::backend::acl_common::asPadStrideInfo(padding, stride)};
1435 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPoolingLayer>();
1437 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
1439 auto acl_fn = asAclFunction(std::move(fn));
1441 _execution_builder->append(std::move(acl_fn));
1443 ActivationBuilder{*_execution_builder}.append(activation, ofm_alloc->handle());
1446 void KernelGenerator::visit(const model::operation::EmbeddingLookup &node)
1448 const auto output_index{node.getOutputs().at(0)};
1449 const auto lookups_index{node.getInputs().at(model::operation::EmbeddingLookup::Input::LOOKUPS)};
1450 const auto values_index{node.getInputs().at(model::operation::EmbeddingLookup::Input::VALUES)};
1452 auto output_alloc = _tensor_builder->at(output_index).get();
1453 auto lookups_alloc = _tensor_builder->at(lookups_index).get();
1454 auto values_alloc = _tensor_builder->at(values_index).get();
1456 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLEmbeddingLookup>();
1458 fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
1460 auto acl_fn = asAclFunction(std::move(fn));
1462 _execution_builder->append(std::move(acl_fn));
void KernelGenerator::visit(const model::operation::L2Normalization &node)
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(model::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0
  // L2 Normalization always performs the reduction along the depth axis
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
  // choosing normalization parameters as below

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support optional constant dimension that normalization would be performed on
  const auto normalization_axis = ifm_shape.rank() - 1;
  // Norm size spans the entire depth so the whole axis is reduced:
      2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;  // In the implementation to make alpha_ become 1
  float beta = 0.5f;   // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;   // Don't offset the reduction.

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
1499 void KernelGenerator::visit(const model::operation::HashtableLookup &node)
1501 const auto output_index{node.getOutputs().at(model::operation::HashtableLookup::Output::OUTPUT)};
1502 const auto hits_index{node.getOutputs().at(model::operation::HashtableLookup::Output::HITS)};
1504 const auto lookups_index{node.getInputs().at(model::operation::HashtableLookup::Input::LOOKUPS)};
1505 const auto keys_index{node.getInputs().at(model::operation::HashtableLookup::Input::KEYS)};
1506 const auto values_index{node.getInputs().at(model::operation::HashtableLookup::Input::VALUES)};
1508 auto output_alloc = _tensor_builder->at(output_index).get();
1509 auto hits_alloc = _tensor_builder->at(hits_index).get();
1511 auto lookups_alloc = _tensor_builder->at(lookups_index).get();
1512 auto keys_alloc = _tensor_builder->at(keys_index).get();
1513 auto values_alloc = _tensor_builder->at(values_index).get();
1515 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLHashtableLookup>();
1517 fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
1518 output_alloc->handle(), hits_alloc->handle());
1520 auto acl_fn = asAclFunction(std::move(fn));
1522 _execution_builder->append(std::move(acl_fn));
1525 void KernelGenerator::visit(const model::operation::PReLU &node)
1527 const auto ofm_index{node.getOutputs().at(0)};
1528 const auto ifm_index{node.getInputs().at(model::operation::PReLU::Input::INPUT)};
1529 const auto alpha_index{node.getInputs().at(model::operation::PReLU::Input::ALPHA)};
1531 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1532 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1533 auto alpha_alloc = _tensor_builder->at(alpha_index).get();
1535 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPReLU>();
1537 fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
1539 auto acl_fn = asAclFunction(std::move(fn));
1541 _execution_builder->append(std::move(acl_fn));
1544 void KernelGenerator::visit(const model::operation::TransposeConv &node)
1546 const auto ofm_index{node.getOutputs().at(0)};
1547 const auto output_shape_index{
1548 node.getInputs().at(model::operation::TransposeConv::Input::OUTPUT_SHAPE)};
1549 const auto ker_index{node.getInputs().at(model::operation::TransposeConv::Input::KERNEL)};
1550 const auto ifm_index{node.getInputs().at(model::operation::TransposeConv::Input::INPUT)};
1552 const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_subg_layout);
1553 const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_subg_layout);
1554 const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_subg_layout);
1556 const auto stride = node.param().stride;
1558 assert((node.param().padding.type == model::PaddingType::SAME) ||
1559 (node.param().padding.type == model::PaddingType::VALID));
1560 auto padding = neurun::util::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
1561 ker_shape.W, ker_shape.H);
1563 uint32_t invalid_horizontal = 0;
1564 uint32_t invalid_vertical = 0;
1565 if (node.param().padding.type == model::PaddingType::VALID)
1567 invalid_horizontal =
1568 ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
1569 invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
1572 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1573 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1574 auto ker_alloc = _tensor_builder->at(ker_index).get();
1576 const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
1578 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTransposeConvLayer>(
1579 _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
1581 fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
1582 invalid_horizontal, invalid_vertical);
1584 auto acl_fn = asAclFunction(std::move(fn));
1586 _execution_builder->append(std::move(acl_fn));
1589 void KernelGenerator::visit(const model::operation::SQRT &node)
1591 const auto output_index{node.getOutputs().at(0)};
1592 const auto input_index{node.getInputs().at(model::operation::SQRT::Input::INPUT)};
1594 auto output_alloc = _tensor_builder->at(output_index).get();
1595 auto input_alloc = _tensor_builder->at(input_index).get();
1597 const ::arm_compute::ActivationLayerInfo act_info{
1598 ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
1600 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1602 fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1604 auto acl_fn = asAclFunction(std::move(fn));
1606 _execution_builder->append(std::move(acl_fn));
1609 void KernelGenerator::visit(const model::operation::LogicalOr &node)
1611 const auto output_index{node.getOutputs().at(0)};
1612 const auto input0_index{node.getInputs().at(model::operation::LogicalOr::Input::INPUT0)};
1613 const auto input1_index{node.getInputs().at(model::operation::LogicalOr::Input::INPUT1)};
1615 auto output_alloc = _tensor_builder->at(output_index).get();
1616 auto input0_alloc = _tensor_builder->at(input0_index).get();
1617 auto input1_alloc = _tensor_builder->at(input1_index).get();
1619 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseOr>();
1621 fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
1623 auto acl_fn = asAclFunction(std::move(fn));
1625 _execution_builder->append(std::move(acl_fn));
void KernelGenerator::visit(const model::operation::LogicalNot &node)
  // Element-wise logical NOT of a boolean-like tensor.
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(model::operation::LogicalNot::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  // NOTE(review): CLBitwiseNot computes the bitwise complement (~x), which only
  // equals logical NOT when "true" is encoded as all-ones (0xFF): for a 0/1
  // encoding, ~1 == 0xFE is still non-zero. Confirm the boolean tensor
  // encoding used here, or switch to a dedicated logical-NOT kernel if this
  // ACL version provides one.
  auto fn = nnfw::cpp14::make_unique<::arm_compute::CLBitwiseNot>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclFunction(std::move(fn));

  _execution_builder->append(std::move(acl_fn));
1645 void KernelGenerator::visit(const model::operation::SquaredDifference &node)
1647 const auto ofm_index{node.getOutputs().at(0)};
1648 const auto lhs_index{node.getInputs().at(model::operation::SquaredDifference::Input::LHS)};
1649 const auto rhs_index{node.getInputs().at(model::operation::SquaredDifference::Input::RHS)};
1651 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1652 auto lhs_alloc = _tensor_builder->at(lhs_index).get();
1653 auto rhs_alloc = _tensor_builder->at(rhs_index).get();
1655 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLElementwiseSquaredDiff>();
1657 fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
1659 auto acl_fn = asAclFunction(std::move(fn));
1661 _execution_builder->append(std::move(acl_fn));
1664 void KernelGenerator::visit(const model::operation::TopKV2 &node)
1666 const auto outputValues_index{
1667 node.getOutputs().at(model::operation::TopKV2::Output::OUTPUT_VALUES)};
1668 const auto outputIndices_index{
1669 node.getOutputs().at(model::operation::TopKV2::Output::OUTPUT_INDICES)};
1671 const auto inputData_index{node.getInputs().at(model::operation::TopKV2::Input::INPUT)};
1673 // Currently, we only support the vector input.
1674 assert(_ctx.at(inputData_index).shape().rank() == 1 ||
1675 _ctx.at(inputData_index).shape().rank() == 2);
1677 const auto k = node.param().k;
1679 auto values_alloc = _tensor_builder->at(outputValues_index).get();
1680 auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
1681 auto input_alloc = _tensor_builder->at(inputData_index).get();
1683 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLTopKV2>();
1685 fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());
1687 auto acl_fn = asAclFunction(std::move(fn));
1689 _execution_builder->append(std::move(acl_fn));
1692 void KernelGenerator::visit(const model::operation::Gather &node)
1694 const auto ofm_index{node.getOutputs().at(0)};
1696 const auto ifm_index{node.getInputs().at(model::operation::Gather::Input::INPUT)};
1697 const auto indices_index{node.getInputs().at(model::operation::Gather::Input::INDICES)};
1699 const auto ifm_shape = _ctx.at(ifm_index).shape();
1701 const auto axis_value = node.param().axis;
1703 ::neurun::backend::acl_common::ToARMComputeAxis(ifm_shape.rank(), axis_value).value();
1705 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1706 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1707 auto indices_alloc = _tensor_builder->at(indices_index).get();
1709 // NOTE The frontend layout and backend layout must be the same for this operation.
1710 // If not the same, we have to add a stage(?) to perform permutation of output tensor. It
1711 // is not not efficient even if it works well. If so, it would be better to set the
1712 // layout of these backend tensors to the same layout.
1713 // There is also one thing we have to think about. This operation depends on the layout of
1714 // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
1715 // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
1716 // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
1717 const auto backend_layout = ofm_alloc->layout();
1718 UNUSED_RELEASE(backend_layout);
1719 assert(backend_layout == ifm_alloc->layout());
1720 assert(backend_layout == indices_alloc->layout());
1721 assert(ifm_shape.rank() < 4 || _current_subg_layout == backend_layout);
1723 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLGatherEx>();
1725 fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
1727 auto acl_fn = asAclFunction(std::move(fn));
1729 _execution_builder->append(std::move(acl_fn));
1732 void KernelGenerator::visit(const model::operation::Neg &node)
1734 const auto ofm_index{node.getOutputs().at(0)};
1735 const auto ifm_index{node.getInputs().at(model::operation::Neg::Input::INPUT)};
1737 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1738 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1740 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNeg>();
1742 fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
1744 auto acl_fn = asAclFunction(std::move(fn));
1746 _execution_builder->append(std::move(acl_fn));
1749 void KernelGenerator::visit(const model::operation::Abs &node)
1751 const auto output_index{node.getOutputs().at(0)};
1752 const auto input_index{node.getInputs().at(model::operation::Abs::Input::INPUT)};
1754 auto output_alloc = _tensor_builder->at(output_index).get();
1755 auto input_alloc = _tensor_builder->at(input_index).get();
1757 const ::arm_compute::ActivationLayerInfo act_info{
1758 ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
1760 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLActivationLayer>();
1762 fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
1764 auto acl_fn = asAclFunction(std::move(fn));
1766 _execution_builder->append(std::move(acl_fn));
1769 void KernelGenerator::visit(const model::operation::ArgMax &node)
1771 const auto ofm_index{node.getOutputs().at(0)};
1772 const auto ifm_index{node.getInputs().at(model::operation::ArgMax::Input::INPUT)};
1774 auto ifm_shape = _ctx.at(ifm_index).shape();
1775 auto ofm_shape = _ctx.at(ofm_index).shape();
1777 assert((ifm_shape.rank() - 1) == ofm_shape.rank());
1779 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1780 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1781 const auto ifm_rank = ifm_shape.rank();
1782 auto frontend_layout = _current_subg_layout;
1783 auto backend_layout = ifm_alloc->layout();
1785 int axis_value = node.param().axis;
1788 axis_value += ifm_rank;
1792 acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
1794 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLArgOperation>();
1796 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
1797 ::arm_compute::ArgOperation::MAX);
1799 auto acl_fn = asAclFunction(std::move(fn));
1801 _execution_builder->append(std::move(acl_fn));
1804 void KernelGenerator::visit(const model::operation::Dequantize &node)
1806 const auto output_index{node.getOutputs().at(0)};
1807 const auto input_index{node.getInputs().at(model::operation::Dequantize::Input::INPUT)};
1809 auto output_alloc = _tensor_builder->at(output_index).get();
1810 auto input_alloc = _tensor_builder->at(input_index).get();
1812 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLCast>();
1814 fn->configure(input_alloc->handle(), output_alloc->handle());
1816 auto acl_fn = asAclFunction(std::move(fn));
1818 _execution_builder->append(std::move(acl_fn));
1821 void KernelGenerator::visit(const model::operation::Mean &node)
1823 const auto ofm_index{node.getOutputs().at(0)};
1824 const auto ifm_index{node.getInputs().at(model::operation::Mean::Input::INPUT)};
1825 const auto &axes{node.param().axes};
1826 const auto keep_dims{node.param().keep_dims};
1828 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1829 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1830 const auto frontend_layout = _current_subg_layout;
1831 const auto backend_layout = ifm_alloc->layout();
1833 // Convert to ACL axes taking into account negative values and possible duplicates.
1834 std::set<std::uint32_t> acl_axes;
1835 const int ifm_rank = _ctx.at(ifm_index).shape().rank();
1836 for (int axis : axes)
1841 acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1844 arm_compute::Coordinates reduce_axes;
1845 for (const auto axis : acl_axes)
1847 reduce_axes.set(reduce_axes.num_dimensions(), axis);
1850 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceMean>();
1852 fn->configure(ifm_alloc->handle(), reduce_axes, keep_dims, ofm_alloc->handle());
1854 auto acl_fn = asAclFunction(std::move(fn));
1856 _execution_builder->append(std::move(acl_fn));
1859 void KernelGenerator::visit(const model::operation::LocalResponseNormalization &node)
1861 const auto ofm_index{node.getOutputs().at(0)};
1862 const auto ifm_index{
1863 node.getInputs().at(model::operation::LocalResponseNormalization::Input::INPUT)};
1865 auto radius = node.param().radius;
1866 auto alpha = node.param().alpha;
1867 auto beta = node.param().beta;
1868 auto bias = node.param().bias;
1870 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1871 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1873 const auto norm_info = ::arm_compute::NormalizationLayerInfo(
1874 ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
1876 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLNormalizationLayer>();
1878 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
1880 auto acl_fn = asAclFunction(std::move(fn));
1882 _execution_builder->append(std::move(acl_fn));
1885 void KernelGenerator::visit(const model::operation::DepthToSpace &node)
1887 const auto output_index{node.getOutputs().at(0)};
1888 const auto input_index{node.getInputs().at(model::operation::DepthToSpace::Input::INPUT)};
1890 auto block_size = node.param().block_size;
1891 assert(block_size > 0);
1893 auto output_alloc = _tensor_builder->at(output_index).get();
1894 auto input_alloc = _tensor_builder->at(input_index).get();
1896 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLDepthToSpace>();
1898 fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
1900 auto acl_fn = asAclFunction(std::move(fn));
1902 _execution_builder->append(std::move(acl_fn));
1905 void KernelGenerator::visit(const model::operation::ReduceMin &node)
1907 const auto ofm_index{node.getOutputs().at(0)};
1908 const auto ifm_index{node.getInputs().at(model::operation::ReduceMin::Input::INPUT)};
1909 const auto &axes{node.param().axes};
1911 auto ofm_alloc = _tensor_builder->at(ofm_index).get();
1912 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1913 const auto frontend_layout = _current_subg_layout;
1914 const auto backend_layout = ifm_alloc->layout();
1916 // Convert to ACL axes taking into account negative values and possible duplicates.
1917 std::set<std::uint32_t> acl_axes;
1918 const int ifm_rank = _ctx.at(ifm_index).shape().rank();
1919 for (int axis : axes)
1924 acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value());
1927 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLReduceOperation>();
1929 fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), acl_axes,
1930 ::arm_compute::ReduceOperation::MIN);
1932 auto acl_fn = asAclFunction(std::move(fn));
1934 _execution_builder->append(std::move(acl_fn));
1937 void KernelGenerator::visit(const model::operation::Split &node)
1939 const auto ifm_index{node.getInputs().at(model::operation::Split::Input::INPUT)};
1941 assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
1943 const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
1944 std::vector<model::OperandIndex> output_indexes;
1945 for (const auto &output : node.getOutputs())
1946 output_indexes.emplace_back(output);
1948 auto ifm_alloc = _tensor_builder->at(ifm_index).get();
1949 std::vector<arm_compute::ICLTensor *> output_allocs;
1950 for (const auto &ofm_ind : output_indexes)
1951 output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
1953 const auto frontend_layout = _current_subg_layout;
1954 const auto backend_layout = ifm_alloc->layout();
1955 auto axis = node.param().axis;
1958 axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
1960 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLSplit>();
1962 fn->configure(ifm_alloc->handle(), output_allocs, axis);
1964 _execution_builder->append(asAclFunction(std::move(fn)));
1967 void KernelGenerator::visit(const model::operation::Unpack &node)
1969 const auto input_index{node.getInputs().at(model::operation::Unpack::Input::INPUT)};
1970 auto axis{node.param().axis};
1972 const auto input_rank = _ctx.at(input_index).shape().rank();
1974 std::vector<model::OperandIndex> output_indexes;
1975 for (const auto &output_index : node.getOutputs())
1976 output_indexes.emplace_back(output_index);
1978 auto input = _tensor_builder->at(input_index).get()->handle();
1979 std::vector<arm_compute::ICLTensor *> outputs;
1980 for (const auto &output_index : output_indexes)
1981 outputs.emplace_back(_tensor_builder->at(output_index)->handle());
1983 const auto frontend_layout = _current_subg_layout;
1984 const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
1987 axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
1989 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLUnstack>();
1991 fn->configure(input, outputs, axis);
1993 _execution_builder->append(asAclFunction(std::move(fn)));
1996 void KernelGenerator::visit(const model::operation::Pad &node)
1998 const auto input_index{node.getInputs().at(model::operation::Pad::Input::INPUT)};
1999 const auto pad_index{node.getInputs().at(model::operation::Pad::Input::PAD)};
2000 const auto output_index{node.getOutputs().at(0)};
2001 assert(_ctx.at(pad_index).isConstant());
2003 auto rank = _ctx.at(pad_index).shape().dim(0);
2004 auto pad_base = _ctx.at(pad_index).data().base();
2006 auto input_type = _ctx.at(input_index).typeInfo();
2007 auto data_type = acl_common::asDataType(input_type.type());
2008 auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
2009 const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
2011 auto input = _tensor_builder->at(input_index).get()->handle();
2012 auto output = _tensor_builder->at(output_index).get()->handle();
2014 ::arm_compute::PaddingList padding_list;
2015 padding_list.resize(rank);
2016 for (int32_t n = 0; n < rank; ++n)
2018 const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
2020 const auto frontend_layout = _current_subg_layout;
2021 const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
2023 acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
2024 padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
2026 auto fn = nnfw::cpp14::make_unique<::arm_compute::CLPadLayer>();
2027 fn->configure(input, output, padding_list, pixel_value);
2029 _execution_builder->append(asAclFunction(std::move(fn)));
2032 } // namespace acl_cl
2033 } // namespace backend
2034 } // namespace neurun