/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "KernelGenerator.h"

#include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
#include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions

#include <AclActivationBuilder.h>
#include <AclFunction.h>
#include <Convert.h>
#include <Swizzle.h>

#include "ir/DataType.h"
#include "ir/InternalType.h"
#include "exec/NopFunction.h"
#include "util/logging.h"
#include "util/Utils.h"
#include "AclKernelGen.h"
namespace onert
{
namespace backend
{
namespace acl_neon
{

using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
  ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;

KernelGenerator::KernelGenerator(
  const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
  const std::shared_ptr<TensorBuilder> &tensor_builder,
  const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
  : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
    _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN)
{
  // DO NOTHING
}
void KernelGenerator::visit(const ir::OpSequence &op_seq)
{
  // TODO Move this to IKernelGenerator
  //      (all derivatives have the same implementation for this)
  assert(!_return_fn_seq);
  _return_fn_seq = std::make_unique<exec::FunctionSequence>();
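  // NOTE acl_neon kernels are configured once against static shapes, so
  //      run-time (dynamic) shape inference is disabled for this sequence.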
  _return_fn_seq->enableDynamicShapeInferer(false);

  _current_op_seq_layout = op_seq.getLayout();
  for (const auto &operation_idx : op_seq.operations())
  {
    const auto &node = _operations_ctx.at(operation_idx);
    node.accept(*this);
    _return_fn_seq->append(releaseFunction());
  }
}
void KernelGenerator::visit(const ir::operation::ArgMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::ArgMax::Input::AXIS)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto frontend_layout = _current_op_seq_layout;
  auto backend_layout = ifm_tensor->layout();

  int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }
  assert(axis_value >= 0 && axis_value < ifm_rank);
  const auto fixed_axis =
    acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
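  // When the frontend and backend layouts agree, ToARMComputeAxis simply
  // reverses the axis index (ACL counts dimensions innermost-first): e.g.
  // with ifm_rank == 4, frontend axis 1 maps to ACL axis 2.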

  auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
    ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(),
    arm_compute::ReductionOperation::ARG_IDX_MAX);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);

  assert(_ctx.at(block_size_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().arithmetic_type)
  {
    case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
    {
      // For scale 1.0, the only RoundingPolicy allowed is TO_ZERO.
      fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
        arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
      break;
    }
    default:
      assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
      break;
  }

  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);
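
  // In the NEConvolutionLayer call below, WeightsInfo() means the weights are
  // not pre-reshaped, Size2D(1U, 1U) is the dilation (none for plain Conv2D),
  // and act_info lets ACL fuse the activation into the convolution.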
  auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
    ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
    ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
    input_tensor->handle(), output_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto dilation = node.param().dilation;
  const auto padding =
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
                         ker_height, dilation.width_factor, dilation.height_factor);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);
  const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);

  auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
    conv_info, multiplier, act_info, dilation_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat is eliminated, return a NOP IFunction
    VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
  std::vector<::arm_compute::ITensor *> input_tensors;
  for (const auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensors.at(0),
                                                        output_tensor->handle());
  }
  else
  {
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_op_seq_layout;
    const auto backend_layout = output_tensor->layout();
    const auto fixed_axis =
      acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
      input_tensors, output_tensor->handle(), fixed_axis);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
    node.param().op_type, node.param().alpha, node.param().beta);

  std::unique_ptr<arm_compute::IFunction> fn =
    acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
                                                              ofm_tensor->handle(), act_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    default:
    {
      std::string err_msg("acl_neon KernelGenerator : " + node.name() +
                          " is not an elementwise-binary operation");
      assert(false && err_msg.c_str());
      break;
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseUnary::Type::ABS:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    case ir::operation::ElementwiseUnary::Type::CAST:
    {
      if (input_tensor->data_type() == output_tensor->data_type())
      {
        fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
                                                            output_tensor->handle());
      }
      else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
      {
        fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
                                                                output_tensor->handle());
      }
      else
      {
        fn = acl_common::generateLayer<arm_compute::NECast>(
          input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
      }
      break;
    }
    case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
    {
      fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
                                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::EXP:
    {
      fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::FLOOR:
    {
      fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
                                                           output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
    {
      fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::NEG:
    {
      fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::RSQRT:
    {
      fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::SQRT:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    default:
    {
      throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
                               " is not supported yet");
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
    values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  const auto activation = node.param().activation;
  if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
    throw std::runtime_error(
      "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights are not supported.");

  auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
                                                ::arm_compute::NEFullyConnectedReshapingLayer>(
    node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout);
  _return_fn = std::make_unique<exec::FunctionSequence>(
    std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hits_tensor = _tensor_reg->getAclTensor(hits_index);

  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
    lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
    output_tensor->handle(), hits_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  // The axis is converted to the reversed ACL axis order.
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
  const auto backend_layout = ofm_tensor->layout();
  UNUSED_RELEASE(backend_layout);

  // NOTE The frontend layout and backend layout must be the same for this operation.
  //      If they are not, we would have to add a stage to permute the output tensor,
  //      which is not efficient even if it works; it would be better to give these
  //      backend tensors the same layout in the first place.
  //      There is one more thing to consider: this operation depends on the layout of
  //      the model. For example, if an NHWC model has this operation with output rank == 4,
  //      indices rank == 2 and axis == 2, it should act on the W and C axes, but W and C
  //      are not adjacent in NCHW, so an NCHW backend cannot handle this case.
  assert(backend_layout == ifm_tensor->layout());
  assert(backend_layout == indices_tensor->layout());
  assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);

  // input is n-D, indices k-D, output is (n + k - 1)-D
  size_t n = ifm_rank;
  assert(n == ifm_tensor->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_tensor->num_dimensions());
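  // e.g. gathering a [4, 5, 6] input (n = 3) with [2, 3] indices (k = 2)
  // along axis 0 yields a [2, 3, 5, 6] output (n + k - 1 = 4).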

  // Disable applied dim_correction
  if (n != ifm_tensor->info()->num_dimensions())
  {
    // This means that the high dimension's value is 1 and the ifm tensor has
    // dim_correction applied.
    acl_common::disableDimCorrection(ifm_tensor);
  }
  if (k != indices_tensor->info()->num_dimensions())
  {
    // Likewise for the indices tensor.
    acl_common::disableDimCorrection(indices_tensor);
  }
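  // "dim_correction" here refers to ACL collapsing trailing size-1 dimensions
  // in a tensor's TensorInfo; that collapse would make the declared ranks (and
  // the reversed axis computed above) inconsistent with what NEGatherEx sees,
  // so it is switched off during configuration and restored afterwards.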

  auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
    ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);

  // Revert disabling applied dim_correction
  if (ifm_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(ifm_tensor);
  }
  if (indices_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(indices_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
  auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
    ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
    epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0,
  // while L2 Normalization always performs the reduction along the depth axis.
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2
  // normalizations by choosing the normalization parameters as below.

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support an optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  int32_t radius =
    2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;                          // In the implementation to make alpha_ become 1
  float beta = 0.5f;                           // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;                           // Don't offset the reduction.

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);
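
  // With these parameters, ACL's cross-map normalization
  //   y = x / (bias + alpha * sum(x^2))^beta
  // reduces to y = x / sqrt(sum(x^2)); the window (normSize = 2 * depth + 1)
  // always covers the whole depth axis, so this acts as an L2 normalization.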
  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
    node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
    ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
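
  // The frontend radius is one-sided, while ACL expects the full window size,
  // hence norm_size = radius * 2 + 1.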
  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
                                         ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
}
void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_reg->getAclTensor(output_index)->handle();
  std::vector<arm_compute::ITensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
    {
      // This means that the high dimension's value is 1 and the input tensor
      // has dim_correction applied.
      acl_common::disableDimCorrection(input_tensor);
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);

  // Revert disabling applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->dimension(0) == 1)
    {
      acl_common::enableDimCorrection(input_tensor);
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input = _tensor_reg->getAclTensor(input_index)->handle();
  auto output = _tensor_reg->getAclTensor(output_index)->handle();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
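  // The PAD operand is a [rank, 2] table of (front, back) padding pairs, one
  // row per frontend axis; each row is remapped to its ACL axis below.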
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

    const auto frontend_layout = _current_op_seq_layout;
    const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
    const auto axis =
      acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  const auto input_type = _ctx.at(input_index).typeInfo();
  UNUSED_RELEASE(input_type);
  assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
  assert(input->info()->quantization_info() ==
         ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
  const auto pixel_value =
    ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());

  auto fn =
    acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
  auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
    node, _ctx, _tensor_reg, _current_op_seq_layout,
    acl_common::convertPoolType(node.param().op_type));

  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  const auto activation = node.param().activation;
  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(raw_fn)),
    ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    // WHCN -> CWHN
    pv = arm_compute::PermutationVector{2, 0, 1};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    // CWHN -> WHCN
    pv = arm_compute::PermutationVector{1, 2, 0};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);

  auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
    ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = input_tensor->layout();
  const auto reduce_axes =
    acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
  const auto reduce_type = node.param().reduce_type;
  const auto keep_dims = node.param().keep_dims;

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
                                                              keep_dims, output_tensor->handle());
  }
  else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
                                                             keep_dims, output_tensor->handle());
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
      input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
      acl_common::convertReduceType(reduce_type));
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // NOTE This operation must not change the layout from frontend to backend,
  //      so PermutationOperationPass makes the frontend and backend layouts the same.
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = output_tensor->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::NEScale>(
    ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
    ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
    ::arm_compute::SamplingPolicy::TOP_LEFT);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
    node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
    node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
    hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
  _return_fn = asAclFunction(std::move(copy_layer));

  auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
    hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, the optional dims_index is ignored, since the output tensor already
  // has the squeezed shape, produced by freezer and toco.
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  (void)dims;
  (void)ndim;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());
  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
  const auto beta = node.param().beta;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // Disable applied dim_correction
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  {
    // This means that the high dimension's value is 1 and the input tensor has
    // dim_correction applied.
    acl_common::disableDimCorrection(input_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    output_tensor->handle(), beta);

  // Revert disabling applied dim_correction
  if (input_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(input_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
    ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Split &node)
{
  // TODO Support this op by SubTensor
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
  if (!_ctx.at(axis_index).isConstant())
  {
    throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
  }

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  std::vector<arm_compute::ITensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_tensor->layout();
  auto axis = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn =
    acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
    lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);

  auto beginData_base = _ctx.at(begins_index).data()->base();
  auto sizeData_base = _ctx.at(sizes_index).data()->base();
  const int beginData_size = _ctx.at(begins_index).shape().num_elements();
  const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(beginData_size);
  UNUSED_RELEASE(sizeData_size);

  assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
  assert(beginData_size == input_rank);
  assert(sizeData_size == input_rank);

  assert(beginData_base != nullptr);
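  // The frontend describes the region as (begin, size) pairs, while NESlice
  // takes absolute end coordinates, so the loop below stores end = begin + size,
  // remapped to the ACL axis order.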
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                  .value();

    int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
    starts[axis] = begin_value;

    int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
    ends[axis] = begin_value + size_value;
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::NESlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);

  auto startData_base = _ctx.at(starts_index).data()->base();
  auto endData_base = _ctx.at(ends_index).data()->base();
  auto stridesData_base = _ctx.at(strides_index).data()->base();
  const int startData_size = _ctx.at(starts_index).shape().num_elements();
  const int endData_size = _ctx.at(ends_index).shape().num_elements();
  const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(startData_size);
  UNUSED_RELEASE(endData_size);
  UNUSED_RELEASE(stridesData_size);

  assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
  assert(startData_size == input_rank);
  assert(endData_size == input_rank);
  assert(stridesData_size == input_rank);

  assert(startData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                  .value();

    int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
    starts[axis] = start_value;

    int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
    ends[axis] = end_value;

    int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
    strides[axis] = strides_value;
  }

  // Set mask bits such as order of inputData
  // FIXME Take the layouts into account.
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
  const auto shrink_axis_mask =
    acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
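  // ReorderBits mirrors each mask into the reversed ACL axis order (e.g. for
  // input_rank == 4, frontend bit 0 becomes bit 3), matching the per-axis
  // remapping of starts/ends/strides above.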

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  // Disable applied dim_correction
  if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
  {
    // This means that the high dimension's value is 1 and the input tensor has
    // dim_correction applied.
    acl_common::disableDimCorrection(inputData_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
    begin_mask, end_mask, shrink_axis_mask);

  // Revert disabling applied dim_correction
  if (inputData_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(inputData_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);

  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
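  // A transpose convolution with VALID padding can cover at most
  // (ifm - 1) * stride + ker output pixels per axis; anything the requested
  // ofm size has beyond that is reported to ACL as the "invalid" right/bottom
  // area: invalid = ofm - ((ifm - 1) * stride + 1) - (ker - 1).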
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
      ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
    invalid_horizontal, invalid_vertical);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_tensor->layout();
  const auto rank = _ctx.at(ifm_idx).shape().rank();

  const auto &perms = _ctx.at(perm_idx);
  std::vector<int32_t> pv;
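  // An empty PERMUTATION operand means "reverse all axes"; in that case the
  // branch below builds pv = {rank - 1, ..., 1, 0} via iota + reverse.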
  if (perms.shape() == ir::Shape{0})
  {
    pv.resize(rank);
    std::iota(pv.begin(), pv.end(), 0);
    std::reverse(pv.begin(), pv.end());
  }
  else
  {
    pv = _ctx.at(perm_idx).asVector<int32_t>();
  }

  std::unique_ptr<arm_compute::IFunction> fn;
  if (rank == 1)
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }
  else if (rank == 2)
  {
    assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
    fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
                                                             ofm_tensor->handle());
  }
  else
  {
    auto backend_pv =
      acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), backend_pv);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  std::vector<arm_compute::ITensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  {
    // This means that the high dimension's value is 1 and the input tensor has
    // dim_correction applied.
    acl_common::disableDimCorrection(input_tensor);
  }

  auto fn =
    acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);

  // Revert disabling applied dim_correction
  if (input_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(input_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
  auto input1_tensor = _tensor_reg->getAclTensor(input1_index);

  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
    input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
    (arm_compute::ComparisonOperation)comparison_type);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::OneHot &node)
{
  const auto out_idx{node.getOutputs().at(0)};
  const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
  const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
  const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
  const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};

  auto output_tensor = _tensor_reg->getAclTensor(out_idx);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
  auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
  auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);

  const size_t output_rank = _ctx.at(out_idx).shape().rank();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = output_tensor->layout();
  int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
    indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
    offvalue_tensor->handle(), output_tensor->handle(), axis);
  _return_fn = asAclFunction(std::move(fn));
}

} // namespace acl_neon
} // namespace backend
} // namespace onert