/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "KernelGenerator.h"

#include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
#include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions

#include <AclActivationBuilder.h>
#include <AclFunction.h>

#include "ir/DataType.h"
#include "ir/InternalType.h"
#include "exec/NopFunction.h"
#include "util/logging.h"
#include "util/Utils.h"
#include "AclKernelGen.h"

namespace onert
{
namespace backend
{
namespace acl_neon
{

using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
  ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;

KernelGenerator::KernelGenerator(
  const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
  const std::shared_ptr<TensorBuilder> &tensor_builder,
  const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
  : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
    _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN)
{
  // DO NOTHING
}

void KernelGenerator::visit(const ir::OpSequence &op_seq)
{
  // TODO Move this to IKernelGenerator
  //      (all derivatives have the same implementation for this)
  assert(!_return_fn_seq);
  _return_fn_seq = std::make_unique<exec::FunctionSequence>();
  _return_fn_seq->enableDynamicShapeInferer(false);

  _current_layout = op_seq.getLayout();
  for (const auto &operation_idx : op_seq.operations())
  {
    const auto &node = _operations_ctx.at(operation_idx);
    node.accept(*this);
    _return_fn_seq->append(releaseFunction());
  }
}

void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto frontend_layout = _current_layout;
  auto backend_layout = ifm_tensor->layout();

  int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }
  assert(axis_value >= 0 && axis_value < ifm_rank);
  const auto fixed_axis =
    acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
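  // Illustrative note (assumption about ToARMComputeAxis): ACL indexes dimensions starting from
  // the innermost one, so when the frontend and backend layouts match, a frontend axis `a` on a
  // rank-`r` tensor maps to ACL axis `r - a - 1` (e.g. rank 4, axis 1 -> ACL axis 2); when the
  // layouts differ (NHWC vs NCHW), an additional permutation is applied on top of that.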
  auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
                                             : ::arm_compute::ReductionOperation::ARG_IDX_MIN;

  auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
    ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  const auto NNApiInputs = 2;
  if (node.getInputs().size() != NNApiInputs)
  {
    const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
    if (!_ctx.at(crops_index).isConstant())
    {
      throw std::runtime_error("Non-constant crops NYI for acl_neon backend BatchToSpaceND");
    }

    auto crops = _ctx.at(crops_index).asVector<int32_t>();
    for (auto crop : crops)
    {
      if (crop != 0)
      {
        throw std::runtime_error("Non-zero crops NYI for acl_neon backend BatchToSpaceND");
      }
    }
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);

  assert(_ctx.at(block_size_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().arithmetic_type)
  {
    case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
    {
      // For a scale of 1.0, the only RoundingPolicy allowed is TO_ZERO
      fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
        arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
      break;
    }
    default:
      assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
      break;
  }
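
  // The fused activation is not folded into the arithmetic kernel above; ActivationBuilder is
  // expected to emit a separate activation function (a no-op when the activation is NONE), and
  // the FunctionSequence below runs the arithmetic kernel followed by that activation.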
  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
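  // Sketch of the padding rule (assumption, following the usual TensorFlow-style convention):
  // for SAME, total = max(0, (ofm - 1) * stride + kernel - ifm) per spatial axis, split as
  // left/top = total / 2 and right/bottom = total - left/top; for VALID all four values are 0.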
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
    ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
    ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
    input_tensor->handle(), output_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto dilation = node.param().dilation;
  const auto padding =
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
                         ker_height, dilation.width_factor, dilation.height_factor);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);
  const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);

  auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
    conv_info, multiplier, act_info, dilation_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat eliminated, return a NOP IFunction
    VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
  std::vector<::arm_compute::ITensor *> input_tensors;
  for (const auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensors.at(0),
                                                        output_tensor->handle());
  }
  else
  {
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_layout;
    const auto backend_layout = output_tensor->layout();
    const auto fixed_axis =
      acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
      input_tensors, output_tensor->handle(), fixed_axis);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
    node.param().op_type, node.param().alpha, node.param().beta);

  std::unique_ptr<arm_compute::IFunction> fn =
    acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
                                                              ofm_tensor->handle(), act_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    default:
    {
      std::string err_msg("acl_neon KernelGenerator : " + node.name() +
                          " is not an elementwise-binary operation");
      assert(false && err_msg.c_str());
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseUnary::Type::ABS:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    case ir::operation::ElementwiseUnary::Type::CAST:
    {
      if (input_tensor->data_type() == output_tensor->data_type())
      {
        fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
                                                            output_tensor->handle());
      }
      else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
      {
        fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
                                                                output_tensor->handle());
      }
      else
      {
        fn = acl_common::generateLayer<arm_compute::NECast>(
          input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
      }
      break;
    }
    case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
    {
      fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
                                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::EXP:
    {
      fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::FLOOR:
    {
      fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
                                                           output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
    {
      fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::NEG:
    {
      fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::RSQRT:
    {
      fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::SQRT:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    default:
    {
      throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
                               " is not supported yet");
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
    values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  const auto activation = node.param().activation;
  if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
    throw std::runtime_error(
      "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights are not supported.");

  auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
                                                ::arm_compute::NEFullyConnectedReshapingLayer>(
    node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
  _return_fn = std::make_unique<exec::FunctionSequence>(
    std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hits_tensor = _tensor_reg->getAclTensor(hits_index);

  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
    lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
    output_tensor->handle(), hits_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  // Converting in reverse order
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
  const auto backend_layout = ofm_tensor->layout();
  UNUSED_RELEASE(backend_layout);

  // NOTE The frontend layout and backend layout must be the same for this operation.
  //      If not, we would have to add a stage to permute the output tensor, which would not be
  //      efficient even if it worked. It would be better to force these backend tensors to share
  //      the same layout.
  //      There is one more thing to consider: this operation depends on the layout of the model.
  //      For example, if an NHWC model has this operation with output rank == 4, indices
  //      rank == 2 and axis == 2, the operation should act on the W and C axes, but W and C are
  //      not adjacent in NCHW, so an NCHW backend cannot handle this case.
  assert(backend_layout == ifm_tensor->layout());
  assert(backend_layout == indices_tensor->layout());
  assert(ifm_rank < 4 || _current_layout == backend_layout);

  // input is n-D, indices k-D, output is (n + k - 1)-D
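  // e.g. gathering a [4, 3, 2] input (n = 3) with [5, 6] indices (k = 2) along axis 0 yields a
  // [5, 6, 3, 2] output (rank n + k - 1 = 4).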
  size_t n = ifm_rank;
  assert(n == ifm_tensor->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_tensor->num_dimensions());

  // Disable applied dim_correction
  if (n != ifm_tensor->info()->num_dimensions())
  {
    // This means the highest dimension is 1 and dim_correction has been applied to the ifm tensor
    acl_common::disableDimCorrection(ifm_tensor);
  }
  if (k != indices_tensor->info()->num_dimensions())
  {
    // This means the highest dimension is 1 and dim_correction has been applied to the indices
    // tensor
    acl_common::disableDimCorrection(indices_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
    ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);

  // Revert disabling applied dim_correction
  if (ifm_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(ifm_tensor);
  }
  if (indices_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(indices_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
  auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
    ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
    epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0,
  // while L2 Normalization always performs the reduction along the depth axis.
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
  // choosing normalization parameters as below

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  int32_t radius =
    2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f; // In the implementation, this makes alpha_ become 1
  float beta = 0.5f;  // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;  // Don't offset the reduction.
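  // Rough derivation (assuming ACL's cross-map formula out = in / (kappa + coeff * sum(in^2))^beta,
  // with coeff == alpha when is_scaled is false): a window that covers the whole depth axis with
  // alpha = 1, beta = 0.5 and bias (kappa) = 0 reduces this to in / sqrt(sum_depth(in^2)), i.e. an
  // L2 normalization along the last axis.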

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
    node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
    ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
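  // NormalizationLayerInfo takes the full window size rather than a radius, hence the
  // norm_size of radius * 2 + 1 above (the + 1 accounts for the center element).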

  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
                                         ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
}

void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_reg->getAclTensor(output_index)->handle();
  std::vector<arm_compute::ITensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
    {
      // This means the highest dimension is 1 and dim_correction has been applied to the input
      // tensor
      acl_common::disableDimCorrection(input_tensor);
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);

  // Revert disabling applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->dimension(0) == 1)
    {
      acl_common::enableDimCorrection(input_tensor);
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input = _tensor_reg->getAclTensor(input_index)->handle();
  auto output = _tensor_reg->getAclTensor(output_index)->handle();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
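
  // The PAD input is a [rank, 2] tensor of {front, back} pairs, one pair per input axis; the
  // loop below copies each pair into padding_list at the corresponding (layout-converted) ACL
  // axis. e.g. a rank-2 pad of {{1, 2}, {3, 4}} pads axis 0 by 1 front / 2 back and axis 1 by
  // 3 front / 4 back, before the axis conversion.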
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

    const auto frontend_layout = _current_layout;
    const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
    const auto axis =
      acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  const auto input_type = _ctx.at(input_index).typeInfo();
  UNUSED_RELEASE(input_type);
  assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
  assert(input->info()->quantization_info() ==
         ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
  const auto pixel_value =
    ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());

  auto fn =
    acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
  auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
    node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));

  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  const auto activation = node.param().activation;
  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(raw_fn)),
    ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
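  // In ACL the dimensions are ordered innermost-first, so an NCHW frontend tensor is seen as
  // (W, H, C, N) and an NHWC one as (C, W, H, N); the permutation vectors below are presumably
  // meant to express WHCN -> CWHN (NCHW_TO_NHWC) and CWHN -> WHCN (NHWC_TO_NCHW) in that order.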
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    pv = arm_compute::PermutationVector{2, 0, 1};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    pv = arm_compute::PermutationVector{1, 2, 0};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);

  auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
    ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_layout;
  const auto backend_layout = input_tensor->layout();
  const auto reduce_axes =
    acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
  const auto reduce_type = node.param().reduce_type;
  const auto keep_dims = node.param().keep_dims;

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
                                                              keep_dims, output_tensor->handle());
  }
  else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
                                                             keep_dims, output_tensor->handle());
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
      input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
      acl_common::convertReduceType(reduce_type));
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // NOTE This operation must not change the layout from frontend to backend,
  //      so PermutationOperationPass makes the frontend and backend layouts the same.
  const auto frontend_layout = _current_layout;
  const auto backend_layout = output_tensor->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::NEScale>(
    ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
    ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
    ::arm_compute::SamplingPolicy::TOP_LEFT);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
    node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
    node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
    hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
  _return_fn = asAclFunction(std::move(copy_layer));

  auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
    hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, the optional dims_index is ignored since the output tensor already has the
  // squeezed shape applied by the freezer and toco.
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  (void)dims;
  (void)ndim;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
  const auto beta = node.param().beta;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // Disable applied dim_correction
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  {
    // This means the highest dimension is 1 and dim_correction has been applied to the input
    // tensor
    acl_common::disableDimCorrection(input_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    output_tensor->handle(), beta);

  // Revert disabling applied dim_correction
  if (input_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(input_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
    ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Split &node)
{
  // TODO Support this op by SubTensor
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
  if (!_ctx.at(axis_index).isConstant())
  {
    throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
  }

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  std::vector<arm_compute::ITensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = ifm_tensor->layout();
  auto axis = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn =
    acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
    lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  {
    auto beginData_base = _ctx.at(begins_index).data()->base();
    auto sizeData_base = _ctx.at(sizes_index).data()->base();
    const int beginData_size = _ctx.at(begins_index).shape().num_elements();
    const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

    using ir::DataType;

    UNUSED_RELEASE(beginData_size);
    UNUSED_RELEASE(sizeData_size);

    assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
    assert(beginData_size == input_rank);
    assert(sizeData_size == input_rank);

    assert(beginData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                    .value();

      int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
      starts[axis] = begin_value;

      int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
      ends[axis] = begin_value + size_value;
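      // NESlice takes absolute end coordinates rather than sizes, hence end = begin + size
      // (e.g. begin 1 with size 3 selects elements 1..3, i.e. end coordinate 4).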
    }
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::NESlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);
  {
    auto startData_base = _ctx.at(starts_index).data()->base();
    auto endData_base = _ctx.at(ends_index).data()->base();
    auto stridesData_base = _ctx.at(strides_index).data()->base();
    const int startData_size = _ctx.at(starts_index).shape().num_elements();
    const int endData_size = _ctx.at(ends_index).shape().num_elements();
    const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

    using ir::DataType;

    UNUSED_RELEASE(startData_size);
    UNUSED_RELEASE(endData_size);
    UNUSED_RELEASE(stridesData_size);

    assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
    assert(startData_size == input_rank);
    assert(endData_size == input_rank);
    assert(stridesData_size == input_rank);

    assert(startData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                    .value();

      int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
      starts[axis] = start_value;

      int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
      ends[axis] = end_value;

      int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
      strides[axis] = strides_value;
    }
  }

  // Set mask bits, reordered to match the order of inputData
  // FIXME Take the layouts into account.
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
  const auto shrink_axis_mask =
    acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
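  // ReorderBits presumably mirrors each mask so that the bit for frontend axis a moves to bit
  // rank - a - 1, matching the reversed dimension order used for starts/ends/strides above;
  // the layout permutation itself is not applied to the masks, hence the FIXME.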

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  // Disable applied dim_correction
  if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
  {
    // This means the highest dimension is 1 and dim_correction has been applied to the input
    // tensor
    acl_common::disableDimCorrection(inputData_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
    begin_mask, end_mask, shrink_axis_mask);

  // Revert disabling applied dim_correction
  if (inputData_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(inputData_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);

  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
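  // For VALID padding, a transposed convolution can only produce (ifm - 1) * stride + kernel
  // elements per spatial axis; invalid_horizontal/vertical below measure how many extra
  // columns/rows the declared ofm has beyond that, i.e. ofm - ((ifm - 1) * stride + 1) - (kernel - 1),
  // which NETransposeConvLayer is told to treat as invalid.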
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
      ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
    invalid_horizontal, invalid_vertical);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = ifm_tensor->layout();
  const auto rank = _ctx.at(ifm_idx).shape().rank();

  const auto &perms = _ctx.at(perm_idx);
  std::vector<int32_t> pv;
  if (perms.shape() == ir::Shape{0})
  {
    pv.resize(rank);
    std::iota(pv.begin(), pv.end(), 0);
    std::reverse(pv.begin(), pv.end());
  }
  else
  {
    pv = _ctx.at(perm_idx).asVector<int32_t>();
  }

  std::unique_ptr<arm_compute::IFunction> fn;
  if (rank == 1)
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }
  else if (rank == 2)
  {
    assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
    fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
                                                             ofm_tensor->handle());
  }
  else
  {
    auto backend_pv =
      acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), backend_pv);
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  std::vector<arm_compute::ITensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  {
    // This means the highest dimension is 1 and dim_correction has been applied to the input
    // tensor
    acl_common::disableDimCorrection(input_tensor);
  }

  auto fn =
    acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);

  // Revert disabling applied dim_correction
  if (input_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(input_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
  auto input1_tensor = _tensor_reg->getAclTensor(input1_index);

  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
    input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
    (arm_compute::ComparisonOperation)comparison_type);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::OneHot &node)
{
  const auto out_idx{node.getOutputs().at(0)};
  const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
  const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
  const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
  const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};

  auto output_tensor = _tensor_reg->getAclTensor(out_idx);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
  auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
  auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);

  const size_t output_rank = _ctx.at(out_idx).shape().rank();
  const auto frontend_layout = _current_layout;
  const auto backend_layout = output_tensor->layout();
  int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
    indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
    offvalue_tensor->handle(), output_tensor->handle(), axis);
  _return_fn = asAclFunction(std::move(fn));
}

} // namespace acl_neon
} // namespace backend
} // namespace onert