/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "KernelGenerator.h"

#include <arm_compute/runtime/NEON/NEFunctions.h>   // Include all ARM Compute NEON functions
#include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions

#include <AclActivationBuilder.h>
#include <AclFunction.h>
#include <Convert.h>
#include <Swizzle.h>

#include "ir/DataType.h"
#include "ir/InternalType.h"
#include "exec/NopFunction.h"
#include "util/logging.h"
#include "util/Utils.h"
#include "AclKernelGen.h"

namespace onert
{
namespace backend
{
namespace acl_neon
{
using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
  ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
using ir::DataType;
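
// Editorial note: ActivationBuilder (AclActivationBuilder) is used below to
// append a fused IR activation (e.g. RELU) as a separate NEActivationLayer
// behind kernels that are configured here without a fused activation. This is
// a summary of how it is used in this file, not its full contract.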

KernelGenerator::KernelGenerator(
  const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
  const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
  : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()),
    _operations_ctx(graph.operations()), _current_layout{graph.layout()},
    _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
{
  // DO NOTHING
}

std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
  auto ret = std::make_unique<exec::FunctionSequence>();
  ret->enableDynamicShapeInferer(false);

  const auto &op = _graph.operations().at(ind);
  op.accept(*this);
  ret->append(releaseFunction());
  return ret;
}
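
// Each visit() below lowers one IR operation into an arm_compute function and
// leaves it in _return_fn, which generate() picks up via releaseFunction().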

void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto frontend_layout = _current_layout;
  auto backend_layout = ifm_tensor->layout();

  int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }
  assert(axis_value >= 0 && axis_value < ifm_rank);
  const auto fixed_axis =
    acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
  auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
                                             : ::arm_compute::ReductionOperation::ARG_IDX_MIN;

  auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
    ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);

  _return_fn = asAclFunction(std::move(fn));
}
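
// Editorial note on the axis handling above (the same pattern appears in
// Concat, Pack, Split, OneHot and others below): ACL indexes dimensions in
// reverse of the frontend order, so for a rank-4 tensor frontend axis 1 maps
// to ACL axis 4 - 1 - 1 = 2, and ToARMComputeAxis additionally permutes the
// axis when the frontend and backend layouts differ (NHWC vs NCHW).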

void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  const auto NNApiInputs = 2;
  if (node.getInputs().size() != NNApiInputs)
  {
    const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
    if (!_ctx.at(crops_index).isConstant())
    {
      throw std::runtime_error("Non-constant crops NYI for acl_neon backend BatchToSpaceND");
    }

    auto crops = _ctx.at(crops_index).asVector<int32_t>();
    for (auto &&crop : crops)
    {
      if (crop != 0)
      {
        throw std::runtime_error("Non-zero crops NYI for acl_neon backend BatchToSpaceND");
      }
    }
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);

  assert(_ctx.at(block_size_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().arithmetic_type)
  {
    case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
    {
      // For scale 1.0, the only allowed RoundingPolicy is TO_ZERO
      fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
        arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
      break;
    }
    default:
      assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
      break;
  }

  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
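
// Editorial note: ConvertPolicy::SATURATE above clamps out-of-range results
// instead of wrapping, which matters for quantized types. The IR activation
// runs as a second function in the sequence because these kernels are
// configured here without a fused ActivationLayerInfo.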

void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding =
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
    ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
    ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclFunction(std::move(fn));
}
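
// Editorial sketch of the padding arithmetic used above (ir::calculatePadding
// is authoritative): VALID padding is all zeros, while SAME padding
// distributes max((ofm - 1) * stride + ker - ifm, 0) per axis between the
// leading and trailing sides.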

void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
    input_tensor->handle(), output_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto dilation = node.param().dilation;
  const auto padding =
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
                         dilation.width_factor, dilation.height_factor);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);
  const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);

  auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
    conv_info, multiplier, act_info, dilation_info);

  _return_fn = asAclFunction(std::move(fn));
}
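
// Note for the DepthwiseConv2D lowering above: `multiplier` is the channel
// multiplier, i.e. each input channel produces `multiplier` output channels,
// so depth_out == depth_in * multiplier in the [1, kh, kw, depth_out] kernel.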

void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat eliminated, return a NOP IFunction
    VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
  std::vector<const ::arm_compute::ITensor *> input_tensors;
  for (const auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    ::arm_compute::ITensor *input_tensor = _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
    fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor, output_tensor->handle());
  }
  else
  {
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_layout;
    const auto backend_layout = output_tensor->layout();
    const auto fixed_axis =
      acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
      input_tensors, output_tensor->handle(), fixed_axis);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const ::arm_compute::ActivationLayerInfo act_info =
    acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);

  std::unique_ptr<arm_compute::IFunction> fn =
    acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
                                                              ofm_tensor->handle(), act_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    default:
    {
      std::string err_msg("acl_neon KernelGenerator : " + node.name() +
                          " is not an elementwise-binary operation");
      assert(false && err_msg.c_str());
      break;
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseUnary::Type::ABS:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    case ir::operation::ElementwiseUnary::Type::CAST:
    {
      if (input_tensor->data_type() == output_tensor->data_type())
      {
        fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
                                                            output_tensor->handle());
      }
      else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
      {
        fn = acl_common::generateLayer<arm_compute::NECastBool>(input_tensor->handle(),
                                                                output_tensor->handle());
      }
      else
      {
        fn = acl_common::generateLayer<arm_compute::NECast>(
          input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
      }
      break;
    }
    case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
    {
      fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
                                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::EXP:
    {
      fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::FLOOR:
    {
      fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
                                                           output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
    {
      fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::NEG:
    {
      fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::RSQRT:
    {
      fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::SQRT:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    default:
    {
      throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
                               " is not supported yet");
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
    values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  const auto activation = node.param().activation;
  if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
    throw std::runtime_error(
      "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights are not supported.");

  auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
                                                ::arm_compute::NEFullyConnectedReshapingLayer>(
    node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
  _return_fn = std::make_unique<exec::FunctionSequence>(
    std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hits_tensor = _tensor_reg->getAclTensor(hits_index);

  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
    lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
    output_tensor->handle(), hits_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
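
// Editorial note on the dim_correction toggling used by Gather below (and by
// Pack, StridedSlice and Unpack): ACL may "correct" away high dimensions of
// extent 1, so a tensor's ACL rank can be smaller than its IR rank.
// disableDimCorrection() temporarily restores the full rank while a kernel is
// configured, and enableDimCorrection() reverts it afterwards.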

void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  // Converting in reverse order
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_index);
  const auto backend_layout = ofm_tensor->layout();
  UNUSED_RELEASE(backend_layout);

  // NOTE The frontend layout and backend layout must be the same for this operation.
  //      If not the same, we have to add a stage(?) to perform permutation of output tensor. It
  //      is not efficient even if it works well. If so, it would be better to set the
  //      layout of these backend tensors to the same layout.
  //      There is also one thing we have to think about. This operation depends on the layout of
  //      a model. For example, if a model in NHWC has this operation as output rank == 4, indices
  //      rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
  //      and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
  assert(backend_layout == ifm_tensor->layout());
  assert(backend_layout == indices_tensor->layout());
  assert(ifm_rank < 4 || _current_layout == backend_layout);

  // input is n-D, indices k-D, output is (n + k - 1)-D
  size_t n = ifm_rank;
  assert(n == ifm_tensor->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_tensor->num_dimensions());

  // Disable applied dim_correction
  if (n != ifm_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
    acl_common::disableDimCorrection(ifm_tensor);
  }
  if (k != indices_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and indices tensor is applied dim_correction
    acl_common::disableDimCorrection(indices_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
    ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);

  // Revert disabling applied dim_correction
  if (ifm_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(ifm_tensor);
  }
  if (indices_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(indices_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
  auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
    ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
    epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0
  // L2 Normalization always performs the reduction along the depth axis
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
  // choosing normalization parameters as below

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  const auto radius =
    2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;                          // In the implementation to make alpha_ become 1
  float beta = 0.5f;                           // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;                           // Don't offset the reduction.

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}
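
// Editorial derivation for the parameters above: with is_scaled == false,
// CROSS_MAP normalization computes x / (bias + alpha * sum(x^2))^beta over
// the window, so a window spanning the whole depth with alpha = 1, beta = 0.5
// and bias = 0 reduces to x / sqrt(sum(x^2)), i.e. L2 normalization.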

void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
    node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
    ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
                                         ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
}

void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_reg->getAclTensor(output_index)->handle();
  std::vector<arm_compute::ITensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
    {
      // This means that high dimension's value is 1 and input tensor is applied dim_correction
      acl_common::disableDimCorrection(input_tensor);
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);

  // Revert disabling applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->dimension(0) == 1)
    {
      acl_common::enableDimCorrection(input_tensor);
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}
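
// Note for the Pad lowering below: the PAD operand is a [rank, 2] tensor of
// (front, back) pairs in frontend axis order; each pair is re-indexed through
// ToARMComputeAxis into the ACL PaddingList, and the fill value is a zero
// matching the input's data type and quantization info.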

void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input = _tensor_reg->getAclTensor(input_index)->handle();
  auto output = _tensor_reg->getAclTensor(output_index)->handle();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

    const auto frontend_layout = _current_layout;
    const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
    const auto axis =
      acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  const auto input_type = _ctx.at(input_index).typeInfo();
  UNUSED_RELEASE(input_type);
  assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
  assert(input->info()->quantization_info() ==
         ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point()));
  const auto pixel_value =
    ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());

  auto fn =
    acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
  auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
    node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));

  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  const auto activation = node.param().activation;
  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(raw_fn)),
    ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    // WHCN -> CWHN
    pv = arm_compute::PermutationVector{2, 0, 1};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    // CWHN -> WHCN
    pv = arm_compute::PermutationVector{1, 2, 0};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);

  auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
    ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_layout;
  const auto backend_layout = input_tensor->layout();
  const auto reduce_axes =
    acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
  const auto reduce_type = node.param().reduce_type;
  const auto keep_dims = node.param().keep_dims;

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
                                                              keep_dims, output_tensor->handle());
  }
  else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
                                                             keep_dims, output_tensor->handle());
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
      input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
      acl_common::convertReduceType(reduce_type));
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // NOTE This operation must not change the layout from frontend to backend,
  //      so PermutationOperationPass makes the frontend and backend layouts the same.
  const auto frontend_layout = _current_layout;
  const auto backend_layout = output_tensor->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::NEScale>(
    ifm_tensor->handle(), ofm_tensor->handle(),
    ::arm_compute::ScaleKernelInfo{::arm_compute::InterpolationPolicy::BILINEAR,
                                   ::arm_compute::BorderMode::REPLICATE,
                                   ::arm_compute::PixelValue(0.f),
                                   ::arm_compute::SamplingPolicy::TOP_LEFT, false /*use padding*/});

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
    node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
    node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
    hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
  _return_fn = asAclFunction(std::move(copy_layer));

  auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
    hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, the optional dims_index is ignored since the output tensor already has the
  // squeezed shape produced by freezer and toco.
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  (void)dims;
  (void)ndim;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
  const auto beta = node.param().beta;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // NOTE NESoftmaxLayer's default axis is -1
  auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    output_tensor->handle(), beta);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
    ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Split &node)
{
  // TODO Support this op by SubTensor
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
  if (!_ctx.at(axis_index).isConstant())
  {
    throw std::runtime_error("Non-constant axis_index NYI for acl_neon backend");
  }

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  std::vector<arm_compute::ITensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = ifm_tensor->layout();
  auto axis = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn =
    acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
    lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  {
    auto beginData_base = _ctx.at(begins_index).data()->base();
    auto sizeData_base = _ctx.at(sizes_index).data()->base();
    const int beginData_size = _ctx.at(begins_index).shape().num_elements();
    const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

    UNUSED_RELEASE(beginData_size);
    UNUSED_RELEASE(sizeData_size);

    assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
    assert(beginData_size == input_rank);
    assert(sizeData_size == input_rank);

    assert(beginData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                    .value();

      int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
      starts[axis] = begin_value;

      int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
      ends[axis] = begin_value + size_value;
    }
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::NESlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);

  _return_fn = asAclFunction(std::move(fn));
}
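
// Editorial note for the StridedSlice lowering below: the begin/end/
// shrink-axis masks carry one bit per frontend axis, so ReorderBits mirrors
// them into ACL's reversed axis order to match the reordered
// starts/ends/strides (layout permutation is not applied there; see the FIXME).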

void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);
  {
    auto startData_base = _ctx.at(starts_index).data()->base();
    auto endData_base = _ctx.at(ends_index).data()->base();
    auto stridesData_base = _ctx.at(strides_index).data()->base();
    const int startData_size = _ctx.at(starts_index).shape().num_elements();
    const int endData_size = _ctx.at(ends_index).shape().num_elements();
    const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

    UNUSED_RELEASE(startData_size);
    UNUSED_RELEASE(endData_size);
    UNUSED_RELEASE(stridesData_size);

    assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
    assert(startData_size == input_rank);
    assert(endData_size == input_rank);
    assert(stridesData_size == input_rank);

    assert(startData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                    .value();

      int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
      starts[axis] = start_value;

      int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
      ends[axis] = end_value;

      int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
      strides[axis] = strides_value;
    }
  }

  // Set mask bits such as order of inputData
  // FIXME Take the layouts into account.
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
  const auto shrink_axis_mask =
    acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  // Disable applied dim_correction
  if (static_cast<size_t>(inputData_tensor->getShape().rank()) !=
      inputData_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and input tensor is applied dim_correction
    acl_common::disableDimCorrection(inputData_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
    begin_mask, end_mask, shrink_axis_mask);

  // Revert disabling applied dim_correction
  if (inputData_tensor->getShape().dim(0) == 1)
  {
    acl_common::enableDimCorrection(inputData_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
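
// Editorial note for the TransposeConv lowering below: (1 + (ifm - 1) *
// stride) + (ker - 1) simplifies to (ifm - 1) * stride + ker, the natural
// deconvolution output extent; invalid_horizontal/vertical appear to measure
// the requested output's difference from that extent so ACL can treat the
// border as invalid.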

void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);

  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
      ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
    invalid_horizontal, invalid_vertical);

  _return_fn = asAclFunction(std::move(fn));
}
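
// Note for the Transpose lowering below: an empty PERMUTATION operand (shape
// {0}) follows the TensorFlow convention of reversing all dimensions, which
// is what the iota + reverse sequence constructs.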

void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = ifm_tensor->layout();
  const auto rank = _ctx.at(ifm_idx).shape().rank();

  const auto &perms = _ctx.at(perm_idx);
  std::vector<int32_t> pv;
  if (perms.shape() == ir::Shape{0})
  {
    pv.resize(rank);
    std::iota(pv.begin(), pv.end(), 0);
    std::reverse(pv.begin(), pv.end());
  }
  else
  {
    pv = _ctx.at(perm_idx).asVector<int32_t>();
  }

  std::unique_ptr<arm_compute::IFunction> fn;
  if (rank == 1)
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }
  else if (rank == 2)
  {
    assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
    fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
                                                             ofm_tensor->handle());
  }
  else
  {
    const auto backend_pv =
      acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), backend_pv);
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  std::vector<arm_compute::ITensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  if (static_cast<size_t>(input_tensor->getShape().rank()) !=
      input_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and input tensor is applied dim_correction
    acl_common::disableDimCorrection(input_tensor);
  }

  auto fn =
    acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);

  // Revert disabling applied dim_correction
  if (input_tensor->getShape().dim(0) == 1)
  {
    acl_common::enableDimCorrection(input_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
  auto input1_tensor = _tensor_reg->getAclTensor(input1_index);

  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
    input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
    (arm_compute::ComparisonOperation)comparison_type);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::OneHot &node)
{
  const auto out_idx{node.getOutputs().at(0)};
  const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
  const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
  const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
  const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};

  auto output_tensor = _tensor_reg->getAclTensor(out_idx);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
  auto depth_tensor = _tensor_reg->getAclTensor(depth_idx);
  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);
  auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);

  const size_t output_rank = _ctx.at(out_idx).shape().rank();
  const auto frontend_layout = _current_layout;
  const auto backend_layout = output_tensor->layout();
  int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
    indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
    offvalue_tensor->handle(), output_tensor->handle(), axis);
  _return_fn = asAclFunction(std::move(fn));
}

} // namespace acl_neon
} // namespace backend
} // namespace onert