/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #include "KernelGenerator.h"
19 #include <arm_compute/runtime/NEON/NEFunctions.h> // Include all ARM Compute NEON functions
20 #include <arm_compute/runtime/NEON/NEFunctionsEx.h> // Include all ARM Compute EX NEON functions
21 #include <arm_compute/runtime/CPP/functions/CPPOneHotEx.h>
23 #include <AclActivationBuilder.h>
24 #include <AclFunction.h>
29 #include "ir/DataType.h"
30 #include "ir/InternalType.h"
31 #include "exec/NopFunction.h"
32 #include "util/logging.h"
33 #include "util/Utils.h"
34 #include "AclKernelGen.h"

using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
    ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
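
// NOTE ActivationBuilder appends a standalone NEActivationLayer after an operation whose ACL
//      function has no fused-activation parameter, so a fused activation from the IR is still
//      applied to that operation's output.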

KernelGenerator::KernelGenerator(
    const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
    const std::shared_ptr<TensorBuilder> &tensor_builder,
    const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
    : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
      _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN)
{
  // DO NOTHING
}

void KernelGenerator::visit(const ir::OpSequence &op_seq)
{
  // TODO Move this to IKernelGenerator
  //      (all derivatives have the same implementation for this)
  assert(!_return_fn_seq);
  _return_fn_seq = std::make_unique<exec::FunctionSequence>();
  _return_fn_seq->enableDynamicShapeInferer(false);

  _current_op_seq_layout = op_seq.getLayout();
  for (const auto &operation_idx : op_seq.operations())
  {
    const auto &node = _operations_ctx.at(operation_idx);
    node.accept(*this);
    _return_fn_seq->append(releaseFunction());
  }
}
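
// NOTE Each visit() below builds an arm_compute::IFunction for one operation and stores it in
//      _return_fn; releaseFunction() then hands it over to the enclosing FunctionSequence. The
//      functions are only configured here; they run later when the executor invokes them.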

void KernelGenerator::visit(const ir::operation::ArgMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto frontend_layout = _current_op_seq_layout;
  auto backend_layout = ifm_tensor->layout();

  int axis_value = node.param().axis;
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }
  assert(axis_value >= 0 && axis_value < ifm_rank);
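  // ToARMComputeAxis maps a frontend axis to the backend axis order: ACL stores dimensions in
  // reverse order relative to the IR, and the mapping also accounts for a possible layout
  // difference (e.g. NHWC frontend vs. NCHW backend) between the two.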
  const auto fixed_axis =
      acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();

  auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
      ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(),
      arm_compute::ReductionOperation::ARG_IDX_MAX);

  _return_fn = asAclFunction(std::move(fn));
}
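
// NOTE NEBatchToSpaceLayer takes the block size as a tensor; the assert below checks that it
//      is a constant operand, since a block size known only at run time is not handled here.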

void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();

  assert(_ctx.at(block_size_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
      ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().arithmetic_type)
  {
    case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
          arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
    {
      fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
          arm_compute::ConvertPolicy::SATURATE);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
    {
      // For a scale of 1.0, the only allowed RoundingPolicy is TO_ZERO
      fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
          arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
      break;
    }
    default:
      assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
      break;
  }

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
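
// NOTE For Conv2D, the IR padding/stride parameters are converted to an ACL PadStrideInfo and
//      the fused activation is passed to NEConvolutionLayer as an ActivationLayerInfo, so no
//      separate activation function has to be appended afterwards.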

void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
      ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
      ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
      input_tensor->handle(), output_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
      ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
      conv_info, multiplier, act_info);

  _return_fn = asAclFunction(std::move(fn));
}
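
// NOTE Concat can be eliminated at compile time when every input is registered as a sub-tensor
//      of the output; the inputs then already alias the right region of the output buffer, so
//      only a NOP function has to be returned.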

void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat eliminated, return a NOP IFunction
    VERBOSE(acl_neon_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  std::vector<::arm_compute::ITensor *> input_tensors;
  for (const auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensors.at(0),
                                                        output_tensor->handle());
  }
  else
  {
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_op_seq_layout;
    const auto backend_layout = output_tensor->layout();
    const auto fixed_axis =
        acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
        input_tensors, output_tensor->handle(), fixed_axis);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
      node.param().op_type, node.param().alpha, node.param().beta);

  std::unique_ptr<arm_compute::IFunction> fn;
  if (node.param().op_type == ir::operation::ElementwiseActivation::Type::LOGISTIC)
  {
    // NOTE NEActivationLayer can produce erroneous results for LOGISTIC. This is caused by
    //      'vexpq_f32()': the NEON function returns 'NaN' instead of 'INF' for a value outside
    //      the range representable in float, and the 'NaN' then propagates into the result of
    //      this op.
    fn = acl_common::generateLayer<arm_compute::NEActivationLayerEx>(
        ifm_tensor->handle(), ofm_tensor->handle(), act_info);
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
                                                                   ofm_tensor->handle(), act_info);
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
    {
      fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
    {
      fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    default:
    {
      std::string err_msg("acl_neon KernelGenerator : " + node.name() +
                          " is not a supported elementwise-binary operation");
      assert(false && err_msg.c_str());
      break;
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}
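
// NOTE For ElementwiseUnary, CAST between identical data types degenerates to a plain copy, so
//      NECopy is used instead of NECast in that case.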

void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseUnary::Type::ABS:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
          ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
          input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    case ir::operation::ElementwiseUnary::Type::CAST:
    {
      if (input_tensor->data_type() == output_tensor->data_type())
      {
        fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor->handle(),
                                                            output_tensor->handle());
      }
      else
      {
        fn = acl_common::generateLayer<arm_compute::NECast>(
            input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
      }
      break;
    }
    case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
    {
      fn = acl_common::generateLayer<arm_compute::NEDequantizationLayer>(input_tensor->handle(),
                                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::EXP:
    {
      fn = acl_common::generateLayer<arm_compute::NEExpLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::FLOOR:
    {
      fn = acl_common::generateLayer<arm_compute::NEFloor>(input_tensor->handle(),
                                                           output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
    {
      fn = acl_common::generateLayer<arm_compute::NEBitwiseNot>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::NEG:
    {
      fn = acl_common::generateLayer<arm_compute::NENegLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::RSQRT:
    {
      fn = acl_common::generateLayer<arm_compute::NERsqrtLayer>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::SQRT:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
          ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

      fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
          input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    default:
    {
      throw std::runtime_error("acl_neon KernelGenerator : " + node.name() +
                               " is not supported yet");
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
      values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  const auto activation = node.param().activation;

  auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
                                                ::arm_compute::NEFullyConnectedReshapingLayer>(
      node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout);
  _return_fn = std::make_unique<exec::FunctionSequence>(
      std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get();

  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
  auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get();
  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
      lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
      output_tensor->handle(), hits_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  // Converting in reverse order
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get();
  const auto backend_layout = ofm_tensor->layout();
  UNUSED_RELEASE(backend_layout);

  // NOTE The frontend layout and backend layout must be the same for this operation.
  //      If they were not, a stage to permute the output tensor would have to be added, which
  //      is not efficient even if it works. In that case it would be better to give these
  //      backend tensors the same layout in the first place.
  //      There is one more thing to consider: this operation depends on the layout of the
  //      model. For example, if a model in NHWC has this operation with output rank == 4,
  //      indices rank == 2 and axis == 2, the operation works on the axes W and C, but W and C
  //      are not adjacent in NCHW, so a backend in NCHW cannot handle this case.
  assert(backend_layout == ifm_tensor->layout());
  assert(backend_layout == indices_tensor->layout());
  assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);

  // input is n-D, indices k-D, output is (n + k - 1)-D
  size_t n = ifm_rank;
  assert(n == ifm_tensor->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_tensor->num_dimensions());

  // Disable applied dim_correction
  if (n != ifm_tensor->info()->num_dimensions())
  {
    // This means that a high dimension's value is 1 and dim_correction was applied to the ifm
    // tensor.
    const auto &ifm = _ctx.at(ifm_index);
    ifm_tensor->info()->set_tensor_shape(
        acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
  }
  if (k != indices_tensor->info()->num_dimensions())
  {
    // This means that a high dimension's value is 1 and dim_correction was applied to the
    // indices tensor.
    const auto &indices = _ctx.at(indices_index);
    indices_tensor->info()->set_tensor_shape(
        acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
  }

  auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
      ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);

  // acl_neon does not revert the disabled dim_correction because acl_neon's kernels would use
  // arm_compute::TensorInfo::offset_element_in_bytes(), which would cause an error when a
  // kernel accesses a high dimension whose value is 1.
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get();
  auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get();
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
      ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
      epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0, while
  // L2 Normalization always performs the reduction along the depth axis.
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
  // choosing normalization parameters as below.

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support an optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  const auto radius =
      2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;                            // Make the implementation's alpha_ become 1
  float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;                             // Don't offset the reduction.
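  // With these parameters, cross-map normalization computes
  //   out = x / (bias + alpha * sum(x^2))^beta = x / sqrt(sum_over_depth(x^2)),
  // which is exactly an L2 normalization along the depth axis (the final 'false' argument
  // below disables scaling alpha by the window size).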

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
      node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
      ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
                                         ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_reg);
}

void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
  std::vector<arm_compute::ITensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    size_t input_rank = _ctx.at(input_index).shape().rank();
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    assert(input_rank == input_tensor->num_dimensions());
    if (input_rank != input_tensor->info()->num_dimensions())
    {
      // This means that a high dimension's value is 1 and dim_correction was applied to the
      // input tensor.
      input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStackLayer>(inputs, axis, output);

  // acl_neon does not revert the disabled dim_correction because acl_neon's kernels would use
  // arm_compute::TensorInfo::offset_element_in_bytes(), which would cause an error when a
  // kernel accesses a high dimension whose value is 1.
  _return_fn = asAclFunction(std::move(fn));
}
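
// NOTE For Pad, the PAD input is a constant [rank, 2] tensor of (pad_before, pad_after) pairs;
//      each pair is stored into padding_list at the backend axis position, and the padded area
//      is filled with a pixel value carrying the input's data type and quantization info.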

void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

    const auto frontend_layout = _current_op_seq_layout;
    const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
    const auto axis =
        acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  const auto input_type = _ctx.at(input_index).typeInfo();
  UNUSED_RELEASE(input_type);
  assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
  assert(input->info()->quantization_info() ==
         ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
  const auto pixel_value =
      ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());

  auto fn =
      acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
  auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
      node, _ctx, _tensor_reg, _current_op_seq_layout,
      acl_common::convertPoolType(node.param().op_type));

  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  const auto activation = node.param().activation;
  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclFunction(std::move(raw_fn)),
      ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    // WHCN -> CWHN
    pv = arm_compute::PermutationVector{2, 0, 1};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    // CWHN -> WHCN
    pv = arm_compute::PermutationVector{1, 2, 0};

    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NECopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
      ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
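
// NOTE For Reduce, MEAN and SUM have dedicated ACL functions (NEReduceMean/NEReduceSum); every
//      other reduce type goes through the generic NEReduceOperation with a converted type.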

void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = input_tensor->layout();
  const auto reduce_axes =
      acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
  const auto reduce_type = node.param().reduce_type;
  const auto keep_dims = node.param().keep_dims;

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceMean>(input_tensor->handle(), reduce_axes,
                                                              keep_dims, output_tensor->handle());
  }
  else if (reduce_type == ir::operation::Reduce::ReduceType::SUM)
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceSum>(input_tensor->handle(), reduce_axes,
                                                             keep_dims, output_tensor->handle());
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
        input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
        acl_common::convertReduceType(reduce_type));
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  // NOTE This operation must not change the layout from frontend to backend, so
  //      PermutationOperationPass makes the layouts of frontend and backend the same.
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = output_tensor->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NEScale>(
      ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
      ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
      ::arm_compute::SamplingPolicy::TOP_LEFT);

  _return_fn = asAclFunction(std::move(fn));
}
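
// NOTE For RNN, the hidden state must be seeded from HIDDEN_STATE_IN before NERNNLayer runs,
//      so the copy into HIDDEN_STATE_OUT is sequenced ahead of the RNN function below.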

void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
      node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
      node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get();

  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
  auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get();
  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get();
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get();
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
      hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());

  auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
      weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
      hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
  // Run the state copy first, then the RNN. Assigning the two functions to _return_fn one after
  // another would silently drop the copy function.
  _return_fn = std::make_unique<exec::FunctionSequence>(asAclFunction(std::move(copy_layer)),
                                                        asAclFunction(std::move(fn)));
}

void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, the optional dims_index is ignored since the output tensor already has a
  // squeezed shape produced by the freezer and toco.
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  (void)dims;
  (void)ndim;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
  const auto beta = node.param().beta;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = input_tensor->layout();

  // Disable applied dim_correction
  const size_t input_rank = _ctx.at(input_index).shape().rank();
  if (input_rank != input_tensor->info()->num_dimensions())
  {
    // This means that a high dimension's value is 1 and dim_correction was applied to the
    // input tensor.
    const auto &input = _ctx.at(input_index);
    input_tensor->info()->set_tensor_shape(
        acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false));
  }

  auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
      output_tensor->handle(), beta);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get();

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
      ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
      ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Split &node)
{
  // TODO Support this op by SubTensor
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  std::vector<arm_compute::ITensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_tensor->layout();
  auto axis = node.param().axis;
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn =
      acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
      lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);

  auto beginData_base = _ctx.at(begins_index).data()->base();
  auto sizeData_base = _ctx.at(sizes_index).data()->base();
  const int beginData_size = _ctx.at(begins_index).shape().num_elements();
  const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(beginData_size);
  UNUSED_RELEASE(sizeData_size);

  assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
  assert(beginData_size == input_rank);
  assert(sizeData_size == input_rank);

  assert(beginData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                    .value();

    int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
    starts[axis] = begin_value;

    int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
    ends[axis] = begin_value + size_value;
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::NESlice>(
      inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);

  auto startData_base = _ctx.at(starts_index).data()->base();
  auto endData_base = _ctx.at(ends_index).data()->base();
  auto stridesData_base = _ctx.at(strides_index).data()->base();
  const int startData_size = _ctx.at(starts_index).shape().num_elements();
  const int endData_size = _ctx.at(ends_index).shape().num_elements();
  const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(startData_size);
  UNUSED_RELEASE(endData_size);
  UNUSED_RELEASE(stridesData_size);

  assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
  assert(startData_size == input_rank);
  assert(endData_size == input_rank);
  assert(stridesData_size == input_rank);

  assert(startData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                    .value();

    int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
    starts[axis] = start_value;

    int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
    ends[axis] = end_value;

    int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
    strides[axis] = strides_value;
  }

  // Set mask bits such as order of inputData
  // FIXME Take the layouts into account.
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
  const auto shrink_axis_mask =
      acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
      inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
      begin_mask, end_mask, shrink_axis_mask);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);
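
  // For VALID padding, the expected "valid" extent of the output per spatial axis is
  //   (ifm - 1) * stride + 1 + (ker - 1) = (ifm - 1) * stride + ker;
  // anything beyond that in the actual output shape is reported to ACL as an invalid
  // right/bottom border.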
  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
        ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
      ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
      invalid_horizontal, invalid_vertical);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto &perm{node.param().perm};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
  const auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_tensor->layout();

  const auto rank = _ctx.at(ifm_idx).shape().rank();
  std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
  auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
      rank, pv, frontend_layout, backend_layout);

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2)
  {
    fn = acl_common::generateLayer<arm_compute::NETranspose>(ifm_tensor->handle(),
                                                             ofm_tensor->handle());
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), backend_pv);
  }
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
  std::vector<arm_compute::ITensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
  for (const auto &output_index : output_indexes)
  {
    size_t output_rank = _ctx.at(output_index).shape().rank();
    const auto &output_tensor = _tensor_reg->getAclTensor(output_index);
    orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
    assert(output_rank == output_tensor->num_dimensions());
    if (output_rank != output_tensor->info()->num_dimensions())
    {
      // This means that a high dimension's value is 1 and dim_correction was applied to the
      // output tensor.
      output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::NEUnstack>(input, outputs, axis);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  auto fn = acl_common::generateLayer<arm_compute::NEReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get();
  auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get();
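
  // NOTE The cast below assumes the ir::operation::Comparison comparison-type enumerators are
  //      declared in the same order as arm_compute::ComparisonOperation; a mismatch would
  //      silently select the wrong comparison.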
  auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
      input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
      (arm_compute::ComparisonOperation)comparison_type);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::OneHot &node)
{
  const auto out_idx{node.getOutputs().at(0)};
  const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
  const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
  const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
  const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
  const auto axis = node.param().axis;

  auto output_tensor = _tensor_reg->getAclTensor(out_idx).get();
  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx).get();
  auto depth_tensor = _tensor_reg->getAclTensor(depth_idx).get();
  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx).get();
  auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx).get();

  auto fn = acl_common::generateLayer<arm_compute::CPPOneHotEx>(
      indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
      offvalue_tensor->handle(), output_tensor->handle(), axis);
  _return_fn = asAclFunction(std::move(fn));
}

} // namespace acl_neon
} // namespace backend
} // namespace onert