/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "KernelGenerator.h"

#include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions

#include <AclActivationBuilder.h>
#include <AclFunction.h>

#include "ir/DataType.h"
#include "ir/InternalType.h"
#include "exec/NopFunction.h"
#include "exec/FunctionSequence.h"
#include "util/logging.h"
#include "util/Utils.h"
#include "AclKernelGen.h"

namespace onert
{
namespace backend
{
namespace acl_cl
{
using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
  ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
KernelGenerator::KernelGenerator(
  const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
  const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
  : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()),
    _operations_ctx(graph.operations()), _current_layout{graph.layout()},
    _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
{
  // DO NOTHING
}
std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
  auto ret = std::make_unique<exec::FunctionSequence>();
  ret->enableDynamicShapeInferer(false);

  const auto &op = _graph.operations().at(ind);
  op.accept(*this);
  ret->append(releaseFunction());
  return ret;
}
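// NOTE A rough sketch of how a generated sequence is consumed, for illustration only
// (the exact executor wiring lives outside this file): each visit() below builds an
// arm_compute::IFunction, wraps it as an AclFunction, and stores it in _return_fn,
// which releaseFunction() hands back to the FunctionSequence returned above, e.g.
//
//   auto seq = kernel_gen->generate(op_index); // calls visit(op) via op.accept(*this)
//   seq->run();                                // runs the wrapped ACL function(s)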
void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  const auto NNApiInputs = 2;
  if (node.getInputs().size() != NNApiInputs)
  {
    const auto crops_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::CROPS_DATA)};
    if (!_ctx.at(crops_index).isConstant())
    {
      throw std::runtime_error("Non-constant crops NYI for acl_cl backend BatchToSpaceND");
    }

    auto crops = _ctx.at(crops_index).asVector<int32_t>();
    for (auto crop : crops)
    {
      if (crop != 0)
      {
        throw std::runtime_error("Non-zero crops NYI for acl_cl backend BatchToSpaceND");
      }
    }
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);

  assert(_ctx.at(block_size_index).data());

  auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
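// For illustration: a 4-input BatchToSpaceND with crops = {0, 0, 0, 0} is accepted and
// behaves like the 2-input form, while any non-zero entry (e.g. crops = {0, 0, 1, 0})
// throws, because the CLBatchToSpaceLayer call above takes no crop arguments.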
void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  const auto act_info = acl_common::asActivationLayerInfo(activation);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().arithmetic_type)
  {
    case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
    {
      fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE, act_info);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
    {
      fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
        arm_compute::ConvertPolicy::SATURATE, act_info);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
    {
      fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
        arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
        act_info);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
    {
      fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
        lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
      break;
    }
    default:
      assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
      break;
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding =
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
    ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
    ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclFunction(std::move(fn));
}
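// A worked example of the padding computation above, assuming ir::calculatePadding
// follows the usual SAME-padding rule (total = max((ofm - 1) * stride + ker - ifm, 0),
// split as left = total / 2, right = total - left):
//   ifm_w = 224, ker_w = 3, stride = 2, SAME  =>  ofm_w = 112,
//   total = (112 - 1) * 2 + 3 - 224 = 1      =>  pad_left = 0, pad_right = 1.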
void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto dilation = node.param().dilation;
  const auto padding =
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
                         dilation.width_factor, dilation.height_factor);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);
  const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);

  auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
    ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
    conv_info, multiplier, act_info, dilation_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat eliminated, return a NOP IFunction
    VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
  std::vector<const ::arm_compute::ICLTensor *> input_tensors;
  for (const auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    ::arm_compute::ICLTensor *input_tensor =
      _tensor_reg->getAclTensor(input_indexes.at(0))->handle();

    fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor, output_tensor->handle());
  }
  else
  {
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_layout;
    const auto backend_layout = output_tensor->layout();
    const auto fixed_axis =
      acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
      input_tensors, output_tensor->handle(), fixed_axis);
  }

  _return_fn = asAclFunction(std::move(fn));
}
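// For illustration: ACL indexes dimensions from the fastest-changing one, so when the
// frontend and backend layouts match, ToARMComputeAxis effectively maps a frontend axis
// to (rank - axis - 1). E.g. concatenating rank-4 NHWC tensors along the channel axis
// (axis = 3) yields fixed_axis = 0 for CLConcatenateLayer. (A sketch of the mapping;
// the authoritative conversion, including cross-layout cases, lives in acl_common.)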
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  const auto activation = node.param().activation;
  if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
    throw std::runtime_error(
      "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");

  auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
                                                ::arm_compute::CLFullyConnectedReshapingLayer>(
    node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
  _return_fn = std::make_unique<exec::FunctionSequence>(
    std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
  const auto keep_dims{node.param().keep_dims};
  const auto reduce_type = node.param().reduce_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_layout;
  const auto backend_layout = input_tensor->layout();

  std::unique_ptr<arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    const auto acl_axes =
      acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
    fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
                                                              keep_dims, output_tensor->handle());
  }
  else
  {
    const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);

    fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
      output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
  }

  _return_fn = asAclFunction(std::move(fn));
}
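// For illustration: asCoordinates/asSet normalize the frontend axes before handing them
// to ACL, e.g. for a rank-4 input, frontend axes {-1, 2} would first wrap to {3, 2} and
// then be remapped per layout, with duplicates collapsing into a single reduction axis.
// (A sketch of the intended behavior noted in the comment above; see acl_common for the
// exact conversion.)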
void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  // NOTE This operation must not change the layout from frontend to backend.
  // So, PermutationOperationPass makes the layouts of frontend and backend the same.
  const auto frontend_layout = _current_layout;
  const auto backend_layout = output_tensor->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, the optional dims_index is ignored since the output tensor already has a
  // squeezed shape, produced by the freezer and toco.
  // TODO Support multi-layout for frontend and backend
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  UNUSED_RELEASE(dims);
  UNUSED_RELEASE(ndim);

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());
  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};

  const auto beta = node.param().beta;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    output_tensor->handle(), beta);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);

  assert(_ctx.at(begins_index).data());
  assert(_ctx.at(sizes_index).data());
  auto beginData_base = _ctx.at(begins_index).data()->base();
  auto sizeData_base = _ctx.at(sizes_index).data()->base();
  const int beginData_size = _ctx.at(begins_index).shape().num_elements();
  const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(beginData_size);
  UNUSED_RELEASE(sizeData_size);

  assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
  assert(beginData_size == input_rank);
  assert(sizeData_size == input_rank);

  assert(beginData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                  .value();

    int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
    starts[axis] = begin_value;

    int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
    ends[axis] = begin_value + size_value;
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);

  _return_fn = asAclFunction(std::move(fn));
}
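// A worked example of the starts/ends setup above: with input_rank = 3,
// begins = {1, 0, 2} and sizes = {2, 3, 4}, the frontend view is
//   starts = {1, 0, 2}, ends = begins + sizes = {3, 3, 6},
// and each value is stored at the ACL axis returned by ToARMComputeAxis, so for
// matching layouts the coordinates end up reversed: starts_set = (2, 0, 1),
// ends_set = (6, 3, 3).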
void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index);
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);

  assert(_ctx.at(starts_index).data());
  assert(_ctx.at(ends_index).data());
  assert(_ctx.at(strides_index).data());
  auto startData_base = _ctx.at(starts_index).data()->base();
  auto endData_base = _ctx.at(ends_index).data()->base();
  auto stridesData_base = _ctx.at(strides_index).data()->base();
  const int startData_size = _ctx.at(starts_index).shape().num_elements();
  const int endData_size = _ctx.at(ends_index).shape().num_elements();
  const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(startData_size);
  UNUSED_RELEASE(endData_size);
  UNUSED_RELEASE(stridesData_size);

  assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
  assert(startData_size == input_rank);
  assert(endData_size == input_rank);
  assert(stridesData_size == input_rank);

  assert(startData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                  .value();

    int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
    starts[axis] = start_value;

    int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
    ends[axis] = end_value;

    int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
    strides[axis] = strides_value;
  }

  // Set mask bits such as order of inputData
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
                                                           frontend_layout, backend_layout);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
                                                         frontend_layout, backend_layout);
  const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
    node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  // Disable applied dim_correction
  if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and input tensor is applied dim_correction
    acl_common::disableDimCorrection(inputData_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
    inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
    begin_mask, end_mask, shrink_axis_mask);

  // Revert disabling applied dim_correction
  if (inputData_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(inputData_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
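// For illustration: ReorderBits applies the same axis remapping to the bitmasks that
// the loop above applies to the coordinate values. E.g. with input_rank = 4 and matching
// layouts, a frontend begin_mask of 0b0011 (axes 0 and 1 unbounded) becomes 0b1100,
// because frontend axis n lands on ACL axis (rank - n - 1). This keeps each mask bit
// aligned with the reordered starts/ends/strides entries.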
void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto perm_idx{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};

  const auto rank = _ctx.at(ifm_idx).shape().rank();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto frontend_layout = _current_layout;
  const auto backend_layout = ifm_tensor->layout();

  const auto &perms = _ctx.at(perm_idx);
  std::vector<int32_t> pv;
  if (perms.shape() == ir::Shape{0}) // perm is not given: reverse the dimensions
  {
    pv.resize(rank);
    std::iota(pv.begin(), pv.end(), 0);
    std::reverse(pv.begin(), pv.end());
  }
  else
  {
    pv = _ctx.at(perm_idx).asVector<int32_t>();
  }

  std::unique_ptr<arm_compute::IFunction> fn;
  if (rank == 1)
  {
    fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }
  else if (rank == 2)
  {
    assert(pv.size() == 2 && pv.at(0) == 1 && pv.at(1) == 0);
    fn = acl_common::generateLayer<arm_compute::CLTranspose>(ifm_tensor->handle(),
                                                             ofm_tensor->handle());
  }
  else
  {
    auto backend_pv =
      acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);

    fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), backend_pv);
  }

  _return_fn = asAclFunction(std::move(fn));
}
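// For illustration: an empty PERMUTATION operand means "reverse all dimensions", so for
// rank = 4 the code above builds pv = {3, 2, 1, 0}. Ranks 1 and 2 avoid CLPermute
// entirely: rank 1 is a plain copy and rank 2 is handled by CLTranspose.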
void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const ::arm_compute::ActivationLayerInfo act_info =
    acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);

  auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), act_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
    {
      fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
        arm_compute::BinaryLogicalOperation::AND);
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
    {
      fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
    {
      fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
    {
      fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
        lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    default:
    {
      std::string err_msg("acl_cl KernelGenerator : " + node.name() +
                          " is not an elementwise-binary operation");
      assert(false && err_msg.c_str());
      break;
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseUnary::Type::ABS:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

      fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    case ir::operation::ElementwiseUnary::Type::CAST:
    {
      if (input_tensor->data_type() == output_tensor->data_type())
      {
        fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
                                                            output_tensor->handle());
      }
      else if (_ctx.at(input_index).typeInfo().type() == ir::DataType::BOOL8)
      {
        fn = acl_common::generateLayer<arm_compute::CLCastBool>(input_tensor->handle(),
                                                                output_tensor->handle());
      }
      else
      {
        // TODO Support converting float to int32 as round down
        fn = acl_common::generateLayer<arm_compute::CLCast>(
          input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
      }
      break;
    }
    case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
    {
      fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
                                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::EXP:
    {
      fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::FLOOR:
    {
      fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
                                                           output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
    {
      fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::NEG:
    {
      fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::RSQRT:
    {
      fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::SQRT:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
        ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

      fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
        input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    default:
    {
      throw std::runtime_error("acl_cl KernelGenerator : " + node.name() + " is not supported yet");
    }
  }

  auto acl_fn = asAclFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index);
  auto beta_tensor = _tensor_reg->getAclTensor(beta_index);
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
    ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
    epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
                                         ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
}
void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input0_tensor = _tensor_reg->getAclTensor(input0_index);
  auto input1_tensor = _tensor_reg->getAclTensor(input1_index);

  auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
    input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
    (arm_compute::ComparisonOperation)comparison_type);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::OneHot &node)
{
  const auto output_idx{node.getOutputs().at(0)};
  const auto indices_idx{node.getInputs().at(ir::operation::OneHot::Input::INDICES)};
  const auto depth_idx{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
  const auto onvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
  const auto offvalue_idx{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};
  const auto depth = _ctx.at(depth_idx).asScalar<int32_t>();

  auto output_tensor = _tensor_reg->getAclTensor(output_idx);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_idx);
  auto onvalue_tensor = _tensor_reg->getAclTensor(onvalue_idx);

  const size_t output_rank = _ctx.at(output_idx).shape().rank();
  const auto frontend_layout = _current_layout;
  const auto backend_layout = output_tensor->layout();
  int32_t axis = node.param().axis == -1 ? output_rank - 1 : node.param().axis;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  if (output_tensor->num_dimensions() != output_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and output_tensor is applied dim_correction
    acl_common::disableDimCorrection(output_tensor);
  }

  std::unique_ptr<::arm_compute::IFunction> fn;
  const auto &offvalue = _ctx.at(offvalue_idx);
  if (offvalue.isConstant())
  {
    fn = acl_common::generateLayer<arm_compute::CLOneHot>(
      indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
      acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
  }
  else
  {
    auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
    fn = acl_common::generateLayer<arm_compute::CLOneHot>(
      indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
      output_tensor->handle(), static_cast<uint32_t>(depth), axis);
  }

  if (output_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(output_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_reg->getAclTensor(output_index)->handle();
  std::vector<arm_compute::ICLTensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(output_index)->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
    {
      // This means that high dimension's value is 1 and input tensor is applied dim_correction
      acl_common::disableDimCorrection(input_tensor);
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);

  // Revert disabling applied dim_correction
  for (const auto &input_index : input_indexes)
  {
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    if (input_tensor->dimension(0) == 1)
    {
      acl_common::enableDimCorrection(input_tensor);
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
  auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
    node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));

  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  const auto activation = node.param().activation;
  _return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(raw_fn)),
    ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx);
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    pv = arm_compute::PermutationVector{2, 0, 1};

    fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    pv = arm_compute::PermutationVector{1, 2, 0};

    fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
                                                             ofm_tensor->handle(), pv);
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }

  _return_fn = asAclFunction(std::move(fn));
}
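// A note on the PermutationVector values above (a sketch of ACL's convention, where
// dimensions are listed fastest-changing first and pv[i] names the source dimension
// that moves to destination dimension i): in that reversed view NHWC reads as C,W,H,N
// and NCHW as W,H,C,N, so {2, 0, 1} realizes NCHW -> NHWC and {1, 2, 0} the inverse.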
void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::CLScale>(
    ifm_tensor->handle(), ofm_tensor->handle(),
    ::arm_compute::ScaleKernelInfo{
      ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
      ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::CLScale>(
    ifm_tensor->handle(), ofm_tensor->handle(),
    ::arm_compute::ScaleKernelInfo{
      ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
      ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
    node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
    node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  auto weights_tensor = _tensor_reg->getAclTensor(weights_index);
  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index);
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index);
  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index);
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
    hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
  _return_fn = asAclFunction(std::move(copy_layer));

  auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
    weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
    hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
    node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index);
  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index);

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
    ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
    ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
    values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0
  // L2 Normalization always performs the reduction along the depth axis
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
  // choosing normalization parameters as below

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  int32_t radius =
    2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;                          // In the implementation to make alpha_ become 1
  float beta = 0.5f;                           // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;                           // Don't offset the reduction.

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}
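// The parameter choice above turns ACL's cross-map normalization, roughly
//   out = in / (bias + alpha' * sum_window(in^2))^beta,
// where alpha' is alpha/normSize when the is_scaled flag is true and alpha as-is here
// (is_scaled is passed as `false`), into an L2 normalization along depth: with bias = 0,
// alpha = 1, beta = 0.5 and a window (radius) covering the whole depth axis,
//   out = in / sqrt(sum_over_depth(in^2)).
// (A sketch derived from the comments above; the exact formula is defined by
// arm_compute::NormalizationLayerInfo.)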
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto hits_tensor = _tensor_reg->getAclTensor(hits_index);

  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index);
  auto keys_tensor = _tensor_reg->getAclTensor(keys_index);
  auto values_tensor = _tensor_reg->getAclTensor(values_index);

  auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
    lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
    output_tensor->handle(), hits_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);

  auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
    ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);
  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
      ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index);

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
    _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
    ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
    invalid_vertical);

  _return_fn = asAclFunction(std::move(fn));
}
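// A worked example of the VALID-padding bookkeeping above: with ifm_w = 5, stride = 2,
// ker_w = 3, a VALID transpose conv nominally produces ofm_w = (5 - 1) * 2 + 3 = 11;
// if the model instead declares ofm_w = 12, then
//   invalid_horizontal = 12 - (1 + (5 - 1) * 2) - (3 - 1) = 1,
// i.e. one right-edge column the layer may treat as invalid output. (A sketch; the exact
// interpretation of the invalid_* arguments belongs to CLTransposeConvLayer.)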
void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);

  auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
    lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::TopKV2 &node)
{
  const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
  const auto outputIndices_index{
    node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};

  const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};

  // Currently, we only support rank-1 (vector) and rank-2 input.
  assert(_ctx.at(inputData_index).shape().rank() == 1 ||
         _ctx.at(inputData_index).shape().rank() == 2);

  const auto k = node.param().k;

  auto values_tensor = _tensor_reg->getAclTensor(outputValues_index);
  auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index);
  auto input_tensor = _tensor_reg->getAclTensor(inputData_index);

  auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
    input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto indices_tensor = _tensor_reg->getAclTensor(indices_index);

  // NOTE The frontend layout and backend layout must be the same for this operation.
  // If not the same, we have to add a stage(?) to perform permutation of output tensor. It
  // is not efficient even if it works well. If so, it would be better to set the
  // layout of these backend tensors to the same layout.
  // There is also one thing we have to think about. This operation depends on the layout of
  // a model. For example, if a model in NHWC has this operation as output rank == 4, indices
  // rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
  // and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
  const auto backend_layout = ofm_tensor->layout();
  UNUSED_RELEASE(backend_layout);
  assert(backend_layout == ifm_tensor->layout());
  assert(backend_layout == indices_tensor->layout());
  assert(ifm_rank < 4 || _current_layout == backend_layout);

  // input is n-D, indices k-D, output is (n + k - 1)-D
  size_t n = ifm_rank;
  assert(n == ifm_tensor->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_tensor->num_dimensions());

  // Disable applied dim_correction
  if (n != ifm_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
    acl_common::disableDimCorrection(ifm_tensor);
  }
  if (k != indices_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and indices tensor is applied dim_correction
    acl_common::disableDimCorrection(indices_tensor);
  }

  auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
    ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);

  // Revert disabling applied dim_correction
  if (ifm_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(ifm_tensor);
  }
  if (indices_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(indices_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
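// A note on the disable/enableDimCorrection dance above (also used by StridedSlice,
// Pack, and others): when a tensor has trailing dimensions of size 1, ACL's TensorInfo
// may "correct" the reported number of dimensions downward. CLGatherEx needs the
// uncorrected ranks n and k to address axes consistently, so correction is switched
// off around configure() and restored afterwards where it is safe to do so.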
void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMinMax::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::Input::AXIS)};

  auto ifm_shape = _ctx.at(ifm_index).shape();
  auto ofm_shape = _ctx.at(ofm_index).shape();

  assert((ifm_shape.rank() - 1) == ofm_shape.rank());

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  auto frontend_layout = _current_layout;
  auto backend_layout = ifm_tensor->layout();

  int axis_value = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }

  auto acl_axis =
    acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
  auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
                                             : ::arm_compute::ReductionOperation::ARG_IDX_MIN;
  auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
    ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
    node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
    ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_tensor = _tensor_reg->getAclTensor(output_index);
  auto input_tensor = _tensor_reg->getAclTensor(input_index);

  auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
    input_tensor->handle(), output_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Split &node)
{
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::Split::Input::AXIS)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));
  if (!_ctx.at(axis_index).isConstant())
  {
    throw std::runtime_error("Non-constant axis_index NYI for acl_cl backend");
  }

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  std::vector<arm_compute::ICLTensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = ifm_tensor->layout();
  auto axis = _ctx.at(axis_index).asScalar<int32_t>();
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn =
    acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::SplitV &node)
{
  const auto ifm_index{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
  const auto size_split_index{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
  const auto split_dim_index{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));

  const size_t ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto size_split_tensor = _tensor_reg->getAclTensor(size_split_index);

  std::vector<arm_compute::ICLTensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind)->handle());

  auto fn = std::make_unique<arm_compute::CLSplitVEx>();
  const auto &split_dim_op = _ctx.at(split_dim_index);
  if (split_dim_op.isConstant())
  {
    int32_t split_dim = split_dim_op.asScalar<int32_t>();
    uint32_t split_dim_revised = (split_dim < 0) ? (split_dim + ifm_rank) : split_dim;
    const auto frontend_layout = _current_layout;
    const auto backend_layout = ifm_tensor->layout();

    if (ifm_tensor->num_dimensions() != ifm_tensor->info()->num_dimensions())
    {
      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
      acl_common::disableDimCorrection(ifm_tensor);
    }

    split_dim_revised =
      acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
        .value();
    fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
                  output_tensors, node.param().num_splits);

    if (ifm_tensor->dimension(0) == 1)
    {
      acl_common::enableDimCorrection(ifm_tensor);
    }
  }
  else
  {
    throw std::runtime_error("Non-constant split_dim NYI for acl_cl backend");
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input_tensor = _tensor_reg->getAclTensor(input_index);
  std::vector<arm_compute::ICLTensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

  const auto frontend_layout = _current_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and input tensor is applied dim_correction
    acl_common::disableDimCorrection(input_tensor);
  }

  auto fn =
    acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);

  // Revert disabling applied dim_correction
  if (input_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(input_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input_type = _ctx.at(input_index).typeInfo();
  auto data_type = acl_common::asDataType(input_type.type());
  auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point());
  const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);

  auto input = _tensor_reg->getAclTensor(input_index)->handle();
  auto output = _tensor_reg->getAclTensor(output_index)->handle();

  const auto frontend_layout = _current_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

    const auto axis =
      acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  // Disable applied dim_correction
  const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
  if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and input tensor is applied dim_correction
    acl_common::disableDimCorrection(input_tensor);
  }

  auto fn =
    acl_common::generateLayer<arm_compute::CLPadLayerEx>(input, output, padding_list, pixel_value);

  // NOTE Do not revert disabling applied dim_correction for 4D.
  // It would produce a mismatch of result by incorrect offset_first_element in
  // ICLKernel::add_tensor_argument<3>().
  // We have to disable applied dim_correction and not revert enabling for the kernel that slices
  // 4D to 3D because slicing arm_compute::Window can cause incorrect offset_first_element if the
  // used tensor is 4D and the tensor's high dimension is 1.
  if (input_tensor->num_dimensions() < 4 && input_tensor->dimension(0) == 1)
  {
    acl_common::enableDimCorrection(input_tensor);
  }

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);

  auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
    ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);

  _return_fn = asAclFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Reverse &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Reverse::Input::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::Reverse::Input::AXIS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
  auto axis_tensor = _tensor_reg->getAclTensor(axis_index);

  // WORKAROUND: acl-cl backend only allows U32 type for axis
  // ConstantInitializer will resolve S32 type to U32 type
  if (_ctx.at(axis_index).isConstant() &&
      (axis_tensor->handle()->info()->data_type() == arm_compute::DataType::S32))
  {
    axis_tensor->handle()->info()->set_data_type(arm_compute::DataType::U32);
  }

  auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
    ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}
} // namespace acl_cl
} // namespace backend
} // namespace onert