/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "KernelGenerator.h"

#include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions

#include <AclActivationBuilder.h>
#include <AclFunction.h>

#include "ir/DataType.h"
#include "ir/InternalType.h"
#include "exec/NopFunction.h"
#include "exec/FunctionSequence.h"
#include "util/logging.h"
#include "util/Utils.h"
#include "AclKernelGen.h"

namespace onert
{
namespace backend
{
namespace acl_cl
{

using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
    ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;

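// NOTE ActivationBuilder appends a CLActivationLayer when an operation carries a fused
//      activation (e.g. a RELU fused into FullyConnected). A rough sketch of the pattern used
//      by several visitors below:
//
//        auto main_fn = ...; // the layer itself
//        _return_fn = std::make_unique<exec::FunctionSequence>(
//            std::move(main_fn), ActivationBuilder::generate(activation, output->handle()));
//
//      For ir::Activation::NONE, generate() is expected to contribute a no-op.
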
KernelGenerator::KernelGenerator(
    const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
    const std::shared_ptr<TensorBuilder> &tensor_builder,
    const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
    : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
      _tensor_reg(tensor_reg), _current_op_seq_layout(ir::Layout::UNKNOWN)
{
  // DO NOTHING
}

void KernelGenerator::visit(const ir::OpSequence &op_seq)
{
  // TODO Move this to IKernelGenerator
  //      (all derivatives have the same implementation for this)
  assert(!_return_fn_seq);
  _return_fn_seq = std::make_unique<exec::FunctionSequence>();
  _return_fn_seq->enableDynamicShapeInferer(false);

  _current_op_seq_layout = op_seq.getLayout();
  for (const auto &operation_idx : op_seq.operations())
  {
    const auto &node = _operations_ctx.at(operation_idx);
    node.accept(*this);
    _return_fn_seq->append(releaseFunction());
  }
}

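// NOTE Every visit(ir::operation::*) below follows the same protocol: look up operand indexes
//      on the node, resolve the backing ICLTensor handles through _tensor_reg, build an
//      arm_compute::IFunction via acl_common::generateLayer<...>, and leave it in _return_fn
//      so the loop above can append it to the op sequence's FunctionSequence.
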
void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();

  assert(_ctx.at(block_size_index).data());

  auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
      ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();

  const auto act_info = acl_common::asActivationLayerInfo(activation);

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().arithmetic_type)
  {
    case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
    {
      fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
          arm_compute::ConvertPolicy::SATURATE, act_info);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
    {
      fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
          arm_compute::ConvertPolicy::SATURATE, act_info);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
    {
      fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
          arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
          act_info);
      break;
    }
    case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
    {
      fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
          lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
      break;
    }
    default:
      assert(false && "The BinaryArithmetic operation supports only binary arithmetic operations");
      break;
  }

  _return_fn = asAclFunction(std::move(fn));
}

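// NOTE For the arithmetic kernels above, arm_compute::ConvertPolicy::SATURATE clamps the
//      result to the output type's range instead of wrapping, e.g. for uint8:
//      200 + 100 -> 255 rather than 44. The fused activation is folded into the arithmetic
//      kernel itself through act_info, so no separate activation function is appended.
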
void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
      ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
      ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclFunction(std::move(fn));
}

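// A worked example of the padding computed above (assuming ir::calculatePadding implements the
// usual TF-style rules): for SAME padding with ifm width 5, stride 2 and kernel width 3, the
// ofm width is ceil(5 / 2) = 3, the window span is (3 - 1) * 2 + 3 = 7, so total padding is
// 7 - 5 = 2, split as left = 1, right = 1. For VALID padding all four pad values are 0.
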
void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
      ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
      conv_info, multiplier, act_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat eliminated, return a NOP IFunction
    VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  std::vector<::arm_compute::ICLTensor *> input_tensors;
  for (auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensors.at(0),
                                                        output_tensor->handle());
  }
  else
  {
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_op_seq_layout;
    const auto backend_layout = output_tensor->layout();
    const auto fixed_axis =
        acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
        input_tensors, output_tensor->handle(), fixed_axis);
  }

  _return_fn = asAclFunction(std::move(fn));
}

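// NOTE Concat elimination above works because the tensor builder may register every input as
//      a subtensor (a view) of the output; each producer then writes directly into the right
//      offset of the output buffer, and the concat itself degenerates to a NopFunction.
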
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  const auto activation = node.param().activation;

  auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
                                                ::arm_compute::CLFullyConnectedReshapingLayer>(
      node, _ctx, _tensor_builder, _tensor_reg, _current_op_seq_layout);
  _return_fn = std::make_unique<exec::FunctionSequence>(
      std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
  const auto keep_dims{node.param().keep_dims};
  const auto reduce_type = node.param().reduce_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = input_tensor->layout();

  std::unique_ptr<arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    const auto acl_axes =
        acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
    fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
                                                              keep_dims, output_tensor->handle());
  }
  else
  {
    const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);

    fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
        _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
        output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  // NOTE This operation must not change the layout from frontend to backend,
  //      so PermutationOperationPass makes the frontend and backend layouts the same.
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = output_tensor->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to reshape except that it has an optional dimensions input.
  // In addition, the optional dims_index is ignored since the output tensor already has a
  // squeezed shape produced by the freezer and toco.
  // TODO Support multi-layout for frontend and backend
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  (void)dims;
  (void)ndim;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());
  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};

  const auto beta = node.param().beta;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
      output_tensor->handle(), beta);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);

  assert(_ctx.at(begins_index).data());
  assert(_ctx.at(sizes_index).data());
  auto beginData_base = _ctx.at(begins_index).data()->base();
  auto sizeData_base = _ctx.at(sizes_index).data()->base();
  const int beginData_size = _ctx.at(begins_index).shape().num_elements();
  const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(beginData_size);
  UNUSED_RELEASE(sizeData_size);

  assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
  assert(beginData_size == input_rank);
  assert(sizeData_size == input_rank);

  assert(beginData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                    .value();

    int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
    starts[axis] = begin_value;

    int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
    ends[axis] = begin_value + size_value;
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
      inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);

  _return_fn = asAclFunction(std::move(fn));
}

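// A small example of the begin/size -> starts/ends conversion above: for a rank-1 input with
// begins = [1] and sizes = [3], starts becomes [1] and ends becomes [1 + 3] = [4], matching
// CLSlice's half-open start/end coordinate convention.
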
void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto inputData_tensor = _tensor_reg->getAclTensor(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_tensor->layout();

  // Set initializers for indices data such as order of inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);

  assert(_ctx.at(starts_index).data());
  assert(_ctx.at(ends_index).data());
  assert(_ctx.at(strides_index).data());
  auto startData_base = _ctx.at(starts_index).data()->base();
  auto endData_base = _ctx.at(ends_index).data()->base();
  auto stridesData_base = _ctx.at(strides_index).data()->base();
  const int startData_size = _ctx.at(starts_index).shape().num_elements();
  const int endData_size = _ctx.at(ends_index).shape().num_elements();
  const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

  using ir::DataType;

  UNUSED_RELEASE(startData_size);
  UNUSED_RELEASE(endData_size);
  UNUSED_RELEASE(stridesData_size);

  assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
  assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
  assert(startData_size == input_rank);
  assert(endData_size == input_rank);
  assert(stridesData_size == input_rank);

  assert(startData_base != nullptr);
  for (int n = 0; n < input_rank; ++n)
  {
    auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                               backend_layout)
                    .value();

    int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
    starts[axis] = start_value;

    int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
    ends[axis] = end_value;

    int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
    strides[axis] = strides_value;
  }

  // Set mask bits such as order of inputData
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
                                                           frontend_layout, backend_layout);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
                                                         frontend_layout, backend_layout);
  const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
      node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
      inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
      begin_mask, end_mask, shrink_axis_mask);

  _return_fn = asAclFunction(std::move(fn));
}

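// NOTE In visit(StridedSlice) above, ReorderBits permutes the mask bits the same way the
//      coordinates were permuted, so a begin_mask/end_mask/shrink_axis_mask bit keeps
//      referring to the same logical axis after the frontend-to-backend axis conversion,
//      e.g. a bit set for frontend axis 0 moves to that axis's position in the backend order.
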
void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto &perm{node.param().perm};

  const auto rank = _ctx.at(ifm_idx).shape().rank();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_tensor->layout();

  std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());

  auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
      rank, pv, frontend_layout, backend_layout);

  auto fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
                                                                ofm_tensor->handle(), backend_pv);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
      node.param().op_type, node.param().alpha, node.param().beta);

  auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), act_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
    {
      fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
          arm_compute::BinaryLogicalOperation::AND);
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
    {
      fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
    {
      fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
    {
      fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
          lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
      break;
    }
    default:
    {
      std::string err_msg("acl_cl KernelGenerator : " + node.name() +
                          " is not an elementwise-binary operation");
      assert(false && err_msg.c_str());
      break;
    }
  }

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  std::unique_ptr<arm_compute::IFunction> fn;
  switch (node.param().op_type)
  {
    case ir::operation::ElementwiseUnary::Type::ABS:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
          ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

      fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
          input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    case ir::operation::ElementwiseUnary::Type::CAST:
    {
      if (input_tensor->data_type() == output_tensor->data_type())
      {
        fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor->handle(),
                                                            output_tensor->handle());
      }
      else
      {
        // TODO Support converting float to int32 as round down
        fn = acl_common::generateLayer<arm_compute::CLCast>(
            input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
      }
      break;
    }
    case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
    {
      fn = acl_common::generateLayer<arm_compute::CLDequantizationLayer>(input_tensor->handle(),
                                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::EXP:
    {
      fn = acl_common::generateLayer<arm_compute::CLExpLayer>(input_tensor->handle(),
                                                              output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::FLOOR:
    {
      fn = acl_common::generateLayer<arm_compute::CLFloor>(input_tensor->handle(),
                                                           output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
    {
      fn = acl_common::generateLayer<arm_compute::CLBitwiseNot>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::NEG:
    {
      fn = acl_common::generateLayer<arm_compute::CLNeg>(input_tensor->handle(),
                                                         output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::RSQRT:
    {
      fn = acl_common::generateLayer<arm_compute::CLRsqrtLayer>(input_tensor->handle(),
                                                                output_tensor->handle());
      break;
    }
    case ir::operation::ElementwiseUnary::Type::SQRT:
    {
      const ::arm_compute::ActivationLayerInfo act_info{
          ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

      fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
          input_tensor->handle(), output_tensor->handle(), act_info);
      break;
    }
    default:
      throw std::runtime_error("acl_cl KernelGenerator : " + node.name() +
                               " is not supported yet");
  }

  auto acl_fn = asAclFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLReshapeLayer>(input_tensor->handle(),
                                                                   output_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto gamma_tensor = _tensor_reg->getAclTensor(gamma_index).get();
  auto beta_tensor = _tensor_reg->getAclTensor(beta_index).get();
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
      ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
      epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
                                         ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_reg);
}

void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input0_tensor = _tensor_reg->getAclTensor(input0_index).get();
  auto input1_tensor = _tensor_reg->getAclTensor(input1_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
      input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
      (arm_compute::ComparisonOperation)comparison_type);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();
  std::vector<arm_compute::ICLTensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_reg->getAclTensor(input_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(output_index).get()->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  std::vector<arm_compute::TensorShape> orig_inputs_acl_tensor_shapes;
  for (const auto &input_index : input_indexes)
  {
    size_t input_rank = _ctx.at(input_index).shape().rank();
    const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
    orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape());
    assert(input_rank == input_tensor->num_dimensions());
    if (input_rank != input_tensor->info()->num_dimensions())
    {
      // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
      input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::CLStackLayer>(inputs, axis, output);

  // Revert disabling applied dim_correction
  assert(inputs.size() == orig_inputs_acl_tensor_shapes.size());
  for (size_t i = 0; i < inputs.size(); ++i)
  {
    inputs.at(i)->info()->set_tensor_shape(orig_inputs_acl_tensor_shapes.at(i));
  }

  _return_fn = asAclFunction(std::move(fn));
}

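// NOTE "dim_correction" (see visit(Pack) above and several visitors below) refers to ACL's
//      TensorShape behavior of collapsing trailing dimensions of size 1, so e.g. a [2, 1]
//      tensor may be reported as 1-D. The temporary set_tensor_shape() calls restore the full
//      frontend rank so the stack axis is interpreted consistently, and the original shapes
//      are put back after the layer is configured.
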
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
  auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
      node, _ctx, _tensor_reg, _current_op_seq_layout,
      acl_common::convertPoolType(node.param().op_type));

  const auto ofm_index{node.getOutputs().at(0)};
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  const auto activation = node.param().activation;
  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclFunction(std::move(raw_fn)),
      ActivationBuilder::generate(activation, ofm_tensor->handle()));
}

void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_idx).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_idx).get();
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    // WHCN -> CWHN
    pv = arm_compute::PermutationVector{2, 0, 1};

    fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
                                                           ofm_tensor->handle(), pv);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    // CWHN -> WHCN
    pv = arm_compute::PermutationVector{1, 2, 0};

    fn = acl_common::generateLayer<::arm_compute::CLPermute>(ifm_tensor->handle(),
                                                             ofm_tensor->handle(), pv);
  }
  else
  {
    fn = acl_common::generateLayer<arm_compute::CLCopy>(ifm_tensor->handle(), ofm_tensor->handle());
  }

  _return_fn = asAclFunction(std::move(fn));
}

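// NOTE ACL orders dimensions innermost-first, so an NCHW tensor appears as WHCN and an NHWC
//      tensor as CWHN in ACL dimension order (a rough reading; the exact convention lives in
//      acl_common). NCHW -> NHWC is therefore WHCN -> CWHN, which the PermutationVector
//      {2, 0, 1} expresses, and {1, 2, 0} is its inverse. Any other permute type, or a rank
//      other than 4, falls back to a plain element copy via CLCopy.
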
void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLScale>(
      ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
      ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
      ::arm_compute::SamplingPolicy::TOP_LEFT);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ResizeNearestNeighbor::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLScale>(
      ifm_tensor->handle(), ofm_tensor->handle(),
      ::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
      ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
      node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
      node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto hidden_state_out_tensor = _tensor_reg->getAclTensor(hidden_state_out_index).get();

  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();
  auto weights_tensor = _tensor_reg->getAclTensor(weights_index).get();
  auto recurrent_weights_tensor = _tensor_reg->getAclTensor(recurrent_weights_index).get();
  auto bias_tensor = _tensor_reg->getAclTensor(bias_index).get();
  auto hidden_state_in_tensor = _tensor_reg->getAclTensor(hidden_state_in_index).get();
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
      hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
  _return_fn = asAclFunction(std::move(copy_layer));

  auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
      weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
      hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
  _return_fn = asAclFunction(std::move(fn));
}

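// NOTE In visit(RNN) above, _return_fn is assigned twice: the CLCopy of the hidden state is
//      replaced by the CLRNNLayer, so only the RNN function reaches releaseFunction().
//      Whether the explicit hidden-state copy is still needed (CLRNNLayer also updates the
//      hidden state tensor it is given) is not obvious from this file alone.
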
void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto block_size_tensor = _tensor_reg->getAclTensor(block_size_index).get();
  auto paddings_tensor = _tensor_reg->getAclTensor(paddings_index).get();

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
      ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
      ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
      values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0.
  // L2 normalization always performs the reduction along the depth axis.
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations by
  // choosing normalization parameters as below.

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  int32_t radius =
      2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;                            // In the implementation to make alpha_ become 1
  float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;                             // Don't offset the reduction.

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}

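// Why the parameters above emulate L2 normalization (a sketch, assuming ACL's cross-map
// formula out = in / (bias + alpha * sum(in^2))^beta when is_scaled is false): the window of
// normSize = 2 * depth + 1 always covers the whole depth axis, and with alpha = 1, beta = 0.5
// and bias = 0 the expression collapses to in / sqrt(sum(in^2)), i.e. the L2 norm along depth.
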
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto hits_tensor = _tensor_reg->getAclTensor(hits_index).get();

  auto lookups_tensor = _tensor_reg->getAclTensor(lookups_index).get();
  auto keys_tensor = _tensor_reg->getAclTensor(keys_index).get();
  auto values_tensor = _tensor_reg->getAclTensor(values_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
      lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
      output_tensor->handle(), hits_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
      ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);
  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
        ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto ker_tensor = _tensor_reg->getAclTensor(ker_index).get();

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
      ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
      invalid_vertical);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto lhs_tensor = _tensor_reg->getAclTensor(lhs_index).get();
  auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
      lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::TopKV2 &node)
{
  const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
  const auto outputIndices_index{
      node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};

  const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};

  // Currently, we only support vector input: rank 1, or rank 2 as a batch of vectors.
  assert(_ctx.at(inputData_index).shape().rank() == 1 ||
         _ctx.at(inputData_index).shape().rank() == 2);

  const auto k = node.param().k;

  auto values_tensor = _tensor_reg->getAclTensor(outputValues_index).get();
  auto indices_tensor = _tensor_reg->getAclTensor(outputIndices_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(inputData_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
      input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  auto indices_tensor = _tensor_reg->getAclTensor(indices_index).get();

  // NOTE The frontend layout and backend layout must be the same for this operation.
  //      If they differ, we would have to add a stage to permute the output tensor, which
  //      is not efficient even if it works. In that case it would be better to set the
  //      backend tensors to the same layout.
  //      There is also one thing we have to think about: this operation depends on the layout
  //      of the model. For example, if a model in NHWC has this operation with output rank == 4,
  //      indices rank == 2 and axis == 2, the operation should work on the W and C axes, but
  //      W and C are not sequential in NCHW. So a backend in NCHW cannot handle this case.
  const auto backend_layout = ofm_tensor->layout();
  UNUSED_RELEASE(backend_layout);
  assert(backend_layout == ifm_tensor->layout());
  assert(backend_layout == indices_tensor->layout());
  assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);

  // input is n-D, indices k-D, output is (n + k - 1)-D
  size_t n = ifm_rank;
  assert(n == ifm_tensor->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_tensor->num_dimensions());

  // Disable applied dim_correction
  const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape();
  if (n != ifm_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and ifm tensor is applied dim_correction
    const auto ifm = _ctx.at(ifm_index);
    ifm_tensor->info()->set_tensor_shape(
        acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
  }
  const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape();
  if (k != indices_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and indices tensor is applied dim_correction
    const auto indices = _ctx.at(indices_index);
    indices_tensor->info()->set_tensor_shape(
        acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
  }

  auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
      ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);

  // Revert disabling applied dim_correction
  ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
  indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape);

  _return_fn = asAclFunction(std::move(fn));
}

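// A concrete example of the axis conversion used above: when the frontend and backend layouts
// match, ToARMComputeAxis essentially reverses the index because ACL counts dimensions from
// the innermost one, e.g. rank 4 and frontend axis 1 map to ACL axis 4 - 1 - 1 = 2.
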
void KernelGenerator::visit(const ir::operation::ArgMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};

  auto ifm_shape = _ctx.at(ifm_index).shape();
  auto ofm_shape = _ctx.at(ofm_index).shape();

  assert((ifm_shape.rank() - 1) == ofm_shape.rank());

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  auto frontend_layout = _current_op_seq_layout;
  auto backend_layout = ifm_tensor->layout();

  int axis_value = node.param().axis;
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }

  auto acl_axis =
      acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();

  auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayer>(
      ifm_tensor->handle(), acl_axis, ofm_tensor->handle(),
      ::arm_compute::ReductionOperation::ARG_IDX_MAX);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
      node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
      ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);

  auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), norm_info);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_tensor = _tensor_reg->getAclTensor(output_index).get();
  auto input_tensor = _tensor_reg->getAclTensor(input_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
      input_tensor->handle(), output_tensor->handle(), block_size);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Split &node)
{
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();
  std::vector<arm_compute::ICLTensor *> output_tensors;
  for (const auto &ofm_ind : output_indexes)
    output_tensors.emplace_back(_tensor_reg->getAclTensor(ofm_ind).get()->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_tensor->layout();
  auto axis = node.param().axis;
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn =
      acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
  std::vector<arm_compute::ICLTensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_reg->getAclTensor(output_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
  for (const auto &output_index : output_indexes)
  {
    size_t output_rank = _ctx.at(output_index).shape().rank();
    const auto &output_tensor = _tensor_reg->getAclTensor(output_index);
    orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
    assert(output_rank == output_tensor->num_dimensions());
    if (output_rank != output_tensor->info()->num_dimensions())
    {
      // This means that high dimension's value is 1 and the output tensor is applied
      // dim_correction
      output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  auto fn = acl_common::generateLayer<arm_compute::CLUnstack>(input, outputs, axis);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input_type = _ctx.at(input_index).typeInfo();
  auto data_type = acl_common::asDataType(input_type.type());
  auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
  const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);

  auto input = _tensor_reg->getAclTensor(input_index).get()->handle();
  auto output = _tensor_reg->getAclTensor(output_index).get()->handle();

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_reg->getAclTensor(input_index).get()->layout();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);

    const auto axis =
        acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  // Disable applied dim_correction
  size_t input_rank = _ctx.at(input_index).shape().rank();
  const auto &input_tensor = _tensor_reg->getAclTensor(input_index);
  assert(input_rank == input_tensor->num_dimensions());
  if (input_rank != input_tensor->info()->num_dimensions())
  {
    // This means that high dimension's value is 1 and the input tensor is applied dim_correction
    input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
        _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
  }

  auto fn =
      acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);

  // Do not revert the disabled dim_correction: CLPadKernel only has a CL kernel for
  // 4 dimensions, and reverting the shape would produce a mismatched result.

  _return_fn = asAclFunction(std::move(fn));
}

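// A small example of the padding_list built above: with rank 2 and pad data [[1, 1], [2, 2]],
// from[0]/from[1] are the before/after pad counts per axis, so axis 0 receives
// PaddingInfo{1, 1} and axis 1 receives PaddingInfo{2, 2}, each stored at its
// layout-converted position.
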
void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);

  _return_fn = asAclFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index).get();
  auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index).get();

  auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
      ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);

  _return_fn = asAclFunction(std::move(fn));
}

} // namespace acl_cl
} // namespace backend
} // namespace onert