/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "KernelGenerator.h"

#include <arm_compute/runtime/CL/CLFunctions.h>   // Include all ARM Compute CL functions
#include <arm_compute/runtime/CL/CLFunctionsEx.h> // Include all ARM Compute EX CL functions

#include <AclActivationBuilder.h>
#include <AclFunction.h>
#include <Convert.h> // acl_common::asPadStrideInfo, asActivationLayerInfo, asTensorShape, ...
#include <Swizzle.h> // acl_common::ToARMComputeAxis, ReorderBits, getARMComputePermutationVector

#include "ir/DataType.h"
#include "ir/InternalType.h"
#include "exec/NopFunction.h"
#include "exec/FunctionSequence.h"
#include "util/logging.h"
#include "util/Utils.h"

namespace onert
{
namespace backend
{
namespace acl_cl
{

using ::onert::backend::acl_common::asAclClFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
    ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclClFunction>;

KernelGenerator::KernelGenerator(const ir::Operands &operands_ctx,
                                 const ir::Operations &operations_ctx,
                                 const std::shared_ptr<TensorBuilder> &tensor_builder)
    : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
      _current_op_seq_layout(ir::Layout::UNKNOWN)
{
  // DO NOTHING
}

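// Each visit() below lowers a single IR operation to one or more arm_compute CL functions.
// visit(ir::OpSequence) drives the traversal: it dispatches to the per-operation overloads and
// collects the generated functions, in execution order, into _return_fn_seq.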
void KernelGenerator::visit(const ir::OpSequence &op_seq)
{
  // TODO Move this to IKernelGenerator
  //      (all derivatives have the same implementation for this)
  assert(!_return_fn_seq);
  _return_fn_seq = std::make_unique<exec::FunctionSequence>();
  _return_fn_seq->enableDynamicShapeInferer(false);

  _current_op_seq_layout = op_seq.getLayout();
  for (const auto &operation_idx : op_seq.operations())
  {
    const auto &node = _operations_ctx.at(operation_idx);
    node.accept(*this);
    _return_fn_seq->append(releaseFunction());
  }
}

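// Note that CLBatchToSpaceLayer takes the block size as a tensor handle; the assert below
// guarantees the operand is a compile-time constant, so the handle has data to read.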
void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto block_size_alloc = _tensor_builder->at(block_size_index).get();

  assert(_ctx.at(block_size_index).data());

  auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>();

  fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

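// A BOOL8 input is tagged with SubDataType::BOOL so CLCast can treat it as boolean rather than
// plain 8-bit data; all other input types pass SubDataType::NONE.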
void KernelGenerator::visit(const ir::operation::Cast &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
                                  ? arm_compute::SubDataType::BOOL
                                  : arm_compute::SubDataType::NONE;

  auto fn = std::make_unique<::arm_compute::CLCast>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

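// Explicit padding is computed from the frontend shapes and padding parameters; the fused
// activation is passed to CLConvolutionLayer directly via act_info, so no separate activation
// function needs to be appended here.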
void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto ker_alloc = _tensor_builder->at(ker_index).get();
  auto bias_alloc = _tensor_builder->at(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
                conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);

  _return_fn = asAclClFunction(std::move(fn));
}

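// Same lowering as Conv2D, plus the depth multiplier: each input channel produces `multiplier`
// output channels in CLDepthwiseConvolutionLayer.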
void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto stride = node.param().stride;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto ker_alloc = _tensor_builder->at(ker_index).get();
  auto bias_alloc = _tensor_builder->at(bias_index).get();

  const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
  const auto act_info = acl_common::asActivationLayerInfo(activation);

  auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();

  fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
                ofm_alloc->handle(), conv_info, multiplier, act_info);

  _return_fn = asAclClFunction(std::move(fn));
}

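// CLPoolingLayer takes no activation info, so the pooling visitors append the fused activation
// as a separate CL activation function via ActivationBuilder (see the FunctionSequence at the
// end of each).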
void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  const auto padding =
      ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
                                       ::arm_compute::Size2D{kw, kh},
                                       acl_common::asPadStrideInfo(padding, stride)};

  auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  const auto padding =
      ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
  VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
  VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
  VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
  VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
  VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
  VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
  VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
  VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{
      ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
      acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};

  auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

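// Concat elimination: when every input is already a sub-tensor of the output (arranged by the
// tensor builder), there is nothing left to execute and a NopFunction is returned instead.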
void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  std::vector<ir::OperandIndex> input_indexes;

  for (const auto &input : node.getInputs())
    input_indexes.emplace_back(input);

  const auto axis = node.param().axis;

  // Concat elimination check
  bool eliminated = _tensor_builder->areSubTensorsOf(ofm_index, node.getInputs());
  if (eliminated)
  {
    // If concat is eliminated, return a NOP IFunction
    VERBOSE(acl_cl_KernelGenerator_Concat) << "Concat eliminated" << std::endl;
    _return_fn = std::make_unique<exec::NopFunction>();
    return;
  }

  auto output_alloc = _tensor_builder->at(ofm_index).get();
  std::vector<::arm_compute::ICLTensor *> input_tensors;
  for (auto &ifm_ind : input_indexes)
    input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());

  std::unique_ptr<::arm_compute::IFunction> fn;
  if (input_indexes.size() < 2)
  {
    auto l = std::make_unique<::arm_compute::CLCopy>();
    l->configure(input_tensors.at(0), output_alloc->handle());
    fn = std::move(l);
  }
  else
  {
    auto l = std::make_unique<::arm_compute::CLConcatenateLayer>();
    const auto rank = _ctx.at(ofm_index).shape().rank();
    const auto frontend_layout = _current_op_seq_layout;
    const auto backend_layout = output_alloc->layout();
    const auto fixed_axis =
        acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
    l->configure(input_tensors, output_alloc->handle(), fixed_axis);
    fn = std::move(l);
  }

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

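// For rank-3/4 inputs the tensor is flattened to rank-2 [batch_size, input_size] before the
// GEMM; e.g. a [1, 4, 4, 2] input against a weight whose last dimension is 32 collapses to
// [1, 32], since feature_size (1*4*4*2) must equal batch_size * input_size.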
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  using ir::operation::FullyConnected;

  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
  const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
  const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  const auto output_size =
      _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
  UNUSED_RELEASE(output_size);
  assert(_ctx.at(bias_index).shape().dim(0) == output_size);
  assert(_ctx.at(weight_index).shape().dim(0) == output_size);
  const auto batch_size =
      _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
  const auto input_size =
      _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);

  // Check whether the input's shape needs reshaping into rank-2
  bool needs_reshape = false;
  ir::Shape reshape(2);
  if (input_rank == 3 || input_rank == 4)
  {
    const auto &ifm_shape = _ctx.at(input_index).shape();
    auto feature_size = 1;
    for (int i = 0; i < ifm_shape.rank(); ++i)
    {
      feature_size *= ifm_shape.dim(i);
    }

    UNUSED_RELEASE(feature_size);
    assert(feature_size == batch_size * input_size);

    // for reshaping
    needs_reshape = true;
    reshape.dim(0) = batch_size; /* H */
    reshape.dim(1) = input_size; /* W */
  }

  const auto activation = node.param().activation;

  auto output_alloc = _tensor_builder->at(output_index).get();
  const auto input_alloc = _tensor_builder->at(input_index).get();
  const auto weight_alloc = _tensor_builder->at(weight_index).get();
  const auto bias_alloc = _tensor_builder->at(bias_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto acl_layout = output_alloc->handle()->info()->data_layout();

  auto fn = std::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type =
      arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL;
  if (_ctx.at(weight_index).isConstant())
  {
    kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
    assert(_ctx.at(weight_index).data());
  }
  fn->configure(
      input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
      needs_reshape,
      ::onert::backend::acl_common::asTensorShape(
          reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
      kernel_type);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)),
      ActivationBuilder::generate(activation, output_alloc->handle()));
}

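// Mul maps to CLPixelWiseMultiplication with scale 1.0, saturating overflow handling, and
// round-to-nearest-even; the fused activation is appended afterwards as usual.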
void KernelGenerator::visit(const ir::operation::Mul &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Mul::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Mul::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
                arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

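// MEAN gets a dedicated CLReduceMean; all other reduce types share CLReduceOperation, which is
// constructed with the backend's internal buffer manager.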
void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
  const auto keep_dims{node.param().keep_dims};
  const auto reduce_type = node.param().reduce_type;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  // Convert to ACL axes taking into account negative values and possible duplicates.
  const auto &axes = _ctx.at(axes_index);
  const auto input_rank = _ctx.at(input_index).shape().rank();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = input_alloc->layout();

  std::unique_ptr<arm_compute::IFunction> fn;
  if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    auto l = std::make_unique<::arm_compute::CLReduceMean>();

    const auto acl_axes =
        acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
    l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle());

    fn = std::move(l);
  }
  else
  {
    auto l = std::make_unique<::arm_compute::CLReduceOperation>(
        _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

    const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
    l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims,
                 acl_common::convertReduceType(reduce_type));

    fn = std::move(l);
  }

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  // NOTE This operation must not change the layout from frontend to backend,
  //      so PermutationOperationPass makes the frontend and backend layouts the same.
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = output_alloc->layout();
  assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
         frontend_layout == backend_layout);
  UNUSED_RELEASE(frontend_layout);
  UNUSED_RELEASE(backend_layout);

  auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  // Squeeze is identical to Reshape except for the optional dimensions input.
  // The optional dims are ignored here because the output tensor already carries the squeezed
  // shape produced by the converter (e.g. the TensorFlow freezer and toco).
  // TODO Support multi-layout for frontend and backend
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
  const auto dims{node.param().dims};
  const auto ndim{node.param().ndim};
  UNUSED_RELEASE(dims);
  UNUSED_RELEASE(ndim);

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();
  auto fn = std::make_unique<arm_compute::CLReshapeLayer>();
  fn->configure(input_alloc->handle(), output_alloc->handle());
  auto acl_fn = asAclClFunction(std::move(fn));
  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Tanh &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<arm_compute::CLActivationLayer>();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};

  const auto beta = node.param().beta;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(input_alloc->handle(), output_alloc->handle(), beta);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

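// The IR Slice operands are (begin, size) per axis, while CLSlice expects (start, end)
// coordinates, hence ends[axis] = begin + size below; axes are remapped to the backend layout
// through ToARMComputeAxis.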
void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto outputData_alloc = _tensor_builder->at(output_index).get();
  auto inputData_alloc = _tensor_builder->at(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_alloc->layout();

  // Initialize the per-axis index data in the backend's axis order for inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  {
    assert(_ctx.at(begins_index).data());
    assert(_ctx.at(sizes_index).data());
    auto beginData_base = _ctx.at(begins_index).data()->base();
    auto sizeData_base = _ctx.at(sizes_index).data()->base();
    const int beginData_size = _ctx.at(begins_index).shape().num_elements();
    const int sizeData_size = _ctx.at(sizes_index).shape().num_elements();

    using ir::DataType;

    UNUSED_RELEASE(beginData_size);
    UNUSED_RELEASE(sizeData_size);

    assert(_ctx.at(begins_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(sizes_index).typeInfo().type() == DataType::INT32);
    assert(beginData_size == input_rank);
    assert(sizeData_size == input_rank);

    assert(beginData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                      .value();

      int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
      starts[axis] = begin_value;

      int32_t size_value = *(reinterpret_cast<const int32_t *>(sizeData_base) + n);
      ends[axis] = begin_value + size_value;
    }
  }

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
  }

  auto fn = std::make_unique<::arm_compute::CLSlice>();

  fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

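// STARTS/ENDS/STRIDES must be constant operands (asserted below). The begin/end/shrink-axis
// mask bits are reordered with ReorderBits so they follow the backend axis order, matching the
// remapped coordinates.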
void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto outputData_alloc = _tensor_builder->at(output_index).get();
  auto inputData_alloc = _tensor_builder->at(input_index).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = inputData_alloc->layout();

  // Initialize the per-axis index data in the backend's axis order for inputData
  int input_rank = _ctx.at(input_index).shape().rank();
  std::vector<int32_t> starts;
  std::vector<int32_t> ends;
  std::vector<int32_t> strides;
  starts.resize(input_rank, 0);
  ends.resize(input_rank, 0);
  strides.resize(input_rank, 0);
  {
    assert(_ctx.at(starts_index).data());
    assert(_ctx.at(ends_index).data());
    assert(_ctx.at(strides_index).data());
    auto startData_base = _ctx.at(starts_index).data()->base();
    auto endData_base = _ctx.at(ends_index).data()->base();
    auto stridesData_base = _ctx.at(strides_index).data()->base();
    const int startData_size = _ctx.at(starts_index).shape().num_elements();
    const int endData_size = _ctx.at(ends_index).shape().num_elements();
    const int stridesData_size = _ctx.at(strides_index).shape().num_elements();

    using ir::DataType;

    UNUSED_RELEASE(startData_size);
    UNUSED_RELEASE(endData_size);
    UNUSED_RELEASE(stridesData_size);

    assert(_ctx.at(starts_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(ends_index).typeInfo().type() == DataType::INT32);
    assert(_ctx.at(strides_index).typeInfo().type() == DataType::INT32);
    assert(startData_size == input_rank);
    assert(endData_size == input_rank);
    assert(stridesData_size == input_rank);

    assert(startData_base != nullptr);
    for (int n = 0; n < input_rank; ++n)
    {
      auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
                                                                 backend_layout)
                      .value();

      int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
      starts[axis] = start_value;

      int32_t end_value = *(reinterpret_cast<const int32_t *>(endData_base) + n);
      ends[axis] = end_value;

      int32_t strides_value = *(reinterpret_cast<const int32_t *>(stridesData_base) + n);
      strides[axis] = strides_value;
    }
  }

  // Reorder the mask bits to match the backend axis order of inputData
  const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank,
                                                           frontend_layout, backend_layout);
  const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
                                                         frontend_layout, backend_layout);
  const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
      node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);

  ::arm_compute::Coordinates starts_set;
  ::arm_compute::Coordinates ends_set;
  ::arm_compute::BiStrides strides_set;

  for (size_t i = 0; i < starts.size(); ++i)
  {
    starts_set.set(i, starts[i]);
    ends_set.set(i, ends[i]);
    strides_set.set(i, strides[i]);
  }

  auto fn = std::make_unique<::arm_compute::CLStridedSlice>();

  fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
                strides_set, begin_mask, end_mask, shrink_axis_mask);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

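// The frontend permutation is rewritten for the backend layout by
// getARMComputePermutationVector before being given to CLPermute.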
void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto &perm{node.param().perm};

  const auto rank = _ctx.at(ifm_idx).shape().rank();

  auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
  auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_alloc->layout();

  std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());

  auto backend_pv = ::onert::backend::acl_common::getARMComputePermutationVector(
      rank, pv, frontend_layout, backend_layout);

  auto fn = std::make_unique<::arm_compute::CLPermute>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Add &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Add::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Add::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
                arm_compute::ConvertPolicy::SATURATE);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Sub &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Sub::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Sub::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
                arm_compute::ConvertPolicy::SATURATE);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Div &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Div::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Div::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Exp &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLExpLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::InstanceNorm::Input::INPUT)};
  const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
  const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto gamma_alloc = _tensor_builder->at(gamma_index).get();
  auto beta_alloc = _tensor_builder->at(beta_index).get();
  auto epsilon = node.param().epsilon;
  auto activation = node.param().activation;

  auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
                beta_alloc->handle(), epsilon);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::Logistic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();

  auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
                ::arm_compute::BinaryLogicalOperation::AND);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

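// CIFG, peephole, and projection are all detected structurally: the optional weight operands
// exist but have zero-sized shapes when the variant is disabled. The flags computed below
// decide which optional LSTMParams get populated before configuring CLLSTMLayer.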
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  // TODO Support dynamic rnn
  // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
  const auto scratch_buffer_index{
      node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
  const auto output_state_out_index{
      node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
  const auto cell_state_out_index{
      node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
  const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};

  const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
  const auto input_to_input_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
  const auto input_to_forget_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
  const auto input_to_cell_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
  const auto input_to_output_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
  const auto recurrent_to_input_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
  const auto recurrent_to_forget_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
  const auto recurrent_to_cell_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
  const auto recurrent_to_output_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
  const auto cell_to_input_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
  const auto cell_to_forget_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
  const auto cell_to_output_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
  const auto input_gate_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
  const auto forget_gate_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
  const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
  const auto output_gate_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
  const auto projection_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
  const auto projection_bias_index{
      node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
  const auto output_state_in_index{
      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
  const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
  const auto cell_threshold = node.param().cell_threshold;
  const auto projection_threshold = node.param().projection_threshold;

  bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
                                    _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
  bool has_recurrent_to_input_weights =
      _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
      _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
  bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
  bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
  bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
                                _ctx.at(projection_weights_index).shape().dim(1) != 0;
  bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0) != 0;

  // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
  // true: no CIFG
  // false: CIFG
  // NOTE cell_to_input_weights is absent without peephole connections, even in a regular
  //      (non-CIFG) LSTM.
  bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;

  // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole,
  //      but cell_to_input_weights is absent in CIFG LSTMs even with peephole connections.
  // true: peephole
  // false: no peephole
  bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;

  // NOTE Even when the projection weights have data, the projection bias may not.
  bool has_projection_param = has_projection_weights;

  const auto activation = node.param().activation;
  const auto cell_clip = cell_threshold;
  const auto projection_clip = projection_threshold;
  assert(cell_clip >= 0.f && projection_clip >= 0.f);

  auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
  auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
  auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
  auto output_alloc = _tensor_builder->at(output_index).get();

  auto input_alloc = _tensor_builder->at(input_index).get();

  auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
  auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
  auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
  auto recurrent_to_forget_weights_alloc =
      _tensor_builder->at(recurrent_to_forget_weights_index).get();
  auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
  auto recurrent_to_output_weights_alloc =
      _tensor_builder->at(recurrent_to_output_weights_index).get();

  auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
  auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
  auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
  auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
  auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();

  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto fn = std::make_unique<::arm_compute::CLLSTMLayer>();

  ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
  if (has_cifg_param)
  {
    auto input_to_input_weights_alloc =
        _tensor_builder->at(input_to_input_weights_index).get(); // optional
    auto recurrent_to_input_weights_alloc =
        _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
    auto cell_to_input_weights_handle =
        has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
                           : nullptr; // optional (non-cifg && peephole)
    auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
    lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
                                recurrent_to_input_weights_alloc->handle(),
                                cell_to_input_weights_handle, input_gate_bias_alloc->handle());
  }
  if (has_peephole_param)
  {
    auto cell_to_forget_weights_alloc =
        _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
    auto cell_to_output_weights_alloc =
        _tensor_builder->at(cell_to_output_weights_index).get(); // optional
    lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
                                    cell_to_output_weights_alloc->handle());
  }
  if (has_projection_param)
  {
    auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
    auto projection_bias_handle = has_projection_bias
                                      ? _tensor_builder->at(projection_bias_index).get()->handle()
                                      : nullptr; // optional
    lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
  }

  fn->configure(
      input_alloc->handle(), input_to_forget_weights_alloc->handle(),
      input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
      recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
      recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
      cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
      cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
      output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
      lstm_params, act_info, cell_clip, projection_clip);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  const auto comparison_type = node.param().comparison_type;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();

  auto fn = std::make_unique<::arm_compute::CLComparison>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
                static_cast<arm_compute::ComparisonOperation>(comparison_type));

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

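// A negative pack axis counts from the end, so it is normalized by adding the output rank
// before the frontend->backend axis conversion.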
void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto output_index{node.getOutputs().at(0)};
  auto axis{node.param().axis};

  const auto output_rank = _ctx.at(output_index).shape().rank();

  std::vector<ir::OperandIndex> input_indexes;
  for (const auto &input_index : node.getInputs())
    input_indexes.emplace_back(input_index);

  auto output = _tensor_builder->at(output_index).get()->handle();
  std::vector<arm_compute::ICLTensor *> inputs;
  for (const auto &input_index : input_indexes)
    inputs.emplace_back(_tensor_builder->at(input_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_builder->at(output_index).get()->layout();

  if (axis < 0)
    axis += output_rank;
  axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();

  auto fn = std::make_unique<::arm_compute::CLStackLayer>();

  // Disable applied dim_correction
  std::vector<arm_compute::TensorShape> orig_inputs_acl_tensor_shapes;
  for (const auto &input_index : input_indexes)
  {
    size_t input_rank = _ctx.at(input_index).shape().rank();
    const auto &input_alloc = _tensor_builder->at(input_index);
    orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape());
    assert(input_rank == input_alloc->num_dimensions());
    if (input_rank != input_alloc->info()->num_dimensions())
    {
      // The leading (highest) dimensions are 1, so ACL applied dim_correction to this input
      input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  fn->configure(inputs, axis, output);

  // Revert disabling applied dim_correction
  assert(inputs.size() == orig_inputs_acl_tensor_shapes.size());
  for (size_t i = 0; i < inputs.size(); ++i)
  {
    inputs.at(i)->info()->set_tensor_shape(orig_inputs_acl_tensor_shapes.at(i));
  }

  _return_fn = asAclClFunction(std::move(fn));
}

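// For rank-4 layout conversions, CLPermute is configured with a fixed permutation vector
// (axis indices in ACL's backend order); any other permute request degenerates to a plain copy.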
void KernelGenerator::visit(const ir::operation::Permute &node)
{
  const auto ofm_idx{node.getOutputs().at(0)};
  const auto ifm_idx{node.getInputs().at(0)};
  const auto permute_type = node.getPermuteType();
  auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
  auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
  const auto rank = _ctx.at(ofm_idx).shape().rank();
  assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());

  std::unique_ptr<::arm_compute::IFunction> fn;
  arm_compute::PermutationVector pv;
  if (permute_type == ir::operation::Permute::Type::NCHW_TO_NHWC && rank == 4)
  {
    // WHCN -> CWHN
    pv = arm_compute::PermutationVector{2, 0, 1};

    auto l = std::make_unique<::arm_compute::CLPermute>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);

    fn = std::move(l);
  }
  else if (permute_type == ir::operation::Permute::Type::NHWC_TO_NCHW && rank == 4)
  {
    // CWHN -> WHCN
    pv = arm_compute::PermutationVector{1, 2, 0};

    auto l = std::make_unique<::arm_compute::CLPermute>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);

    fn = std::move(l);
  }
  else
  {
    auto l = std::make_unique<::arm_compute::CLCopy>();

    l->configure(ifm_alloc->handle(), ofm_alloc->handle());

    fn = std::move(l);
  }

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::RSQRT &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle());

  _return_fn = asAclClFunction(std::move(fn));
}

void KernelGenerator::visit(const ir::operation::ReLU &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<arm_compute::CLActivationLayer>();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLScale>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
                ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
                ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ReLU1 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::ReLU6 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

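// The previous hidden state is first copied into HIDDEN_STATE_OUT with CLCopy, because
// CLRNNLayerEx reads and writes the recurrent state through that output tensor; both functions
// are sequenced so the copy always runs first.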
void KernelGenerator::visit(const ir::operation::RNN &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
  const auto hidden_state_out_index{
      node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};

  const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
  const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
  const auto recurrent_weights_index{
      node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
  const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
  const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};

  const auto activation = node.param().activation;

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();

  auto input_alloc = _tensor_builder->at(input_index).get();
  auto weights_alloc = _tensor_builder->at(weights_index).get();
  auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
  auto bias_alloc = _tensor_builder->at(bias_index).get();
  auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
  auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);

  auto copy_layer = std::make_unique<::arm_compute::CLCopy>();
  copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());

  auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
  fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
                bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
                act_info);

  // Sequence both functions; assigning each to _return_fn in turn would drop the state copy.
  _return_fn = std::make_unique<exec::FunctionSequence>(asAclClFunction(std::move(copy_layer)),
                                                        asAclClFunction(std::move(fn)));
}

void KernelGenerator::visit(const ir::operation::Floor &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLFloor>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
  const auto block_size_index{
      node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
  const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto block_size_alloc = _tensor_builder->at(block_size_index).get();
  auto paddings_alloc = _tensor_builder->at(paddings_index).get();

  assert(_ctx.at(block_size_index).data());
  assert(_ctx.at(paddings_index).data());

  std::unique_ptr<::arm_compute::IFunction> fn;

  auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>();
  l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
               ofm_alloc->handle());
  fn = std::move(l);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};

  auto block_size = node.param().block_size;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)};

  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);

  uint32_t kw = node.param().kw;
  uint32_t kh = node.param().kh;
  const auto stride = node.param().stride;
  const auto padding =
      ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  ::arm_compute::PoolingLayerInfo info{
      ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
      ::onert::backend::acl_common::asPadStrideInfo(padding, stride)};

  auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);

  _return_fn = std::make_unique<exec::FunctionSequence>(
      asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
}

void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
  const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto lookups_alloc = _tensor_builder->at(lookups_index).get();
  auto values_alloc = _tensor_builder->at(values_index).get();

  auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>();

  fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}

void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};

  // {CL|Neon}L2Normalization performs the reduction only along dimension 0, while
  // L2 Normalization always performs the reduction along the depth axis.
  // Thus, we repurpose {CL|Neon}NormalizationLayers to act as depthwise L2 normalizations
  // by choosing the normalization parameters as below.

  const auto &ifm_shape = _ctx.at(ifm_index).shape();
  // TODO Support optional constant dimension that normalization would be performed on
  const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
  int32_t radius =
      2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
  float alpha = 1.0f;                            // In the implementation to make alpha_ become 1
  float beta = 0.5f;                             // pow(reduction, -0.5) = 1 / sqrt(reduction)
  float bias = 0.0f;                             // Don't offset the reduction.
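  // With these parameters, CROSS_MAP normalization (is_scaled == false below, so alpha is not
  // divided by the window size) computes out = x / (bias + alpha * sum(x_k^2))^beta
  // = x / sqrt(sum of x_k^2 over all channels), i.e. an L2 normalization along depth; the
  // radius above guarantees the window always covers the whole depth.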
  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
                                                               radius, alpha, beta, bias, false);

  auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
{
  const auto output_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::OUTPUT)};
  const auto hits_index{node.getOutputs().at(ir::operation::HashtableLookup::Output::HITS)};

  const auto lookups_index{node.getInputs().at(ir::operation::HashtableLookup::Input::LOOKUPS)};
  const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
  const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto hits_alloc = _tensor_builder->at(hits_index).get();

  auto lookups_alloc = _tensor_builder->at(lookups_index).get();
  auto keys_alloc = _tensor_builder->at(keys_index).get();
  auto values_alloc = _tensor_builder->at(values_index).get();

  auto fn = std::make_unique<::arm_compute::CLHashtableLookup>();

  fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
                output_alloc->handle(), hits_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::PReLU &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
  const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto alpha_alloc = _tensor_builder->at(alpha_index).get();

  auto fn = std::make_unique<::arm_compute::CLPReLU>();

  fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::TransposeConv &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ker_index{node.getInputs().at(ir::operation::TransposeConv::Input::KERNEL)};
  const auto ifm_index{node.getInputs().at(ir::operation::TransposeConv::Input::INPUT)};

  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
  const auto ker_shape = _ctx.at(ker_index).shape().asFeature(_current_op_seq_layout);

  const auto stride = node.param().stride;

  assert((node.param().padding.type == ir::PaddingType::SAME) ||
         (node.param().padding.type == ir::PaddingType::VALID));
  auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
                                      ker_shape.W, ker_shape.H);

  uint32_t invalid_horizontal = 0;
  uint32_t invalid_vertical = 0;
  if (node.param().padding.type == ir::PaddingType::VALID)
  {
    invalid_horizontal =
        ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
    invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
  }
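  // With VALID padding, a transposed convolution covers (ifm - 1) * stride + ker output
  // pixels per axis, so the "invalid" margin computed above is
  //   ofm - ((ifm - 1) * stride + ker)
  // i.e. the right/bottom part of the requested output that no input pixel maps to; it is
  // passed on to CLTransposeConvLayer so those border pixels can be accounted for.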
  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto ker_alloc = _tensor_builder->at(ker_index).get();

  const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);

  auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>(
      _tensor_builder->acl_tensor_manager()->internal_buffer_manager());

  fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
                invalid_horizontal, invalid_vertical);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::SQRT &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::LogicalOr &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
  const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input0_alloc = _tensor_builder->at(input0_index).get();
  auto input1_alloc = _tensor_builder->at(input1_index).get();
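  // NOTE BOOL8 operands are backed by 8-bit tensors on this backend, so a bitwise OR over the
  // raw bytes implements the logical OR: 0x00 | 0x00 == 0x00, and any non-zero byte stays
  // non-zero.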
  auto fn = std::make_unique<::arm_compute::CLBitwiseOr>();

  fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::LogicalNot &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();
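  // NOTE Implementing logical NOT with CLBitwiseNot assumes BOOL8 "true" is materialized as
  // 0xFF: ~0x00 == 0xFF and ~0xFF == 0x00. A 0x00/0x01 encoding would not survive a bitwise
  // NOT.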
  auto fn = std::make_unique<::arm_compute::CLBitwiseNot>();

  fn->configure(input_alloc->handle(), output_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::TopKV2 &node)
{
  const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
  const auto outputIndices_index{
      node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};

  const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};

  // Currently we only support 1D input (or a 2D batch of vectors)
  assert(_ctx.at(inputData_index).shape().rank() == 1 ||
         _ctx.at(inputData_index).shape().rank() == 2);

  const auto k = node.param().k;

  auto values_alloc = _tensor_builder->at(outputValues_index).get();
  auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
  auto input_alloc = _tensor_builder->at(inputData_index).get();

  auto fn = std::make_unique<::arm_compute::CLTopKV2>();

  fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto ifm_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
  const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
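  // NOTE ToARMComputeAxis maps a frontend axis to ACL's dimension numbering, which is reversed
  // relative to the frontend (ACL dimension 0 is the innermost, fastest-varying one).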
  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  auto indices_alloc = _tensor_builder->at(indices_index).get();

  // NOTE The frontend layout and backend layout must be the same for this operation.
  // If they differ, we would have to add a stage that permutes the output tensor, which
  // would be inefficient even if it worked. In that case it would be better to give these
  // backend tensors the same layout in the first place.
  // There is one more thing to consider: this operation depends on the layout of the model.
  // For example, if an NHWC model has this operation with output rank == 4, indices rank == 2
  // and axis == 2, the operation should act on the W and C axes; but W and C are not adjacent
  // in NCHW, so a backend in NCHW cannot handle this case.
  const auto backend_layout = ofm_alloc->layout();
  UNUSED_RELEASE(backend_layout);
  assert(backend_layout == ifm_alloc->layout());
  assert(backend_layout == indices_alloc->layout());
  assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
  auto fn = std::make_unique<::arm_compute::CLGatherEx>();

  // input is n-D, indices k-D, output is (n + k - 1)-D
  size_t n = ifm_rank;
  assert(n == ifm_alloc->num_dimensions());
  size_t k = _ctx.at(indices_index).shape().rank();
  assert(k == indices_alloc->num_dimensions());
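  // NOTE "dim_correction" refers to ACL trimming trailing size-1 dimensions from a tensor's
  // reported number of dimensions. CLGatherEx is rank-sensitive, so the full-rank shapes are
  // temporarily restored below and reverted after configure().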
  // Disable applied dim_correction
  const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape();
  if (n != ifm_alloc->info()->num_dimensions())
  {
    // This means a high dimension's value is 1 and dim_correction has been applied to the
    // ifm tensor
    const auto ifm = _ctx.at(ifm_index);
    ifm_alloc->info()->set_tensor_shape(
        acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
  }
  const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape();
  if (k != indices_alloc->info()->num_dimensions())
  {
    // This means a high dimension's value is 1 and dim_correction has been applied to the
    // indices tensor
    const auto indices = _ctx.at(indices_index);
    indices_alloc->info()->set_tensor_shape(
        acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
  }

  fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);

  // Revert disabling applied dim_correction
  ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
  indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Neg &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLNeg>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Abs &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  const ::arm_compute::ActivationLayerInfo act_info{
      ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};

  auto fn = std::make_unique<::arm_compute::CLActivationLayer>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::ArgMax &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ArgMax::Input::INPUT)};

  auto ifm_shape = _ctx.at(ifm_index).shape();
  auto ofm_shape = _ctx.at(ofm_index).shape();

  assert((ifm_shape.rank() - 1) == ofm_shape.rank());

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  auto frontend_layout = _current_op_seq_layout;
  auto backend_layout = ifm_alloc->layout();

  int axis_value = node.param().axis;
  if (axis_value < 0)
  {
    axis_value += ifm_rank;
  }

  auto acl_axis =
      acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();

  auto fn = std::make_unique<::arm_compute::CLArgOperation>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
                ::arm_compute::ArgOperation::MAX);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Dequantize &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLCast>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{
      node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};

  auto radius = node.param().radius;
  auto alpha = node.param().alpha;
  auto beta = node.param().beta;
  auto bias = node.param().bias;

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
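  // NOTE NormalizationLayerInfo takes the full window size, while the frontend radius is a
  // half-window, hence radius * 2 + 1 below.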
  const auto norm_info = ::arm_compute::NormalizationLayerInfo(
      ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);

  auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};

  auto block_size = node.param().block_size;
  assert(block_size > 0);

  auto output_alloc = _tensor_builder->at(output_index).get();
  auto input_alloc = _tensor_builder->at(input_index).get();

  auto fn = std::make_unique<::arm_compute::CLDepthToSpace>();

  fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Split &node)
{
  const auto ifm_index{node.getInputs().at(ir::operation::Split::Input::INPUT)};

  assert(node.param().num_splits == static_cast<int>(node.getOutputs().size()));

  const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output : node.getOutputs())
    output_indexes.emplace_back(output);

  auto ifm_alloc = _tensor_builder->at(ifm_index).get();
  std::vector<arm_compute::ICLTensor *> output_allocs;
  for (const auto &ofm_ind : output_indexes)
    output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = ifm_alloc->layout();
  auto axis = node.param().axis;
  if (axis < 0)
    axis += ifm_rank;
  axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();

  auto fn = std::make_unique<::arm_compute::CLSplit>();

  fn->configure(ifm_alloc->handle(), output_allocs, axis);

  _return_fn = asAclClFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Unpack::Input::INPUT)};
  auto axis{node.param().axis};

  const auto input_rank = _ctx.at(input_index).shape().rank();

  std::vector<ir::OperandIndex> output_indexes;
  for (const auto &output_index : node.getOutputs())
    output_indexes.emplace_back(output_index);

  auto input = _tensor_builder->at(input_index).get()->handle();
  std::vector<arm_compute::ICLTensor *> outputs;
  for (const auto &output_index : output_indexes)
    outputs.emplace_back(_tensor_builder->at(output_index)->handle());

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_builder->at(input_index).get()->layout();
  if (axis < 0)
    axis += input_rank;
  axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();

  // Disable applied dim_correction
  std::vector<arm_compute::TensorShape> orig_outputs_acl_tensor_shapes;
  for (const auto &output_index : output_indexes)
  {
    size_t output_rank = _ctx.at(output_index).shape().rank();
    const auto &output_alloc = _tensor_builder->at(output_index);
    orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
    assert(output_rank == output_alloc->num_dimensions());
    if (output_rank != output_alloc->info()->num_dimensions())
    {
      // This means a high dimension's value is 1 and dim_correction has been applied to the
      // output tensor
      output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
          _ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
    }
  }

  auto fn = std::make_unique<::arm_compute::CLUnstack>();

  fn->configure(input, outputs, axis);

  _return_fn = asAclClFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto rank = _ctx.at(input_index).shape().rank();
  auto pad_base = _ctx.at(pad_index).data()->base();

  auto input_type = _ctx.at(input_index).typeInfo();
  auto data_type = acl_common::asDataType(input_type.type());
  auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
  const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);

  auto input = _tensor_builder->at(input_index).get()->handle();
  auto output = _tensor_builder->at(output_index).get()->handle();

  const auto frontend_layout = _current_op_seq_layout;
  const auto backend_layout = _tensor_builder->at(input_index).get()->layout();

  ::arm_compute::PaddingList padding_list;
  padding_list.resize(rank);
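  // The PAD input is a [rank, 2] int32 tensor: row n holds (padding_before, padding_after)
  // for the n-th frontend dimension, which is remapped to the matching ACL axis below.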
  for (int32_t n = 0; n < rank; ++n)
  {
    const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
    const auto axis =
        acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
    padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
  }

  auto fn = std::make_unique<::arm_compute::CLPadLayer>();

  // Disable applied dim_correction
  size_t input_rank = _ctx.at(input_index).shape().rank();
  const auto &input_alloc = _tensor_builder->at(input_index);
  assert(input_rank == input_alloc->num_dimensions());
  if (input_rank != input_alloc->info()->num_dimensions())
  {
    // This means a high dimension's value is 1 and dim_correction has been applied to the
    // input tensor
    input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
        _ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
  }

  fn->configure(input, output, padding_list, pixel_value);

  // Do not revert the disabled dim_correction: CLPadKernel only has a CL kernel for the
  // 4-dimensional case, and reverting would produce a mismatch in the result

  _return_fn = asAclClFunction(std::move(fn));
}
void KernelGenerator::visit(const ir::operation::Min &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLElementwiseMin>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::Max &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto lhs_alloc = _tensor_builder->at(lhs_index).get();
  auto rhs_alloc = _tensor_builder->at(rhs_index).get();

  auto fn = std::make_unique<::arm_compute::CLElementwiseMax>();

  fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
                0);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};

  auto ofm_alloc = _tensor_builder->at(ofm_index).get();
  auto ifm_alloc = _tensor_builder->at(ifm_index).get();

  auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();

  fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
                0);

  auto acl_fn = asAclClFunction(std::move(fn));

  _return_fn = std::move(acl_fn);
}
} // namespace acl_cl
} // namespace backend
} // namespace onert