/*
 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "KernelGenerator.h"

#include "ops/AddNLayer.h"
#include "ops/ArgMinMaxLayer.h"
#include "ops/BatchToSpaceNDLayer.h"
#include "ops/BinaryArithmeticLayer.h"
#include "ops/CompareLayer.h"
#include "ops/ConcatLayer.h"
#include "ops/ConvolutionLayer.h"
#include "ops/DepthToSpaceLayer.h"
#include "ops/DepthwiseConvolutionLayer.h"
#include "ops/EinsumLayer.h"
#include "ops/ElementwiseActivationLayer.h"
#include "ops/ElementwiseBinaryLayer.h"
#include "ops/ElementwiseUnaryLayer.h"
#include "ops/ExpandDimsLayer.h"
#include "ops/FillLayer.h"
#include "ops/FullyConnectedLayer.h"
#include "ops/GatherLayer.h"
#include "ops/LSTMLayer.h"
#include "ops/MeanLayer.h"
#include "ops/DetectionPostProcessLayer.h"
#include "ops/OneHotLayer.h"
#include "ops/OperationUtils.h"
#include "ops/PackLayer.h"
#include "ops/PadLayer.h"
#include "ops/PoolLayer.h"
#include "ops/PowLayer.h"
#include "ops/QuantizeLayer.h"
#include "ops/RangeLayer.h"
#include "ops/RankLayer.h"
#include "ops/ReduceLayer.h"
#include "ops/ReshapeLayer.h"
#include "ops/ResizeBilinearLayer.h"
#include "ops/ReverseLayer.h"
#include "ops/SelectLayer.h"
#include "ops/ShapeLayer.h"
#include "ops/SliceLayer.h"
#include "ops/SoftMaxLayer.h"
#include "ops/StridedSliceLayer.h"
#include "ops/SpaceToBatchNDLayer.h"
#include "ops/SpaceToDepthLayer.h"
#include "ops/SplitLayer.h"
#include "ops/SplitVLayer.h"
#include "ops/TileLayer.h"
#include "ops/TransposeLayer.h"
#include "ops/UnpackLayer.h"
#include "ops/SquaredDiffLayer.h"
#include "ops/L2NormLayer.h"
#include "ops/MatrixBandPartLayer.h"
#include "ops/BatchMatMulLayer.h"
#include "ops/BroadcastToLayer.h"
#include "ops/FusedBatchNormLayer.h"
#include "ops/LogSoftMaxLayer.h"
#include "ops/StatelessRandomUniformLayer.h"
#include <backend/Backend.h>
#include <backend/IConfig.h>
#include <memory>
#include <util/Utils.h>
#include <util/logging.h>
#include <exec/DynamicShapeInferer.h>

#include <stdexcept>
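
// The helpers in the anonymous namespace below map IR-level enum values to their cpu-kernel
// counterparts; any value without a mapping throws std::runtime_error.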
namespace onert
{
namespace backend
{
namespace cpu
{

namespace
{
ops::ArithmeticType
convertArithmeticType(ir::operation::BinaryArithmetic::ArithmeticType arithmetic_type_ir)
{
  switch (arithmetic_type_ir)
  {
    case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
      return ops::ArithmeticType::kAdd;
    case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
      return ops::ArithmeticType::kSub;
    case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
      return ops::ArithmeticType::kMul;
    case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
      return ops::ArithmeticType::kDiv;
    default:
      throw std::runtime_error("cpu KernelGenerator : Not supported operation yet");
  }
}
ops::ElementwiseActivationType
convertElementwiseActivationType(ir::operation::ElementwiseActivation::Type type_ir)
{
  switch (type_ir)
  {
    case ir::operation::ElementwiseActivation::Type::ELU:
      return ops::ElementwiseActivationType::kElu;
    case ir::operation::ElementwiseActivation::Type::LOGISTIC:
      return ops::ElementwiseActivationType::kLogistic;
    case ir::operation::ElementwiseActivation::Type::RELU:
      return ops::ElementwiseActivationType::kReLU;
    case ir::operation::ElementwiseActivation::Type::TANH:
      return ops::ElementwiseActivationType::kTanh;
    case ir::operation::ElementwiseActivation::Type::LEAKY_RELU:
      return ops::ElementwiseActivationType::kLeakyReLU;
    default:
      throw std::runtime_error("cpu KernelGenerator : Not supported operation yet");
  }
}
ops::ElementwiseBinaryType
convertElementwiseBinaryType(ir::operation::ElementwiseBinary::ElementwiseBinaryType type_ir)
{
  switch (type_ir)
  {
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::FLOOR_DIV:
      return ops::ElementwiseBinaryType::kFloorDiv;
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
      return ops::ElementwiseBinaryType::kLogicalAnd;
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
      return ops::ElementwiseBinaryType::kLogicalOr;
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
      return ops::ElementwiseBinaryType::kMax;
    case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
      return ops::ElementwiseBinaryType::kMin;
    default:
      throw std::runtime_error("cpu KernelGenerator : Not supported operation yet");
  }
}
ops::ElementwiseUnaryType convertElementwiseUnaryType(ir::operation::ElementwiseUnary::Type type_ir)
{
  switch (type_ir)
  {
    case ir::operation::ElementwiseUnary::Type::ABS:
      return ops::ElementwiseUnaryType::kAbs;
    case ir::operation::ElementwiseUnary::Type::CAST:
      return ops::ElementwiseUnaryType::kCast;
    case ir::operation::ElementwiseUnary::Type::COS:
      return ops::ElementwiseUnaryType::kCos;
    case ir::operation::ElementwiseUnary::Type::DEQUANTIZE:
      return ops::ElementwiseUnaryType::kDequantize;
    case ir::operation::ElementwiseUnary::Type::ERF:
      return ops::ElementwiseUnaryType::kErf;
    case ir::operation::ElementwiseUnary::Type::EXP:
      return ops::ElementwiseUnaryType::kExp;
    case ir::operation::ElementwiseUnary::Type::FLOOR:
      return ops::ElementwiseUnaryType::kFloor;
    case ir::operation::ElementwiseUnary::Type::LOG:
      return ops::ElementwiseUnaryType::kLog;
    case ir::operation::ElementwiseUnary::Type::LOGICAL_NOT:
      return ops::ElementwiseUnaryType::kLogicalNot;
    case ir::operation::ElementwiseUnary::Type::NEG:
      return ops::ElementwiseUnaryType::kNeg;
    case ir::operation::ElementwiseUnary::Type::QUANTIZE:
      return ops::ElementwiseUnaryType::kQuantize;
    case ir::operation::ElementwiseUnary::Type::ROUND:
      return ops::ElementwiseUnaryType::kRound;
    case ir::operation::ElementwiseUnary::Type::RSQRT:
      return ops::ElementwiseUnaryType::kRSqrt;
    case ir::operation::ElementwiseUnary::Type::SIN:
      return ops::ElementwiseUnaryType::kSin;
    case ir::operation::ElementwiseUnary::Type::SQRT:
      return ops::ElementwiseUnaryType::kSqrt;
    case ir::operation::ElementwiseUnary::Type::SQUARE:
      return ops::ElementwiseUnaryType::kSquare;
    case ir::operation::ElementwiseUnary::Type::ZEROS_LIKE:
      return ops::ElementwiseUnaryType::kZerosLike;
    default:
      throw std::runtime_error("cpu KernelGenerator : Not supported operation yet");
  }
}
ops::PoolType convertPoolType(ir::operation::Pool2D::PoolType type_ir)
{
  switch (type_ir)
  {
    case ir::operation::Pool2D::PoolType::AVG:
      return ops::PoolType::kAvg;
    case ir::operation::Pool2D::PoolType::MAX:
      return ops::PoolType::kMax;
    default:
      throw std::runtime_error("cpu KernelGenerator : Not supported operation yet");
  }
}
ops::ReduceType convertReduceType(ir::operation::Reduce::ReduceType reduce_type_ir)
{
  switch (reduce_type_ir)
  {
    case ir::operation::Reduce::ReduceType::ALL:
      return ops::ReduceType::kAll;
    case ir::operation::Reduce::ReduceType::ANY:
      return ops::ReduceType::kAny;
    case ir::operation::Reduce::ReduceType::MAX:
      return ops::ReduceType::kMax;
    case ir::operation::Reduce::ReduceType::MIN:
      return ops::ReduceType::kMin;
    case ir::operation::Reduce::ReduceType::PROD:
      return ops::ReduceType::kProd;
    case ir::operation::Reduce::ReduceType::SUM:
      return ops::ReduceType::kSum;
    default:
      throw std::runtime_error("cpu KernelGenerator : Not supported operation yet");
  }
}
} // namespace
KernelGenerator::KernelGenerator(
  const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
  const std::shared_ptr<basic::TensorRegistry> &tensor_reg,
  const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
  const std::shared_ptr<ExternalContext> &external_context)
  : basic::KernelGeneratorBase{graph},
    _ctx(graph.operands()), _operations_ctx{graph.operations()}, _current_layout{graph.layout()},
    _tensor_builder(tensor_builder), _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder),
    _external_context(external_context)
{
  // DO NOTHING
}
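
// generate() builds the FunctionSequence for a single operation: it attaches a context for
// run-time dynamic-shape inference, dispatches to the matching visit() below (which leaves the
// generated kernel in _return_fn), and raises the reference count of every tensor the operation
// touches so the backing buffers stay alive while this kernel can run.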
std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
  auto ret = std::make_unique<exec::FunctionSequence>();

  assert(_tensor_builder->dynamicTensorManager());

  auto dyn_shape_inferer = std::make_shared<exec::DynamicShapeInferer>(_ctx, _tensor_reg);

  // Prepare to handle dynamic tensors later
  auto dyn_ctx = std::make_shared<exec::FunctionSequence::DynamicTensorCtx>();
  {
    dyn_ctx->op_ind = ind;
    dyn_ctx->operations = &_operations_ctx;
    dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer);
  }
  ret->dynamic_tensor_ctx(dyn_ctx);

  auto &op = _graph.operations().at(ind);
  op.accept(*this);
  assert(_return_fn); // _return_fn must have been generated
  ret->append(std::move(_return_fn));

  for (auto ind : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
  {
    auto portable_tensor = _tensor_reg->getPortableTensor(ind);
    if (portable_tensor)
    {
      assert(portable_tensor->layout() == ir::Layout::NHWC);
    }

    auto tensor = _tensor_reg->getNativeTensor(ind);
    if (tensor)
    {
      tensor->increase_ref();
    }
  }
  return ret;
}
void KernelGenerator::visit(const ir::operation::AddN &node)
{
  const auto output_index{node.getOutputs().at(0)};

  std::vector<const IPortableTensor *> input_tensors;
  for (auto &input_idx : node.getInputs())
    input_tensors.emplace_back(_tensor_reg->getPortableTensor(input_idx));

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);

  auto fn = std::make_unique<ops::AddNLayer>();

  fn->configure(std::move(input_tensors), output_tensor);

  _return_fn = std::move(fn);
}
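
// For Conv2D, explicit padding can only be precomputed when the input and kernel shapes are
// static; with dynamic shapes the layer receives the raw padding parameters and resolves the
// explicit padding values at run time.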
void KernelGenerator::visit(const ir::operation::Conv2D &node)
{
  using ir::operation::Conv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(Conv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getPortableTensor(ker_index);
  auto bias_tensor = _tensor_reg->getPortableTensor(bias_index);

  const auto stride = node.param().stride;
  const auto activation = node.param().activation;
  const auto param_padding = node.param().padding;
  const auto dilation = node.param().dilation;
  auto fn = std::make_unique<ops::ConvolutionLayer>();

  if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic())
  {
    fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left,
                  param_padding.param.right, param_padding.param.top, param_padding.param.bottom,
                  stride.horizontal, stride.vertical, dilation.width_factor, dilation.height_factor,
                  activation, ofm_tensor);

    _return_fn = std::move(fn);
    return;
  }
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  // Kernel format is [depth_out, kernel_height, kernel_width, depth_in].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);

  const auto padding =
    ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
                         dilation.width_factor, dilation.height_factor);

  fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left,
                padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical,
                dilation.width_factor, dilation.height_factor, activation, ofm_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
  using ir::operation::DepthwiseConv2D;

  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(DepthwiseConv2D::Input::INPUT)};
  const auto ker_index{node.getInputs().at(DepthwiseConv2D::Input::KERNEL)};
  const auto bias_index{node.getInputs().at(DepthwiseConv2D::Input::BIAS)};

  const auto stride = node.param().stride;
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  // Kernel format is [1, kernel_height, kernel_width, depth_out].
  const auto &ker_shape = _ctx.at(ker_index).shape();
  const auto ker_height = ker_shape.dim(1);
  const auto ker_width = ker_shape.dim(2);
  const auto dilation_width = node.param().dilation.width_factor;
  const auto dilation_height = node.param().dilation.height_factor;
  const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
                                            ker_width, ker_height, dilation_width, dilation_height);
  const auto multiplier = node.param().multiplier;
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);
  auto ker_tensor = _tensor_reg->getPortableTensor(ker_index);
  auto bias_tensor = _tensor_reg->getPortableTensor(bias_index);

  auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>();

  fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top,
                padding.bottom, stride.horizontal, stride.vertical, multiplier, dilation_width,
                dilation_height, activation, ofm_tensor, _external_context);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Concat &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto rank = _ctx.at(ofm_index).shape().rank();
  const auto axis = ops::getAxis(rank, node.param().axis, _current_layout);

  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);

  std::vector<const IPortableTensor *> input_tensors;
  for (auto &ifm_idx : node.getInputs())
    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));

  auto fn = std::make_unique<ops::ConcatLayer>();

  fn->configure(input_tensors, axis, output_tensor);

  _return_fn = std::move(fn);
}
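
// BatchToSpaceND: the CROPS input is optional. The NNAPI form of this operation carries only
// two inputs (input and block size), in which case a null crops tensor is passed to the layer.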
void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::BatchToSpaceND::INPUT)};
  const auto block_size_index{node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE)};

  auto output_alloc = _tensor_reg->getPortableTensor(output_index);
  auto input_alloc = _tensor_reg->getPortableTensor(input_index);
  auto block_size_alloc = _tensor_reg->getPortableTensor(block_size_index);

  auto fn = std::make_unique<ops::BatchToSpaceNDLayer>();

  IPortableTensor *crops_alloc = nullptr;
  const auto NNApiInputs = 2;

  if (node.getInputs().size() != NNApiInputs)
  {
    const auto crops_data_index{node.getInputs().at(ir::operation::BatchToSpaceND::CROPS_DATA)};
    crops_alloc = _tensor_reg->getPortableTensor(crops_data_index);
  }

  fn->configure(input_alloc, output_alloc, block_size_alloc, crops_alloc);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Fill &node)
{
  const auto output_index{node.getOutputs().at(0)};
  // SHAPE input is used for shape inference
  const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto value_tensor = _tensor_reg->getPortableTensor(value_index);

  auto fn = std::make_unique<ops::FillLayer>();

  fn->configure(value_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
  using ir::operation::FullyConnected;

  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
  const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
  const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
  const auto activation = node.param().activation;
  const auto weights_format = node.param().weights_format;

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto weight_tensor = _tensor_reg->getPortableTensor(weight_index);
  auto bias_tensor = bias_index.undefined() ? nullptr : _tensor_reg->getPortableTensor(bias_index);

  auto fn = std::make_unique<ops::FullyConnectedLayer>();

  fn->configure(input_tensor, weight_tensor, bias_tensor, activation, weights_format, output_tensor,
                _external_context);

  _return_fn = std::move(fn);
}
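
// Reshape: the SHAPE input is optional. When it is absent, the layer gets a null shape tensor
// and the output shape is taken from shape inference instead.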
void KernelGenerator::visit(const ir::operation::Reshape &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  // optional 2nd input
  IPortableTensor *shape_tensor = nullptr;

  if (node.getInputs().size() == 2)
  {
    const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)};
    shape_tensor = _tensor_reg->getPortableTensor(shape_index);
  }

  auto fn = std::make_unique<ops::ReshapeLayer>();

  fn->configure(input_tensor, shape_tensor, output_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Squeeze &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  // Squeeze can share the same kernel with Reshape
  auto fn = std::make_unique<ops::ReshapeLayer>();

  fn->configure(input_tensor, nullptr, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Softmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};

  const auto beta = node.param().beta;

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  auto fn = std::make_unique<ops::SoftMaxLayer>();

  fn->configure(input_tensor, beta, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BinaryArithmetic::Input::RHS)};

  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);

  auto fn = std::make_unique<ops::BinaryArithmeticLayer>();

  fn->configure(lhs_tensor, rhs_tensor, ofm_tensor, activation,
                convertArithmeticType(node.param().arithmetic_type));

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Comparison &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
  const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);

  auto comparison_type = node.param().comparison_type;

  auto fn = std::make_unique<ops::CompareLayer>();

  fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Gather &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
  const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto indices_tensor = _tensor_reg->getPortableTensor(indices_index);

  const auto backend_layout = output_tensor->layout();
  UNUSED_RELEASE(backend_layout);

  // NOTE The frontend layout and backend layout must be the same for this operation.
  //      If they differed, a stage would have to be added to permute the output tensor,
  //      which would not be efficient even if it worked. In that case it would be better
  //      to give these backend tensors the same layout.
  //      There is one more thing to consider: this operation depends on the layout of the
  //      model. For example, if an NHWC model has this operation with output rank == 4,
  //      indices rank == 2 and axis == 2, the operation should work on the W and C axes,
  //      but W and C are not adjacent in NCHW, so an NCHW backend cannot handle this case.
  assert(backend_layout == input_tensor->layout());
  assert(backend_layout == indices_tensor->layout());
  const auto &input_shape = _ctx.at(input_index).shape();
  UNUSED_RELEASE(input_shape);
  assert(input_shape.rank() < 4 || _current_layout == backend_layout);

  const auto axis_raw = node.param().axis;
  const auto axis_value = (axis_raw < 0 ? (input_shape.rank() + axis_raw) : axis_raw);

  auto fn = std::make_unique<ops::GatherLayer>();

  fn->configure(input_tensor, indices_tensor, output_tensor, axis_value);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::OneHot &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto indices_index{node.getInputs().at(ir::operation::OneHot::INDICES)};
  const auto depth_index{node.getInputs().at(ir::operation::OneHot::Input::DEPTH)};
  const auto onvalue_index{node.getInputs().at(ir::operation::OneHot::Input::ON_VALUE)};
  const auto offvalue_index{node.getInputs().at(ir::operation::OneHot::Input::OFF_VALUE)};

  const auto axis = node.param().axis;

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto indices_tensor = _tensor_reg->getPortableTensor(indices_index);
  auto depth_tensor = _tensor_reg->getPortableTensor(depth_index);
  auto onvalue_tensor = _tensor_reg->getPortableTensor(onvalue_index);
  auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index);

  assert(indices_tensor->data_type() == OperandType::INT32);
  assert(axis <= static_cast<int>(indices_tensor->getShape().rank()));

  auto fn = std::make_unique<ops::OneHotLayer>();

  fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Einsum &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);
  std::vector<const IPortableTensor *> input_tensors;
  for (auto &ifm_idx : node.getInputs())
    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));

  const auto equation = node.param().equation;

  auto fn = std::make_unique<ops::EinsumLayer>();

  fn->configure(input_tensors, equation, output_tensor);

  _return_fn = std::move(fn);
}
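
// Custom operations are not lowered to a built-in layer. Operand shapes/types and the raw user
// data blob are packed into CustomKernelConfigParams and handed to the injected kernel builder,
// which constructs the kernel registered under the operation's id.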
void KernelGenerator::visit(const ir::operation::Custom &node)
{
  auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq,
                          std::vector<custom::TypeInfo> &types,
                          std::vector<IPortableTensor *> &tensors) {
    for (auto &idx : opSeq)
    {
      const auto &operand = _ctx.at(idx);
      // TODO make sure using `_current_layout` is correct for custom operations
      types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()});
      auto in_tensor = _tensor_reg->getPortableTensor(idx);
      tensors.emplace_back(in_tensor);
    }
  };

  backend::custom::CustomKernelConfigParams params{};

  fill_op_info(node.getInputs(), params.input_types, params.input_tensors);
  fill_op_info(node.getOutputs(), params.output_types, params.output_tensors);

  params.userdata = node.userdata().data;
  params.userdata_size = node.userdata().size;

  auto fn = _kernel_builder->buildKernel(node.id(), std::move(params));

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseActivation::Input::INPUT)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  auto fn = std::make_unique<ops::ElementwiseActivationLayer>();

  fn->configure(input_tensor, output_tensor, node.param().alpha, node.param().beta,
                convertElementwiseActivationType(node.param().op_type));

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::ElementwiseBinary::Input::RHS)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);

  auto fn = std::make_unique<ops::ElementwiseBinaryLayer>();

  fn->configure(lhs_tensor, rhs_tensor, output_tensor,
                convertElementwiseBinaryType(node.param().op_type));

  _return_fn = std::move(fn);
}
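
// ElementwiseUnary: QUANTIZE is routed to a dedicated QuantizeLayer; every other unary type is
// handled by the generic ElementwiseUnaryLayer, parameterized via convertElementwiseUnaryType.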
void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ElementwiseUnary::Input::INPUT)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  if (node.param().op_type == ir::operation::ElementwiseUnary::Type::QUANTIZE)
  {
    auto fn = std::make_unique<ops::QuantizeLayer>();
    fn->configure(input_tensor, output_tensor);
    _return_fn = std::move(fn);
  }
  else
  {
    auto fn = std::make_unique<ops::ElementwiseUnaryLayer>();
    fn->configure(input_tensor, output_tensor, convertElementwiseUnaryType(node.param().op_type));
    _return_fn = std::move(fn);
  }
}
void KernelGenerator::visit(const ir::operation::ExpandDims &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
  // AXIS input is used for output shape inference

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  auto fn = std::make_unique<ops::ExpandDimsLayer>();

  fn->configure(input_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Pack &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  const auto rank = _ctx.at(ofm_index).shape().rank();
  const auto axis = ops::getAxis(rank, node.param().axis, _current_layout);

  assert(-rank <= axis && axis < rank);

  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);

  std::vector<const IPortableTensor *> input_tensors;
  for (auto &ifm_idx : node.getInputs())
    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));

  auto fn = std::make_unique<ops::PackLayer>();

  fn->configure(input_tensors, axis, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Unpack &node)
{
  const auto input_index{node.getInputs().at(0)};

  const auto rank = _ctx.at(input_index).shape().rank();
  const auto axis = ops::getAxis(rank, node.param().axis, _current_layout);

  assert(rank == 0 || (-rank <= axis && axis < rank));

  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  std::vector<IPortableTensor *> output_tensors;
  for (auto &output_idx : node.getOutputs())
    output_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx));

  auto fn = std::make_unique<ops::UnpackLayer>();

  uint32_t axis_resolved = (axis < 0 ? axis + rank : axis);

  fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors);

  _return_fn = std::move(fn);
}
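
// Pad handles both PAD (two inputs) and PADV2, which adds a third VALUE input holding the
// constant to pad with; without VALUE, value stays null and the layer falls back to its
// default padding value.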
void KernelGenerator::visit(const ir::operation::Pad &node)
{
  const auto input_index{node.getInputs().at(ir::operation::Pad::Input::INPUT)};
  const auto pad_index{node.getInputs().at(ir::operation::Pad::Input::PAD)};
  const auto output_index{node.getOutputs().at(0)};
  assert(_ctx.at(pad_index).data());

  auto input = _tensor_reg->getPortableTensor(input_index);
  auto output = _tensor_reg->getPortableTensor(output_index);
  auto pad_rank = _ctx.at(pad_index).shape().dim(0);
  auto pad_base = reinterpret_cast<const int32_t *>(_ctx.at(pad_index).data()->base());

  auto fn = std::make_unique<ops::PadLayer>();

  bool isPadV2 = node.getInputs().size() == 3;
  const void *value = nullptr;

  if (isPadV2)
  {
    const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)};
    value = reinterpret_cast<const void *>(_ctx.at(value_index).data()->base());
  }

  fn->configure(input, output, pad_base, pad_rank, value);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Transpose &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
  const auto perm_index{node.getInputs().at(ir::operation::Transpose::Input::PERMUTATION)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto perm_tensor = _tensor_reg->getPortableTensor(perm_index);

  auto fn = std::make_unique<ops::TransposeLayer>();

  fn->configure(input_tensor, perm_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Reduce &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
  const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};

  const auto keep_dims = node.param().keep_dims;
  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto axes_tensor = _tensor_reg->getPortableTensor(axes_index);

  if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN)
  {
    auto fn = std::make_unique<ops::MeanLayer>();

    fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims);

    _return_fn = std::move(fn);
  }
  else
  {
    auto fn = std::make_unique<ops::ReduceLayer>();

    const auto reduce_type = convertReduceType(node.param().reduce_type);
    fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims);

    _return_fn = std::move(fn);
  }
}
void KernelGenerator::visit(const ir::operation::Select &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto condition_index{node.getInputs().at(ir::operation::Select::Input::CONDITION)};
  const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)};
  const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto condition_tensor = _tensor_reg->getPortableTensor(condition_index);
  auto true_tensor = _tensor_reg->getPortableTensor(true_index);
  auto false_tensor = _tensor_reg->getPortableTensor(false_index);

  auto fn = std::make_unique<ops::SelectLayer>();

  fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Slice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Slice::Input::INPUT)};
  const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
  const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto begins_tensor = _tensor_reg->getPortableTensor(begins_index);
  auto sizes_tensor = _tensor_reg->getPortableTensor(sizes_index);

  auto fn = std::make_unique<ops::SliceLayer>();

  fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::StridedSlice::Input::INPUT)};
  const auto starts_index{node.getInputs().at(ir::operation::StridedSlice::Input::STARTS)};
  const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
  const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto starts_tensor = _tensor_reg->getPortableTensor(starts_index);
  auto ends_tensor = _tensor_reg->getPortableTensor(ends_index);
  auto strides_tensor = _tensor_reg->getPortableTensor(strides_index);

  auto begin_mask = node.param().begin_mask;
  auto end_mask = node.param().end_mask;
  auto shrink_axis_mask = node.param().shrink_axis_mask;

  auto fn = std::make_unique<ops::StridedSliceLayer>();

  fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask,
                end_mask, shrink_axis_mask);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Split &node)
{
  const auto num_splits = node.param().num_splits;
  assert(num_splits == static_cast<int>(node.getOutputs().size()));

  const auto input_idx{node.getInputs().at(ir::operation::Split::Input::INPUT)};
  const auto axis_idx{node.getInputs().at(ir::operation::Split::Input::AXIS)};

  auto in_tensor = _tensor_reg->getPortableTensor(input_idx);
  auto axis_tensor = _tensor_reg->getPortableTensor(axis_idx);

  std::vector<IPortableTensor *> out_tensors;
  for (auto &output_idx : node.getOutputs())
    out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx));

  auto fn = std::make_unique<ops::SplitLayer>();

  fn->configure(in_tensor, axis_tensor, num_splits, out_tensors);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Shape &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);

  auto fn = std::make_unique<ops::ShapeLayer>();

  fn->configure(ifm_tensor, ofm_tensor);

  _return_fn = std::move(fn);
}
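
// ResizeBilinear is configured in one of three ways: output size from the operation parameters
// (single-input form), from a constant SIZE tensor read here at generation time, or from a
// non-constant SIZE tensor resolved at run time.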
void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::INPUT)};

  auto align_corners = node.param().align_corners;
  auto half_pixel_centers = node.param().half_pixel_centers;

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  auto fn = std::make_unique<ops::ResizeBilinearLayer>();

  if (node.getInputs().size() == 1)
  {
    fn->configure(input_tensor, output_tensor, node.param().height_out, node.param().width_out,
                  align_corners, half_pixel_centers);
  }
  else
  {
    assert(node.getInputs().size() == 2);
    const auto size_index{node.getInputs().at(ir::operation::ResizeBilinear::SIZE)};
    auto size_tensor = _tensor_reg->getPortableTensor(size_index);
    if (size_tensor->is_constant())
    {
      auto size_vec = _ctx.at(size_index).asVector<int32_t>();
      const auto height_out = size_vec[0];
      const auto width_out = size_vec[1];
      fn->configure(input_tensor, output_tensor, height_out, width_out, align_corners,
                    half_pixel_centers);
    }
    else
    {
      fn->configure(input_tensor, output_tensor, size_tensor, align_corners, half_pixel_centers);
    }
  }

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Reverse &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto axis_tensor = _tensor_reg->getPortableTensor(axis_index);

  auto fn = std::make_unique<ops::ReverseLayer>();

  fn->configure(input_tensor, axis_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::ArgMinMax::INPUT)};
  const auto axis_index{node.getInputs().at(ir::operation::ArgMinMax::AXIS)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto axis_tensor = _tensor_reg->getPortableTensor(axis_index);

  auto fn = std::make_unique<ops::ArgMinMaxLayer>();

  fn->configure(input_tensor, output_tensor, axis_tensor, node.param().is_arg_max);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Pool2D::Input::INPUT)};

  const auto kh = node.param().kh;
  const auto kw = node.param().kw;
  const auto stride = node.param().stride;
  const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
  const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
  const auto padding =
    ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
  const auto activation = node.param().activation;

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);

  auto fn = std::make_unique<ops::PoolLayer>();

  fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom,
                stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor,
                convertPoolType(node.param().op_type));

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Pow &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);

  auto fn = std::make_unique<ops::PowLayer>();

  fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(0)};

  auto output_alloc = _tensor_reg->getPortableTensor(output_index);
  auto input_alloc = _tensor_reg->getPortableTensor(input_index);

  auto fn = std::make_unique<ops::L2NormLayer>();

  fn->configure(input_alloc, output_alloc);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Range &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto start_index{node.getInputs().at(ir::operation::Range::START)};
  const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)};
  const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto start_tensor = _tensor_reg->getPortableTensor(start_index);
  auto limit_tensor = _tensor_reg->getPortableTensor(limit_index);
  auto delta_tensor = _tensor_reg->getPortableTensor(delta_index);

  auto fn = std::make_unique<ops::RangeLayer>();

  fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Rank &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto ifm_index{node.getInputs().at(ir::operation::Rank::Input::INPUT)};

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto ifm_tensor = _tensor_reg->getPortableTensor(ifm_index);

  auto fn = std::make_unique<ops::RankLayer>();

  fn->configure(ifm_tensor, ofm_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
{
  const auto ofm_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};

  auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);

  auto fn = std::make_unique<ops::SqDiffLayer>();

  fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Tile &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)};
  const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto multiples_tensor = _tensor_reg->getPortableTensor(multiples_index);

  auto fn = std::make_unique<ops::TileLayer>();

  fn->configure(input_tensor, multiples_tensor, output_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::MatrixBandPart &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::INPUT)};
  const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)};
  const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto num_lower_tensor = _tensor_reg->getPortableTensor(num_lower_index);
  auto num_upper_tensor = _tensor_reg->getPortableTensor(num_upper_index);

  auto fn = std::make_unique<ops::MatrixBandPartLayer>();

  fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::DetectionPostProcess &node)
{
  using NMS = ir::operation::DetectionPostProcess;

  ops::DetectionPostProcessLayer::DetectionPostProcessParameters parameters;
  parameters.scales.y = node.param().scale.y_scale;
  parameters.scales.x = node.param().scale.x_scale;
  parameters.scales.w = node.param().scale.w_scale;
  parameters.scales.h = node.param().scale.h_scale;

  parameters.iou_threshold = node.param().iou_threshold;
  parameters.score_threshold = node.param().score_threshold;
  parameters.max_boxes_per_class = node.param().max_boxes_per_class;
  parameters.max_detections = node.param().max_detections;
  parameters.num_classes = node.param().num_classes;
  parameters.center_box_format = node.param().center_size_boxes;
  parameters.max_classes_per_detection = node.param().max_classes_per_detection;

  auto boxes_index = node.getInputs().at(NMS::Input::BOXES);
  auto scores_index = node.getInputs().at(NMS::Input::SCORES);
  auto anchors_index = node.getInputs().at(NMS::Input::INPUT_ANCHORS);

  auto o_classes_index = node.getOutputs().at(NMS::Output::BOX_CLASSES);
  auto o_coords_index = node.getOutputs().at(NMS::Output::BOX_COORDS);
  auto o_scores_index = node.getOutputs().at(NMS::Output::BOX_SCORES);
  auto o_num_selected_index = node.getOutputs().at(NMS::Output::NUM_SELECTED);

  parameters.boxes_descr = _ctx.at(boxes_index).shape().dims();
  parameters.scrores_descr = _ctx.at(scores_index).shape().dims(); // (sic) field name

  parameters.boxes_input = _tensor_reg->getPortableTensor(boxes_index);
  parameters.scores_input = _tensor_reg->getPortableTensor(scores_index);
  parameters.anchors_input = _tensor_reg->getPortableTensor(anchors_index);

  parameters.box_classes_output = _tensor_reg->getPortableTensor(o_classes_index);
  parameters.box_coords_output = _tensor_reg->getPortableTensor(o_coords_index);
  parameters.box_scores_output = _tensor_reg->getPortableTensor(o_scores_index);
  parameters.num_selections_output = _tensor_reg->getPortableTensor(o_num_selected_index);

  auto fn = std::make_unique<ops::DetectionPostProcessLayer>();
  fn->configure(std::move(parameters));

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::BatchMatMul &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)};
  const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto lhs_tensor = _tensor_reg->getPortableTensor(lhs_index);
  auto rhs_tensor = _tensor_reg->getPortableTensor(rhs_index);

  const auto adj_x = node.param().adj_x;
  const auto adj_y = node.param().adj_y;

  auto fn = std::make_unique<ops::BatchMatMulLayer>();

  fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::BroadcastTo &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)};
  const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto shape_tensor = _tensor_reg->getPortableTensor(shape_index);

  auto fn = std::make_unique<ops::BroadcastToLayer>();

  fn->configure(input_tensor, shape_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
{
  const auto ofm_index{node.getOutputs().at(0)};

  auto output_tensor = _tensor_reg->getPortableTensor(ofm_index);
  std::vector<const IPortableTensor *> input_tensors;
  for (auto &ifm_idx : node.getInputs())
    input_tensors.emplace_back(_tensor_reg->getPortableTensor(ifm_idx));

  const auto epsilon = node.param().epsilon;
  const auto is_training = node.param().is_training;
  const auto data_format = node.param().data_format;

  auto fn = std::make_unique<ops::FusedBatchNormLayer>();

  fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::LogSoftmax &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::LogSoftmax::Input::INPUT)};

  const auto beta = node.param().beta;
  const auto axis = node.param().axis;

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  auto fn = std::make_unique<ops::LogSoftMaxLayer>();

  fn->configure(input_tensor, beta, axis, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(ir::operation::SpaceToBatchND::INPUT)};
  const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)};
  const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)};

  auto output_tensor = _tensor_reg->getPortableTensor(output_index);
  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto block_shape_tensor = _tensor_reg->getPortableTensor(block_shape_index);
  auto padding_tensor = _tensor_reg->getPortableTensor(padding_index);

  auto fn = std::make_unique<ops::SpaceToBatchNDLayer>();

  fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor);

  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
{
  const auto input_index{node.getInputs().at(ir::operation::DepthToSpace::Input::INPUT)};
  const auto output_index{node.getOutputs().at(0)};
  auto block_size = node.param().block_size;

  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto output_tensor = _tensor_reg->getPortableTensor(output_index);

  auto fn = std::make_unique<ops::DepthToSpaceLayer>();

  fn->configure(input_tensor, block_size, output_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
{
  const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
  const auto output_index{node.getOutputs().at(0)};
  auto block_size = node.param().block_size;

  auto input_tensor = _tensor_reg->getPortableTensor(input_index);
  auto output_tensor = _tensor_reg->getPortableTensor(output_index);

  auto fn = std::make_unique<ops::SpaceToDepthLayer>();

  fn->configure(input_tensor, block_size, output_tensor);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::StatelessRandomUniform &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto shape_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SHAPE)};
  const auto seed_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SEED)};

  auto output_alloc = _tensor_reg->getPortableTensor(output_index);
  auto shape_alloc = _tensor_reg->getPortableTensor(shape_index);
  auto seed_alloc = _tensor_reg->getPortableTensor(seed_index);

  auto fn = std::make_unique<ops::StatelessRandomUniformLayer>();

  fn->configure(shape_alloc, seed_alloc, output_alloc);
  _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::SplitV &node)
{
  const auto num_splits = node.param().num_splits;
  assert(num_splits == static_cast<int>(node.getOutputs().size()));

  const auto input_idx{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
  const auto size_splits{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
  const auto split_dim{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};

  auto in_tensor = _tensor_reg->getPortableTensor(input_idx);
  auto in_size_splits = _tensor_reg->getPortableTensor(size_splits);
  auto in_split_dim = _tensor_reg->getPortableTensor(split_dim);

  std::vector<IPortableTensor *> out_tensors;
  for (auto &output_idx : node.getOutputs())
    out_tensors.emplace_back(_tensor_reg->getPortableTensor(output_idx));

  auto fn = std::make_unique<ops::SplitVLayer>();

  fn->configure(in_tensor, in_size_splits, in_split_dim, num_splits, out_tensors);

  _return_fn = std::move(fn);
}
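
// LSTM: the presence of optional inputs encodes the variant, as the NOTEs below describe.
// Input-to-input and recurrent-to-input weights distinguish non-CIFG from CIFG, cell-to-*
// weights enable peephole connections, projection weights/bias enable projection, and a
// 24-input node additionally carries the four layer-normalization weight tensors.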
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
  const auto scratch_buffer_index{
    node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
  const auto output_state_out_index{
    node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
  const auto cell_state_out_index{
    node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
  const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};

  const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
  const auto input_to_input_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
  const auto input_to_forget_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
  const auto input_to_cell_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
  const auto input_to_output_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
  const auto recurrent_to_input_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
  const auto recurrent_to_forget_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
  const auto recurrent_to_cell_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
  const auto recurrent_to_output_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
  const auto cell_to_input_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
  const auto cell_to_forget_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
  const auto cell_to_output_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
  const auto input_gate_bias_index{
    node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
  const auto forget_gate_bias_index{
    node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
  const auto cell_gate_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
  const auto output_gate_bias_index{
    node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
  const auto projection_weights_index{
    node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
  const auto projection_bias_index{
    node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
  const auto output_state_in_index{
    node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
  const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
  const auto time_major = node.param().time_major;

  // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
  // has_input_to_input_weights && has_recurrent_to_input_weights: no CIFG
  // !(has_input_to_input_weights && has_recurrent_to_input_weights): CIFG
  // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
  bool has_input_to_input_weights = _ctx.exist(input_to_input_weights_index) &&
                                    (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
                                     _ctx.at(input_to_input_weights_index).shape().dim(1) != 0);
  bool has_recurrent_to_input_weights =
    _ctx.exist(recurrent_to_input_weights_index) &&
    (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
     _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0);

  // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
  // But the cell_to_input_weights does not exist in regular CIFG although peephole.
  // has_cell_to_forget_weights && has_cell_to_output_weights: peephole
  // !(has_cell_to_forget_weights && has_cell_to_output_weights): no peephole
  bool has_cell_to_forget_weights = _ctx.exist(cell_to_forget_weights_index) &&
                                    _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
  bool has_cell_to_output_weights = _ctx.exist(cell_to_output_weights_index) &&
                                    _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;

  bool has_input_gate_bias =
    _ctx.exist(input_gate_bias_index) && _ctx.at(input_gate_bias_index).shape().dim(0);

  bool has_projection_weights = _ctx.exist(projection_weights_index) &&
                                (_ctx.at(projection_weights_index).shape().dim(0) != 0 &&
                                 _ctx.at(projection_weights_index).shape().dim(1) != 0);
  bool has_projection_bias =
    _ctx.exist(projection_bias_index) && _ctx.at(projection_bias_index).shape().dim(0);

  auto scratch_buffer_tensor = _ctx.exist(scratch_buffer_index)
                                 ? _tensor_reg->getPortableTensor(scratch_buffer_index)
                                 : nullptr; // optional
  auto output_state_out_tensor = _ctx.exist(output_state_out_index)
                                   ? _tensor_reg->getPortableTensor(output_state_out_index)
                                   : nullptr; // optional
  auto cell_state_out_tensor = _ctx.exist(cell_state_out_index)
                                 ? _tensor_reg->getPortableTensor(cell_state_out_index)
                                 : nullptr; // optional
  auto output_tensor = _tensor_reg->getPortableTensor(output_index);

  auto input_tensor = _tensor_reg->getPortableTensor(input_index);

  auto input_to_input_weights_tensor =
    has_input_to_input_weights ? _tensor_reg->getPortableTensor(input_to_input_weights_index)
                               : nullptr; // optional
  auto input_to_forget_weights_tensor =
    _tensor_reg->getPortableTensor(input_to_forget_weights_index);
  auto input_to_cell_weights_tensor = _tensor_reg->getPortableTensor(input_to_cell_weights_index);
  auto input_to_output_weights_tensor =
    _tensor_reg->getPortableTensor(input_to_output_weights_index);
  auto recurrent_to_input_weights_tensor =
    has_recurrent_to_input_weights
      ? _tensor_reg->getPortableTensor(recurrent_to_input_weights_index)
      : nullptr; // optional
  auto recurrent_to_forget_weights_tensor =
    _tensor_reg->getPortableTensor(recurrent_to_forget_weights_index);
  auto recurrent_to_cell_weights_tensor =
    _tensor_reg->getPortableTensor(recurrent_to_cell_weights_index);
  auto recurrent_to_output_weights_tensor =
    _tensor_reg->getPortableTensor(recurrent_to_output_weights_index);

  auto cell_to_input_weights_tensor = _tensor_reg->getPortableTensor(cell_to_input_weights_index);
  auto cell_to_forget_weights_tensor =
    has_cell_to_forget_weights ? _tensor_reg->getPortableTensor(cell_to_forget_weights_index)
                               : nullptr; // optional
  auto cell_to_output_weights_tensor =
    has_cell_to_output_weights ? _tensor_reg->getPortableTensor(cell_to_output_weights_index)
                               : nullptr; // optional

  auto input_gate_bias_tensor =
    has_input_gate_bias ? _tensor_reg->getPortableTensor(input_gate_bias_index) : nullptr;
  auto forget_gate_bias_tensor = _tensor_reg->getPortableTensor(forget_gate_bias_index);
  auto cell_gate_bias_tensor = _tensor_reg->getPortableTensor(cell_gate_bias_index);
  auto output_gate_bias_tensor = _tensor_reg->getPortableTensor(output_gate_bias_index);
  auto output_state_in_tensor = _tensor_reg->getPortableTensor(output_state_in_index);
  auto cell_state_in_tensor = _tensor_reg->getPortableTensor(cell_state_in_index);

  auto projection_weights_tensor = has_projection_weights
                                     ? _tensor_reg->getPortableTensor(projection_weights_index)
                                     : nullptr; // optional
  auto projection_bias_tensor = has_projection_bias
                                  ? _tensor_reg->getPortableTensor(projection_bias_index)
                                  : nullptr; // optional

  IPortableTensor *input_layer_norm_weights_tensor = nullptr;
  IPortableTensor *forget_layer_norm_weights_tensor = nullptr;
  IPortableTensor *cell_layer_norm_weights_tensor = nullptr;
  IPortableTensor *output_layer_norm_weights_tensor = nullptr;
  if (node.getInputs().size() == 24)
  {
    const auto input_layer_norm_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::INPUT_LAYER_NORMALIZATION_WEIGHTS)};
    const auto forget_layer_norm_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::FORGET_LAYER_NORMALIZATION_WEIGHTS)};
    const auto cell_layer_norm_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::CELL_LAYER_NORMALIZATION_WEIGHTS)};
    const auto output_layer_norm_weights_index{
      node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_LAYER_NORMALIZATION_WEIGHTS)};

    input_layer_norm_weights_tensor =
      _tensor_reg->getPortableTensor(input_layer_norm_weights_index);
    forget_layer_norm_weights_tensor =
      _tensor_reg->getPortableTensor(forget_layer_norm_weights_index);
    cell_layer_norm_weights_tensor = _tensor_reg->getPortableTensor(cell_layer_norm_weights_index);
    output_layer_norm_weights_tensor =
      _tensor_reg->getPortableTensor(output_layer_norm_weights_index);
  }

  auto fn = std::make_unique<ops::LSTMLayer>();

  fn->configure(
    input_tensor, input_to_input_weights_tensor, input_to_forget_weights_tensor,
    input_to_cell_weights_tensor, input_to_output_weights_tensor, recurrent_to_input_weights_tensor,
    recurrent_to_forget_weights_tensor, recurrent_to_cell_weights_tensor,
    recurrent_to_output_weights_tensor, cell_to_input_weights_tensor, cell_to_forget_weights_tensor,
    cell_to_output_weights_tensor, input_layer_norm_weights_tensor,
    forget_layer_norm_weights_tensor, cell_layer_norm_weights_tensor,
    output_layer_norm_weights_tensor,
    /*aux_input=*/nullptr,
    /*aux_input_to_input_weights=*/nullptr,
    /*aux_input_to_forget_weights=*/nullptr,
    /*aux_input_to_cell_weights=*/nullptr,
    /*aux_input_to_output_weights=*/nullptr, input_gate_bias_tensor, forget_gate_bias_tensor,
    cell_gate_bias_tensor, output_gate_bias_tensor, projection_weights_tensor,
    projection_bias_tensor, output_state_in_tensor, cell_state_in_tensor, node.param(),
    /*forward_sequence=*/true, time_major,
    /*output_offset=*/0, scratch_buffer_tensor, output_state_out_tensor, cell_state_out_tensor,
    output_tensor,
    !_ctx.at(output_state_in_index).info().isVariable() /* means empty buffer on frontend now */,
    !_ctx.at(cell_state_in_index).info().isVariable());

  _return_fn = std::move(fn);
}
} // namespace cpu
} // namespace backend
} // namespace onert