/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"
#include "../op_vkcom.hpp"
#include "../op_webnn.hpp"
#include "../op_timvx.hpp"
#include "../op_cann.hpp"

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif

#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/concat.hpp"
using namespace cv::dnn::cuda4dnn;
#endif

namespace cv
{
namespace dnn
{

class ConcatLayerImpl CV_FINAL : public ConcatLayer
{
public:
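    // Reads the layer configuration: the concatenation axis (default 1, i.e. the
    // channel axis in NCHW layout), the optional padding mode with its fill value,
    // and the "scales"/"zeropoints" parameters used by the quantized INT8 path.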
    ConcatLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        axis = params.get<int>("axis", 1);
        padding = params.get<bool>("padding", false);
        paddingValue = params.get<int>("padding_value", 0);

        zeropoint = params.get<int>("zeropoints", 0);
        scale = params.get<float>("scales", 1.0f);
    }

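    // Shape inference: the output shape equals the common input shape with the size
    // along the concatenation axis replaced by the sum over all inputs; e.g. inputs
    // [1,3,4,4] and [1,5,4,4] concatenated along axis 1 give [1,8,4,4]. With padding
    // enabled, mismatched axes are instead grown to the per-axis maximum.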
    virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
                                 const int requiredOutputs,
                                 std::vector<MatShape> &outputs,
                                 std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() > 0);
        outputs.resize(1, inputs[0]);
        int cAxis = normalize_axis(axis, inputs[0]);

        int axisSum = 0;
        for (size_t i = 0; i < inputs.size(); i++)
        {
            MatShape curShape = inputs[i];

            if (padding)
            {
                for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
                {
                    outputs[0][curAxis] = std::max(outputs[0][curAxis], curShape[curAxis]);
                }
            }
            else
            {
                CV_Assert(curShape.size() == outputs[0].size());
                for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
                {
                    if (curAxis != cAxis && outputs[0][curAxis] != curShape[curAxis])
                        CV_Error(Error::StsBadSize, "Inconsistent shape for ConcatLayer");
                }
            }

            axisSum += curShape[cAxis];
        }
        outputs[0][cAxis] = axisSum;
        return false;
    }

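    // Reports which backends can run this layer. Most backends support only the plain
    // (non-padding) case; Halide additionally requires channel-axis concatenation, and
    // TimVX requires an explicit axis and an INT8-quantized layer type.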
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
#ifdef HAVE_TIMVX
        if (backendId == DNN_BACKEND_TIMVX && haveTimVX() && !padding)
        {
            if (axis == -1)
                return false;
            // The TimVX backend only implements the INT8 variant of this layer.
            size_t len = this->type.length();
            return len > 4 && this->type.substr(len - 4) == "Int8";
        }
#endif

#ifdef HAVE_INF_ENGINE
        if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
            return true;
#endif
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding) ||  // by channels
               (backendId == DNN_BACKEND_WEBNN && !padding) ||
               (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding) ||
               (backendId == DNN_BACKEND_CANN && !padding);
    }

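    // Fast path for concatenation along the channel axis of contiguous 4D (NCHW)
    // tensors. run() validates the inputs, builds a flat table of per-(batch, channel)
    // plane pointers in output order, and then copies whole planes in parallel stripes,
    // so no per-element index arithmetic is needed.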
    template <class T>
    class ChannelConcatInvoker : public ParallelLoopBody
    {
    public:
        std::vector<Mat>* inputs;
        Mat* output;
        int nstripes;
        std::vector<const T*> chptrs;

        static void run(std::vector<Mat>& inputs, Mat& output, int nstripes)
        {
            ChannelConcatInvoker cc;
            cc.inputs = &inputs;
            cc.output = &output;
            cc.nstripes = nstripes;

            size_t i, ninputs = inputs.size();
            int nchannels = 0, batchsz = output.size[0];
            for( i = 0; i < ninputs; i++ )
            {
                Mat& inp = inputs[i];
                CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
                           inp.dims == 4 && inp.size[0] == output.size[0] &&
                           inp.size[2] == output.size[2] &&
                           inp.size[3] == output.size[3] );
                nchannels += inp.size[1];
            }
            CV_Assert( nchannels == output.size[1] );
            CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );

            cc.chptrs.resize(nchannels*batchsz);

            int ofs = 0;
            for( i = 0; i < ninputs; i++ )
            {
                Mat& inp = inputs[i];
                for( int j = 0; j < batchsz; j++ )
                    for( int k = 0; k < inp.size[1]; k++ )
                    {
                        const T* ptr = inp.ptr<T>(j, k);
                        cc.chptrs[ofs + j*nchannels + k] = ptr;
                    }
                ofs += inp.size[1];
            }

            parallel_for_(Range(0, nstripes), cc, nstripes);
        }

        ChannelConcatInvoker() : inputs(0), output(0), nstripes(0) {}

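        // Each stripe covers a contiguous range of the flattened output of
        // nch * planeSize elements; within a stripe the data is copied plane by
        // plane, using the plane index to pick the source pointer from chptrs and
        // memcpy for blocks of up to 2^16 elements.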
        void operator()(const Range& r) const CV_OVERRIDE
        {
            size_t planeSize = (size_t)output->size[2]*output->size[3];
            size_t nch = chptrs.size();
            size_t total = nch*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(total, r.end*stripeSize);
            const T** ptrs = (const T**)&chptrs[0];
            T* outptr = output->ptr<T>();
            size_t blockSize0 = 1 << 16;

            for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
            {
                size_t ch = ofs0/planeSize;
                size_t ofs = ofs0 - ch*planeSize;
                // Clamp each copy to the stripe end as well, so that concurrent
                // stripes never write overlapping regions of the output.
                size_t blockSize = std::min(stripeEnd - ofs0, std::min(blockSize0, planeSize - ofs));
                memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
                ofs0 += blockSize;
            }
        }
    };

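    // OpenCL path: one copy kernel is enqueued per input; each kernel writes that
    // input's elements at the running offset along the concatenation axis of the
    // output. The padding mode is not handled here and falls back to the CPU code.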
#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        bool use_half = (inps.depth() == CV_16S);  // fp16 blobs are stored in CV_16S buffers
        inps.getUMatVector(inputs);
        outs.getUMatVector(outputs);

        int cAxis = normalize_axis(axis, inputs[0].dims);
        if (padding)
            return false;

        int bottom_concat_axis;
        int concat_size = total(shape(inputs[0]), cAxis + 1);
        int top_concat_axis = outputs[0].size[cAxis];
        int num_concats = total(shape(inputs[0]), 0, cAxis);
        int offset_concat_axis = 0;
        UMat& outMat = outputs[0];
        String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
        String kname = format("concat_%s", use_half ? "half" : "float");

        for (size_t i = 0; i < inputs.size(); i++)
        {
            ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
            if (kernel.empty())
                return false;

            UMat& inpMat = inputs[i];
            bottom_concat_axis = inputs[i].size[cAxis];
            size_t nthreads = inputs[i].total();

            kernel.set(0, (int)nthreads);
            kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
            kernel.set(2, (int)num_concats);
            kernel.set(3, (int)concat_size);
            kernel.set(4, (int)top_concat_axis);
            kernel.set(5, (int)bottom_concat_axis);
            kernel.set(6, (int)offset_concat_axis);
            kernel.set(7, ocl::KernelArg::PtrWriteOnly(outMat));

            if (!kernel.run(1, &nthreads, NULL, false))
                return false;

            offset_concat_axis += bottom_concat_axis;
        }

        return true;
    }
#endif

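    // CPU entry point. Channel-axis concatenation of 4D tensors without padding goes
    // through the parallel ChannelConcatInvoker fast path (float or int8); all other
    // cases copy each input into the corresponding sub-range of the output.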
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
                   inputs_arr.depth() != CV_8S,
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        int cAxis = normalize_axis(axis, inputs[0].dims);
        Mat& outMat = outputs[0];

        if (padding)
            outMat.setTo(paddingValue);

        if (cAxis == 1 && outMat.dims == 4 && !padding)
        {
            int nstripes = getNumThreads();
            if (outMat.type() == CV_8S)
                ChannelConcatInvoker<int8_t>::run(inputs, outMat, nstripes);
            else
                ChannelConcatInvoker<float>::run(inputs, outMat, nstripes);
        }
        else
        {
            std::vector<Range> ranges(outputs[0].dims, Range::all());

            ranges[cAxis].start = 0;
            for (size_t i = 0; i < inputs.size(); i++)
            {
                ranges[cAxis].end = ranges[cAxis].start + inputs[i].size[cAxis];
                for (int j = 0; j < outMat.dims; ++j)
                {
                    if (j == cAxis) continue;
                    // With padding, smaller inputs are centered within the output.
                    ranges[j].start = (outMat.size[j] - inputs[i].size[j]) / 2;
                    ranges[j].end = ranges[j].start + inputs[i].size[j];
                }
                inputs[i].copyTo(outMat(&ranges[0]));
                ranges[cAxis].start = ranges[cAxis].end;
            }
        }
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
        auto concat_axis = normalize_axis(axis, input_wrapper->getRank());
        return make_cuda_node<cuda4dnn::ConcatOp>(preferableTarget, std::move(context->stream), concat_axis, padding);
    }
#endif

    virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_VULKAN
        vkcom::Tensor in = VkComTensor(input[0]);
        int cAxis = normalize_axis(axis, in.dimNum());
        std::shared_ptr<vkcom::OpBase> op(new vkcom::OpConcat(cAxis));
        return Ptr<BackendNode>(new VkComBackendNode(input, op));
#endif // HAVE_VULKAN
        return Ptr<BackendNode>();
    }

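    // Halide builds the output as a chain of select() expressions over the channel
    // coordinate: each element is read from the input whose channel range contains c,
    // shifted by the running channel offset. Only axis == 1 is supported, as declared
    // in supportBackend().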
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);

        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        int offset = inputBuffers[0].channels();
        Halide::Expr topExpr = select(c < offset,
                                      inputBuffers[0](x, y, c, n),
                                      inputBuffers[1](x, y, c - offset, n));
        for (int i = 2; i < input.size(); ++i)
        {
            offset += inputBuffers[i - 1].channels();
            topExpr = select(c < offset, topExpr,
                             inputBuffers[i](x, y, c - offset, n));
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

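    // CANN maps this layer onto the ConcatD operator: the concat dimension and input
    // count are set as attributes, and every input is wired into the operator's
    // dynamic input list together with its tensor descriptor.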
#ifdef HAVE_CANN
    virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputsWrapper, const int index, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        CV_Assert(inputsWrapper.size() == nodes.size());

        // create operator
        std::string op_name = cv::format("concat_%d", index);
        auto op = std::make_shared<ge::op::ConcatD>(op_name);

        // set attributes
        int N = inputsWrapper.size();
        op->set_attr_concat_dim(axis);
        op->set_attr_N(N);

        // set inputs : x (dynamic)
        op->create_dynamic_input_x(N);
        for (int i = 0; i < N; i++)
        {
            auto x_i = inputsWrapper[i].dynamicCast<CannBackendWrapper>();
            auto x_i_desc = x_i->getTensorDesc();
            auto op_x_i = nodes[i].dynamicCast<CannBackendNode>()->getOp();
            op->set_dynamic_input_x(i, *op_x_i, "y");
            op->update_dynamic_input_desc_x(i, *x_i_desc);
        }

        // set outputs
        auto output_y_desc = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
        op->update_output_desc_y(*output_y_desc);

        return Ptr<BackendNode>(new CannBackendNode(op));
    }
#endif

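    // nGraph (OpenVINO) path. Inputs whose non-concat dimensions are smaller than the
    // common maximum are first centered by a constant Pad node, mirroring the CPU
    // padding behaviour; the (possibly padded) operands then feed a single Concat node.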
#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        InferenceEngine::DataPtr data = ngraphDataNode(inputs[0]);
        const int numDims = data->getDims().size();
        const int cAxis = normalize_axis(axis, numDims);
        std::vector<size_t> maxDims(numDims, 0);

        CV_Assert(inputs.size() == nodes.size());
        ngraph::OutputVector inp_nodes;
        for (int i = 0; i < nodes.size(); ++i)
        {
            inp_nodes.push_back(nodes[i].dynamicCast<InfEngineNgraphNode>()->node);

            std::vector<size_t> inpShape = ngraphDataNode(inputs[i])->getDims();
            for (int j = 0; j < numDims; ++j)
                maxDims[j] = std::max(maxDims[j], inpShape[j]);
        }
        for (int i = 0; i < inp_nodes.size(); ++i)
        {
            bool needPadding = false;
            std::vector<size_t> inpShape = ngraphDataNode(inputs[i])->getDims();
            std::vector<int64_t> begins(inpShape.size(), 0), ends(inpShape.size(), 0);
            for (int j = 0; j < inpShape.size(); ++j)
            {
                if (j != cAxis && inpShape[j] != maxDims[j])
                {
                    needPadding = true;
                    begins[j] = static_cast<int64_t>((maxDims[j] - inpShape[j]) / 2);
                    ends[j] = static_cast<int64_t>(maxDims[j] - inpShape[j] - begins[j]);
                }
            }
            if (needPadding)
            {
                inp_nodes[i] = std::make_shared<ngraph::op::v1::Pad>(
                    inp_nodes[i],
                    std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{begins.size()}, begins.data()),
                    std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{ends.size()}, ends.data()),
                    ngraph::op::PadMode::CONSTANT);
            }
        }
        auto concat = std::make_shared<ngraph::op::Concat>(inp_nodes, cAxis);
        return Ptr<BackendNode>(new InfEngineNgraphNode(concat));
    }
#endif  // HAVE_DNN_NGRAPH

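    // TimVX (NPU) path for the INT8 layer. OpenCV indexes axes in NCHW order while
    // TimVX uses WHCN, so the axis is mirrored as tvAxis = dims - 1 - cAxis (e.g. the
    // NCHW channel axis 1 of a 4D tensor becomes axis 2 in WHCN). All tensors share a
    // single asymmetric quantization given by (scale, zeropoint).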
#ifdef HAVE_TIMVX
    virtual Ptr<BackendNode> initTimVX(void* timVXInfo_,
                                       const std::vector<Ptr<BackendWrapper> > &inputsWrapper,
                                       const std::vector<Ptr<BackendWrapper> > &outputsWrapper,
                                       bool isLast) CV_OVERRIDE
    {
        // tvGraph initialization.
        auto timVxInfo = reinterpret_cast<TimVXInfo *>(timVXInfo_);
        CV_Assert(timVxInfo);
        Ptr<TimVXGraph> tvGraph = timVxInfo->getGraph();
        CV_Assert(tvGraph);
        Ptr<tim::vx::Graph> graph = tvGraph->graph;

        Ptr<TimVXBackendWrapper> inputWrapper = inputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
        Mat blob0 = inputWrapper->getMat();

        // TODO: support 5-dimensional tensors in TimVX in the future.
        if (blob0.dims > 4)
            return Ptr<TimVXBackendNode>();

        // Convert the axis from OpenCV NCHW order to TimVX WHCN order.
        int cAxis = normalize_axis(axis, blob0.dims);
        int tvAxis = blob0.dims - 1 - cAxis;
        CV_Assert(tvAxis >= 0);
        std::vector<int> inputsIndex, outputsIndex;
        int input_index = -1, output_index = -1;

        // Input
        Ptr<tim::vx::Quantization> tvQuant = Ptr<tim::vx::Quantization>(
                new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, scale, zeropoint));

        for (int i = 0; i < inputsWrapper.size(); i++)
        {
            inputWrapper = inputsWrapper[i].dynamicCast<TimVXBackendWrapper>();
            if (inputWrapper->isTensor())
            {
                input_index = tvGraph->getTensorIndex(inputWrapper->getTensor());
                if (input_index == -1)
                {
                    // Copy to a new inputWrapper.
                    Mat tmp = inputWrapper->getMat();
                    inputWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tmp));
                }
            }

            if (!inputWrapper->isTensor())
            {
                inputWrapper->createTensor(graph, tim::vx::TensorAttribute::INPUT, tvQuant);
                input_index = tvGraph->addWrapper(inputWrapper);
            }
            inputsIndex.push_back(input_index);
        }

        // Output
        CV_Assert(outputsWrapper.size() == 1);
        Ptr<TimVXBackendWrapper> outputWrapper = outputsWrapper[0].dynamicCast<TimVXBackendWrapper>();

        if (isLast)
        {
            auto shapeType = getShapeTypeFromMat(outputWrapper->getMat());

            // For the graph output tensor, the tensor shape must be set before createTensor().
            outputWrapper->setTensorShape(shapeType);
            outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT, tvQuant);
        }
        else
        {
            outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT, tvQuant);
        }
        output_index = tvGraph->addWrapper(outputWrapper);
        outputsIndex.push_back(output_index);

        std::shared_ptr<tim::vx::Operation> tvConcat = graph->CreateOperation<tim::vx::ops::Concat>(tvAxis, inputsWrapper.size());

        Ptr<TimVXBackendNode> tvBackendNode = new TimVXBackendNode(tvGraph, tvConcat, inputsIndex, outputsIndex);

        return tvBackendNode;
    }
#endif // HAVE_TIMVX

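    // When the network is quantized, a padded concat must fill the output with the
    // quantized equivalent of zero, so the padding value is set to the output
    // tensor's zero point.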
    virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
                             const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
    {
        if (padding)
            params.set("padding_value", zeropoints[1][0]);
        return true;
    }

#ifdef HAVE_WEBNN
    virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        Ptr<WebnnBackendNode> node = nodes[0].dynamicCast<WebnnBackendNode>();
        auto& webnnGraphBuilder = node->net->builder;
        std::vector<ml::Operand> inputsOperand;
        for (int i = 0; i < nodes.size(); i++)
        {
            inputsOperand.push_back(nodes[i].dynamicCast<WebnnBackendNode>()->operand);
        }
        auto operand = webnnGraphBuilder.Concat(inputsOperand.size(), inputsOperand.data(), axis);
        return Ptr<BackendNode>(new WebnnBackendNode(operand));
    }
#endif

    int zeropoint;
    float scale;
};

Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
{
    return Ptr<ConcatLayer>(new ConcatLayerImpl(params));
}

}  // namespace dnn
}  // namespace cv