1 /*M///////////////////////////////////////////////////////////////////////////////////////
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
11 // For Open Source Computer Vision Library
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "../op_cuda.hpp"
46 #include "../op_halide.hpp"
47 #include "../op_inf_engine.hpp"
48 #include "../ie_ngraph.hpp"
49 #include "../op_vkcom.hpp"
50 #include "../op_webnn.hpp"
51 #include "../op_timvx.hpp"
52 #include "../op_cann.hpp"
55 #include "opencl_kernels_dnn.hpp"
59 #include "../cuda4dnn/primitives/concat.hpp"
60 using namespace cv::dnn::cuda4dnn;
68 class ConcatLayerImpl CV_FINAL : public ConcatLayer
// Builds the layer from `params`. setParamsFrom() is called first, then each
// concat option is read with an explicit default:
//   "axis"          -> axis          (default 1)
//   "padding"       -> padding       (default false)
//   "padding_value" -> paddingValue  (default 0)
//   "zeropoints"    -> zeropoint     (default 0)
//   "scales"        -> scale         (default 1.0f)
71 ConcatLayerImpl(const LayerParams& params)
73 setParamsFrom(params);
74 axis = params.get<int>("axis", 1);
75 padding = params.get<bool>("padding", false);
76 paddingValue = params.get<int>("padding_value", 0);
// zeropoint and scale are the values handed to tim::vx::Quantization in initTimVX().
78 zeropoint = params.get<int>("zeropoints", 0);
79 scale = params.get<float>("scales", 1.0f);
// Derives the single output shape from the input shapes.
// The output starts as a copy of inputs[0]; the concat axis is resolved with
// normalize_axis() and its extent is finally overwritten with axisSum, the sum
// of every input's extent along that axis.
// Fails a CV_Assert when `inputs` is empty.
// NOTE(review): the declaration of axisSum, the condition that chooses between
// the two per-input branches below, and the return statement are not visible
// in this listing -- presumably the branch is selected by `padding`; confirm.
82 virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
83 const int requiredOutputs,
84 std::vector<MatShape> &outputs,
85 std::vector<MatShape> &internals) const CV_OVERRIDE
87 CV_Assert(inputs.size() > 0);
88 outputs.resize(1, inputs[0]);
89 int cAxis = normalize_axis(axis, inputs[0]);
92 for (size_t i = 0; i < inputs.size(); i++)
94 MatShape curShape = inputs[i];
// Branch A: grow every output extent to the element-wise maximum over the inputs.
98 for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
100 outputs[0][curAxis] = std::max(outputs[0][curAxis], curShape[curAxis]);
// Branch B: the rank must match the output and every extent except the concat
// axis must be identical, otherwise Error::StsBadSize is raised.
105 CV_Assert(curShape.size() == outputs[0].size());
106 for (int curAxis = 0; curAxis < outputs[0].size(); curAxis++)
108 if (curAxis != cAxis && outputs[0][curAxis] != curShape[curAxis])
109 CV_Error(Error::StsBadSize, "Inconsistent shape for ConcatLayer");
// Accumulate this input's extent along the concat axis.
113 axisSum += curShape[cAxis];
115 outputs[0][cAxis] = axisSum;
// Reports whether this layer can run on the backend identified by `backendId`.
// NOTE(review): the values returned from the TimVX and nGraph branches are not
// visible in this listing; only their conditions are documented here.
119 virtual bool supportBackend(int backendId) CV_OVERRIDE
// TimVX is only considered when the backend is available and padding is off.
122 if (backendId == DNN_BACKEND_TIMVX && haveTimVX() && !padding)
126 int len = this->type.length();
// Checks whether the layer type name ends with "Int8".
// NOTE(review): substr(len - 4) needs len >= 4; no guard is visible here -- confirm one exists.
129 if (this->type.substr(len - 4) == "Int8")
136 #ifdef HAVE_INF_ENGINE
137 if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
// Remaining backends: OpenCV and CUDA are always accepted; Halide additionally
// requires haveHalide(), axis == 1 and no padding; WebNN and CANN require no
// padding; Vulkan requires haveVulkan() and no padding.
140 return backendId == DNN_BACKEND_OPENCV ||
141 backendId == DNN_BACKEND_CUDA ||
142 (backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1 && !padding) || // By channels
143 (backendId == DNN_BACKEND_WEBNN && !padding) ||
144 (backendId == DNN_BACKEND_VKCOM && haveVulkan() && !padding) ||
145 (backendId == DNN_BACKEND_CANN && !padding);
// Parallel loop body that concatenates 4-D blobs along dimension 1 by copying
// whole (size[2] x size[3]) planes into a contiguous output.
// T is the element type used for pointer arithmetic and for the memcpy size.
// NOTE(review): the template header and the declarations of the `output` and
// `nstripes` members are not visible in this listing.
149 class ChannelConcatInvoker : public ParallelLoopBody
152 std::vector<Mat>* inputs;
// One source-plane pointer per output plane; filled by run().
155 std::vector<const T*> chptrs;
// Validates the inputs against `output`, builds the plane-pointer table and
// launches the parallel copy over `nstripes` stripes.
157 static void run(std::vector<Mat>& inputs, Mat& output, int nstripes)
159 ChannelConcatInvoker cc;
162 cc.nstripes = nstripes;
164 size_t i, ninputs = inputs.size();
165 int nchannels = 0, batchsz = output.size[0];
// Every input must be continuous, of type CV_32F / CV_16S / CV_8S, 4-D, and
// agree with the output on dimensions 0, 2 and 3. Dimension 1 is summed.
166 for( i = 0; i < ninputs; i++ )
168 Mat& inp = inputs[i];
169 CV_Assert( inp.isContinuous() && (inp.type() == CV_32F || inp.type() == CV_16S || inp.type() == CV_8S) &&
170 inp.dims == 4 && inp.size[0] == output.size[0] &&
171 inp.size[2] == output.size[2] &&
172 inp.size[3] == output.size[3] );
173 nchannels += inp.size[1];
175 CV_Assert( nchannels == output.size[1] );
176 CV_Assert( output.isContinuous() && (output.type() == CV_32F || output.type() == CV_16S || output.type() == CV_8S) );
// One table entry per (batch item, output channel).
178 cc.chptrs.resize(nchannels*batchsz);
// Entry [ofs + j*nchannels + k] points at plane (j, k) of input i.
// NOTE(review): `ofs` is neither declared nor advanced in the visible lines --
// presumably it is the running channel offset of input i; confirm.
181 for( i = 0; i < ninputs; i++)
183 Mat& inp = inputs[i];
184 for( int j = 0; j < batchsz; j++ )
185 for( int k = 0; k < inp.size[1]; k++ )
187 const T* ptr = inp.ptr<T>(j, k);
188 cc.chptrs[ofs + j*nchannels + k] = ptr;
193 parallel_for_(Range(0, nstripes), cc, nstripes);
196 ChannelConcatInvoker() : inputs(0), output(0), nstripes(0) {}
// Copies the elements that belong to stripes [r.start, r.end).
// The output is treated as chptrs.size() planes of planeSize elements; the
// total element count is split into nstripes stripes of ceil(total/nstripes).
198 void operator()(const Range& r) const CV_OVERRIDE
200 size_t planeSize = (size_t)output->size[2]*output->size[3];
201 size_t nch = chptrs.size();
202 size_t total = nch*planeSize;
203 size_t stripeSize = (total + nstripes - 1)/nstripes;
204 size_t stripeStart = r.start*stripeSize;
205 size_t stripeEnd = std::min(total, r.end*stripeSize);
206 const T** ptrs = (const T**)&chptrs[0];
207 T* outptr = output->ptr<T>();
// Upper bound, in elements, for a single memcpy.
208 size_t blockSize0 = 1 << 16;
// NOTE(review): the statement that advances ofs0 is not visible in this
// listing -- presumably ofs0 += blockSize; confirm.
210 for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
// ch is the plane index, ofs the element offset inside that plane.
212 size_t ch = ofs0/planeSize;
213 size_t ofs = ofs0 - ch*planeSize;
// A block never crosses a plane boundary.
// NOTE(review): blockSize is not clamped to stripeEnd - ofs0, so the last
// block of a stripe may extend into the next stripe's range -- confirm this
// overlap is intended.
214 size_t blockSize = std::min(blockSize0, planeSize - ofs);
// NOTE(review): the byte count uses sizeof(T); T has to match the element size
// of the Mat type accepted by the assertions in run() -- confirm at call sites.
215 memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
// OpenCL implementation of the forward pass: runs one kernel launch per input,
// each writing that input into outputs[0] at a running offset along the
// concat axis.
// NOTE(review): the handling of a failed kernel.run() and the final return
// value are not visible in this listing.
222 bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
224 std::vector<UMat> inputs;
225 std::vector<UMat> outputs;
// CV_16S depth selects the "half" kernel and build option; anything else selects "float".
227 bool use_half = (inps.depth() == CV_16S);
228 inps.getUMatVector(inputs);
229 outs.getUMatVector(outputs);
231 int cAxis = normalize_axis(axis, inputs[0].dims);
// Kernel arguments shared by all launches.
// NOTE(review): total(shape, a, b) presumably returns the product of the
// extents in [a, b), making concat_size the element count after the concat
// axis and num_concats the element count before it -- confirm against helper.
235 int bottom_concat_axis;
236 int concat_size = total(shape(inputs[0]), cAxis + 1);
237 int top_concat_axis = outputs[0].size[cAxis];
238 int num_concats = total(shape(inputs[0]), 0, cAxis);
239 int offset_concat_axis = 0;
240 UMat& outMat = outputs[0];
241 String buildopt = format(" -DDtype=%s", (use_half) ? "half" : "float");
242 String kname = format("concat_%s", use_half ? "half" : "float");
244 for (size_t i = 0; i < inputs.size(); i++)
// A kernel object is created for every input.
246 ocl::Kernel kernel(kname.c_str(), ocl::dnn::concat_oclsrc, buildopt);
250 UMat& inpMat = inputs[i];
251 bottom_concat_axis = inputs[i].size[cAxis];
// One work item per input element.
252 size_t nthreads = inputs[i].total();
254 kernel.set(0, (int)nthreads);
255 kernel.set(1, ocl::KernelArg::PtrReadOnly(inpMat));
256 kernel.set(2, (int)num_concats);
257 kernel.set(3, (int)concat_size);
258 kernel.set(4, (int)top_concat_axis);
259 kernel.set(5, (int)bottom_concat_axis);
260 kernel.set(6, (int)offset_concat_axis);
261 kernel.set(7, ocl::KernelArg::PtrWriteOnly(outMat));
// 1-D launch with global size nthreads, no explicit local size, asynchronous (sync == false).
263 if (!kernel.run(1, &nthreads, NULL, false))
// The next input is written right after this one along the concat axis.
266 offset_concat_axis += bottom_concat_axis;
// CPU forward pass. The OpenCL path is tried first (via CV_OCL_RUN) when the
// preferable target is an OpenCL target and the input depth is not CV_8S.
// Otherwise the inputs are copied into outputs[0], either with the
// plane-copy invoker or with generic sub-matrix copies.
273 void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
276 CV_TRACE_ARG_VALUE(name, "name", name.c_str());
278 CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget) &&
279 inputs_arr.depth() != CV_8S,
280 forward_ocl(inputs_arr, outputs_arr, internals_arr))
282 std::vector<Mat> inputs, outputs;
283 inputs_arr.getMatVector(inputs);
284 outputs_arr.getMatVector(outputs);
286 int cAxis = normalize_axis(axis, inputs[0].dims);
287 Mat& outMat = outputs[0];
// Pre-fills the output with the padding value.
// NOTE(review): the guard around this call is not visible in this listing --
// presumably it only runs when `padding` is set; confirm.
290 outMat.setTo(paddingValue);
// Fast path: concatenation along dimension 1 of a 4-D output without padding.
292 if( cAxis == 1 && outMat.dims == 4 && !padding)
294 int nstripes = getNumThreads();
// NOTE(review): the float instantiation appears to be used for every type
// other than CV_8S, while ChannelConcatInvoker::run() also accepts CV_16S --
// confirm 16-bit data cannot reach this branch.
295 if (outMat.type() == CV_8S)
296 ChannelConcatInvoker<int8_t>::run(inputs, outMat, nstripes);
298 ChannelConcatInvoker<float>::run(inputs, outMat, nstripes);
// Generic path: copy each input into its own sub-region of the output.
302 std::vector<Range> ranges(outputs[0].dims, Range::all());
304 ranges[cAxis].start = 0;
305 for (size_t i = 0; i < inputs.size(); i++)
// Along the concat axis the region starts where the previous input ended.
307 ranges[cAxis].end = ranges[cAxis].start + inputs[i].size[cAxis];
// On every other axis the input is centred inside the output extent.
308 for (int j = 0; j < outMat.dims; ++j)
310 if (j == cAxis) continue;
311 ranges[j].start = (outMat.size[j] - inputs[i].size[j]) / 2;
312 ranges[j].end = ranges[j].start + inputs[i].size[j];
314 inputs[i].copyTo(outMat(&ranges[0]));
315 ranges[cAxis].start = ranges[cAxis].end;
// Creates the CUDA backend node for this layer.
// NOTE(review): the parameter that declares `context_` is not visible in this
// listing; it is reinterpreted below as a csl::CSLContext pointer.
321 Ptr<BackendNode> initCUDA(
323 const std::vector<Ptr<BackendWrapper>>& inputs,
324 const std::vector<Ptr<BackendWrapper>>& outputs
327 auto context = reinterpret_cast<csl::CSLContext*>(context_);
// The concat axis is normalized against the rank of the first input wrapper.
329 auto input_wrapper = inputs[0].dynamicCast<CUDABackendWrapper>();
330 auto concat_axis = normalize_axis(axis, input_wrapper->getRank());
// The node receives the context's stream, the resolved axis and the padding flag.
331 return make_cuda_node<cuda4dnn::ConcatOp>(preferableTarget, std::move(context->stream), concat_axis, padding);
// Creates the Vulkan backend node: a vkcom::OpConcat on the axis normalized
// against the rank (dimNum) of the first input tensor.
// The trailing return of an empty Ptr follows the HAVE_VULKAN block.
// NOTE(review): the opening of the HAVE_VULKAN block is not visible in this listing.
335 virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
338 vkcom::Tensor in = VkComTensor(input[0]);
339 int cAxis = normalize_axis(axis, in.dimNum());
340 std::shared_ptr<vkcom::OpBase> op(new vkcom::OpConcat(cAxis));
341 return Ptr<BackendNode>(new VkComBackendNode(input, op));
342 #endif // HAVE_VULKAN
343 return Ptr<BackendNode>();
// Creates the Halide backend node: a Func over (x, y, c, n) that selects, per
// channel index c, which input buffer the value is read from.
// The trailing return of an empty Ptr follows the HAVE_HALIDE block.
// NOTE(review): the opening of the HAVE_HALIDE block is not visible in this listing.
346 virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
349 std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
351 Halide::Var x("x"), y("y"), c("c"), n("n");
// The Func is named after the layer unless the layer name is empty.
352 Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
// `offset` is the number of channels contributed by the buffers handled so far.
353 int offset = inputBuffers[0].channels();
// NOTE(review): inputBuffers[1] is read unconditionally, so at least two
// inputs are expected here -- confirm callers guarantee that.
354 Halide::Expr topExpr = select(c < offset,
355 inputBuffers[0](x, y, c, n),
356 inputBuffers[1](x, y, c - offset, n));
// Each further buffer extends the select chain: channels below `offset` keep
// the expression built so far, the rest read from buffer i.
357 for (int i = 2; i < input.size(); ++i)
359 offset += inputBuffers[i - 1].channels();
360 topExpr = select(c < offset, topExpr,
361 inputBuffers[i](x, y, c - offset, n));
363 top(x, y, c, n) = topExpr;
364 return Ptr<BackendNode>(new HalideBackendNode(top));
365 #endif // HAVE_HALIDE
366 return Ptr<BackendNode>();
// Creates the CANN backend node: a ge::op::ConcatD operator named
// "concat_<index>" whose dynamic inputs are the operators of `nodes`.
// Requires one backend node per input wrapper (CV_Assert).
370 virtual Ptr<BackendNode> initCann(const std::vector<Ptr<BackendWrapper> > &inputsWrapper, const int index, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
372 CV_Assert(inputsWrapper.size() == nodes.size());
375 std::string op_name = cv::format("concat_%d", index);
376 auto op = std::make_shared<ge::op::ConcatD>(op_name);
379 int N = inputsWrapper.size();
// NOTE(review): `axis` is passed as configured, without normalize_axis() --
// confirm ConcatD accepts negative values if they can occur.
380 op->set_attr_concat_dim(axis);
383 // set inputs : x (dynamic)
384 op->create_dynamic_input_x(N);
// Input i is wired to output "y" of the i-th node's operator and described by
// the tensor descriptor of the i-th wrapper.
385 for (int i = 0; i < N; i++)
387 auto x_i = inputsWrapper[i].dynamicCast<CannBackendWrapper>();
388 auto x_i_desc = x_i->getTensorDesc();
389 auto op_x_i = nodes[i].dynamicCast<CannBackendNode>()->getOp();
390 op->set_dynamic_input_x(i, *op_x_i, "y");
391 op->update_dynamic_input_desc_x(i, *x_i_desc);
// The output descriptor is created with an empty shape, NCHW format and DT_FLOAT.
395 auto output_y_desc = std::make_shared<ge::TensorDesc>(ge::Shape(), ge::FORMAT_NCHW, ge::DT_FLOAT);
396 op->update_output_desc_y(*output_y_desc);
398 return Ptr<BackendNode>(new CannBackendNode(op));
402 #ifdef HAVE_DNN_NGRAPH
// Creates the nGraph backend node: an ngraph::op::Concat over the input nodes
// along the normalized axis. Inputs smaller than the per-axis maximum may be
// wrapped in a constant-mode Pad first.
// Requires one backend node per input wrapper (CV_Assert).
403 virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
404 const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
406 InferenceEngine::DataPtr data = ngraphDataNode(inputs[0]);
407 const int numDims = data->getDims().size();
408 const int cAxis = normalize_axis(axis, numDims);
// maxDims[d] becomes the largest extent of dimension d over all inputs.
409 std::vector<size_t> maxDims(numDims, 0);
411 CV_Assert(inputs.size() == nodes.size());
412 ngraph::OutputVector inp_nodes;
413 for (int i = 0; i < nodes.size(); ++i)
415 inp_nodes.push_back(nodes[i].dynamicCast<InfEngineNgraphNode>()->node);
417 std::vector<size_t> inpShape = ngraphDataNode(inputs[i])->getDims();
// NOTE(review): this inner loop reuses the name `i`, shadowing the outer index.
418 for (int i = 0; i < numDims; ++i)
419 maxDims[i] = std::max(maxDims[i], inpShape[i]);
// Second pass: compute per-axis padding for inputs that differ from maxDims on
// an axis other than the concat axis.
421 for (int i = 0; i < inp_nodes.size(); ++i)
423 bool needPadding = false;
424 std::vector<size_t> inpShape = ngraphDataNode(inputs[i])->getDims();
425 std::vector<int64_t> begins(inpShape.size(), 0), ends(inpShape.size(), 0);
426 for (int j = 0; j < inpShape.size(); ++j)
428 if (j != cAxis && inpShape[j] != maxDims[j])
// The missing extent is split in two: begins gets the floor half, ends the rest.
431 begins[j] = static_cast<int64_t>((maxDims[j] - inpShape[j]) / 2);
432 ends[j] = static_cast<int64_t>(maxDims[j] - inpShape[j] - begins[j]);
// NOTE(review): the condition guarding this Pad, its first argument, and the
// place where needPadding is set are not visible in this listing. No pad value
// is passed in the visible arguments -- confirm how paddingValue is honoured.
437 inp_nodes[i] = std::make_shared<ngraph::op::v1::Pad>(
439 std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{begins.size()}, begins.data()),
440 std::make_shared<ngraph::op::Constant>(ngraph::element::i64, ngraph::Shape{ends.size()}, ends.data()),
441 ngraph::op::PadMode::CONSTANT);
444 auto concat = std::make_shared<ngraph::op::Concat>(inp_nodes, cAxis);
445 return Ptr<BackendNode>(new InfEngineNgraphNode(concat));
447 #endif // HAVE_DNN_NGRAPH
// Creates the TimVX backend node: registers every input and the single output
// as tensors of the TimVX graph and adds a tim::vx::ops::Concat operation.
// NOTE(review): several conditions (the one guarding the early empty return,
// and the one choosing between OUTPUT and TRANSIENT output tensors --
// presumably `isLast`) are not visible in this listing; confirm.
450 virtual Ptr<BackendNode> initTimVX(void* timVXInfo_,
451 const std::vector<Ptr<BackendWrapper> > &inputsWrapper,
452 const std::vector<Ptr<BackendWrapper> > &outputsWrapper,
453 bool isLast) CV_OVERRIDE
455 // tvGraph Initialization.
456 auto timVxInfo = reinterpret_cast<TimVXInfo *>(timVXInfo_);
457 CV_Assert(timVxInfo);
458 Ptr<TimVXGraph> tvGraph = timVxInfo->getGraph();
460 Ptr<tim::vx::Graph> graph = tvGraph->graph;
462 Ptr<TimVXBackendWrapper> inputWrapper = inputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
463 // convert axis from OpenCV NCHW to TimVX WHCN.
464 Mat blob0 = inputWrapper->getMat();
466 // TODO! support TimVX 5 dim in future.
468 return Ptr<TimVXBackendNode>();
// TimVX numbers dimensions in reverse order, so the axis index is mirrored.
470 int cAxis = normalize_axis(axis, blob0.dims);
471 int tvAxis = blob0.dims - 1 - cAxis;
472 CV_Assert(tvAxis>= 0);
473 std::vector<int> inputsIndex, outputsIndex;
474 int input_index = -1, output_index = -1;
// One asymmetric quantization descriptor, built from this layer's scale and
// zeropoint, is shared by all input and output tensors created below.
477 Ptr<tim::vx::Quantization> tvQuant = Ptr<tim::vx::Quantization>(
478 new tim::vx::Quantization(tim::vx::QuantType::ASYMMETRIC, scale, zeropoint));
// Resolve a graph tensor index for every input.
480 for (int i = 0; i<inputsWrapper.size(); i++)
482 inputWrapper = inputsWrapper[i].dynamicCast<TimVXBackendWrapper>();
483 if (inputWrapper->isTensor())
485 input_index = tvGraph->getTensorIndex(inputWrapper->getTensor());
// The wrapper already holds a tensor that this graph does not know: wrap its
// Mat in a fresh wrapper so a new tensor is created for this graph below.
486 if (input_index == -1)
488 // Copy To New inputWrapper
489 Mat tmp = inputWrapper->getMat();
490 inputWrapper = Ptr<TimVXBackendWrapper>(new TimVXBackendWrapper(tmp));
// No tensor yet: create an INPUT tensor and register the wrapper with the graph.
494 if (!inputWrapper->isTensor())
496 inputWrapper->createTensor(graph,tim::vx::TensorAttribute::INPUT, tvQuant);
497 input_index = tvGraph->addWrapper(inputWrapper);
499 inputsIndex.push_back(input_index);
// Exactly one output is supported.
503 CV_Assert(outputsWrapper.size() == 1);
504 Ptr<TimVXBackendWrapper> outputWrapper = outputsWrapper[0].dynamicCast<TimVXBackendWrapper>();
508 auto shapeType = getShapeTypeFromMat(outputWrapper->getMat());
510 // For Graph Output tensor, we need to set tensor shape before createTensor().
511 outputWrapper->setTensorShape(shapeType);
512 outputWrapper->createTensor(graph, tim::vx::TensorAttribute::OUTPUT, tvQuant);
516 outputWrapper->createTensor(graph, tim::vx::TensorAttribute::TRANSIENT, tvQuant);
518 output_index = tvGraph->addWrapper(outputWrapper);
519 outputsIndex.push_back(output_index);
// The Concat operation is created with the mirrored axis and the input count.
521 std::shared_ptr<tim::vx::Operation> tvConcate = graph->CreateOperation<tim::vx::ops::Concat>(tvAxis, inputsWrapper.size());
523 Ptr<TimVXBackendNode> tvBackendNode = new TimVXBackendNode(tvGraph, tvConcate, inputsIndex, outputsIndex);
525 return tvBackendNode;
// Quantization hook: stores zeropoints[1][0] into `params` as "padding_value".
// NOTE(review): presumably zeropoints[1] holds the output zero points, so the
// padding fill matches the quantized output -- confirm against the caller.
// Any guard around the assignment and the return value are not visible in
// this listing.
529 virtual bool tryQuantize(const std::vector<std::vector<float> > &scales,
530 const std::vector<std::vector<int> > &zeropoints, LayerParams& params) CV_OVERRIDE
533 params.set("padding_value", zeropoints[1][0]);
// Creates the WebNN backend node: gathers the operand of every node in `nodes`
// and passes them to the graph builder's Concat.
// The builder is taken from the first node, so `nodes` must not be empty.
538 virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
540 Ptr<WebnnBackendNode> node = nodes[0].dynamicCast<WebnnBackendNode>();
541 auto& webnnGraphBuilder = node->net->builder;
542 std::vector<ml::Operand> inputsOperand;
543 for (int i = 0; i < nodes.size(); i++)
545 inputsOperand.push_back(nodes[i].dynamicCast<WebnnBackendNode>()->operand);
// NOTE(review): `axis` is passed as configured, without normalize_axis() --
// confirm the builder accepts negative values if they can occur.
547 auto operand = webnnGraphBuilder.Concat(inputsOperand.size(), inputsOperand.data(), axis);
548 return Ptr<BackendNode>(new WebnnBackendNode(operand));
// Factory for the concat layer: returns a new ConcatLayerImpl constructed from
// `params`, held as Ptr<ConcatLayer>.
556 Ptr<ConcatLayer> ConcatLayer::create(const LayerParams& params)
558 return Ptr<ConcatLayer>(new ConcatLayerImpl(params));