978b1b6f4a86525481204554fd4168dc1b059160
[platform/upstream/opencv.git] / modules / dnn / src / layers / eltwise_layer.cpp
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
14 // Copyright (C) 2017, Intel Corporation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "../precomp.hpp"
44 #include "layers_common.hpp"
45 #include "../op_halide.hpp"
46 #include "../op_inf_engine.hpp"
47 #include "../ie_ngraph.hpp"
48
49 #ifdef HAVE_OPENCL
50 #include "opencl_kernels_dnn.hpp"
51 #endif
52
53 namespace cv
54 {
55 namespace dnn
56 {
57
58 class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
59 {
60 public:
61     enum EltwiseOp
62     {
63         PROD = 0,
64         SUM = 1,
65         MAX = 2,
66         DIV = 3
67     } op;
68     std::vector<float> coeffs;
69     bool variableChannels;
70
71     EltwiseLayerImpl(const LayerParams& params)
72     {
73         setParamsFrom(params);
74         op = SUM;
75         if (params.has("operation"))
76         {
77             String operation = params.get<String>("operation").toLowerCase();
78             if (operation == "prod")
79                 op = PROD;
80             else if (operation == "sum")
81                 op = SUM;
82             else if (operation == "max")
83                 op = MAX;
84             else if (operation == "div")
85                 op = DIV;
86             else
87                 CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
88         }
89
90         if (params.has("coeff"))
91         {
92             DictValue paramCoeff = params.get("coeff");
93             int i, n = paramCoeff.size();
94             coeffs.resize(n);
95             for (i = 0; i < n; i++)
96             {
97                 coeffs[i] = paramCoeff.get<float>(i);
98             }
99         }
100     }
101
102     virtual bool supportBackend(int backendId) CV_OVERRIDE
103     {
104         return backendId == DNN_BACKEND_OPENCV ||
105                backendId == DNN_BACKEND_HALIDE ||
106                ((((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()))
107                 || backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && !variableChannels));
108     }
109
110     bool getMemoryShapes(const std::vector<MatShape> &inputs,
111                          const int requiredOutputs,
112                          std::vector<MatShape> &outputs,
113                          std::vector<MatShape> &internals) const CV_OVERRIDE
114     {
115         CV_Assert(inputs.size() >= 2);
116         CV_Assert(inputs[0].size() >= 2);
117         CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
118         CV_Assert(op == SUM || coeffs.size() == 0);
119
120         int dims = inputs[0].size();
121         // Number of channels in output shape is determined by the first input tensor.
122         int numChannels = inputs[0][1];
123         for (int i = 1; i < inputs.size(); i++)
124         {
125             CV_Assert(inputs[0][0] == inputs[i][0]);
126
127             // It's allowed for channels axis to be different.
128             for (int j = 2; j < dims; j++)
129                 CV_Assert(inputs[0][j] == inputs[i][j]);
130         }
131
132         outputs.assign(1, inputs[0]);
133         outputs[0][1] = numChannels;
134         return false;
135     }
136
137     void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
138     {
139         std::vector<Mat> inputs;
140         inputs_arr.getMatVector(inputs);
141         variableChannels = false;
142         for (int i = 1; i < inputs.size(); ++i)
143         {
144             if (inputs[i].size[1] != inputs[0].size[1])
145             {
146                 variableChannels = true;
147                 break;
148             }
149         }
150     }
151
152
153     class EltwiseInvoker : public ParallelLoopBody
154     {
155     public:
156         std::vector<const Mat*> srcs;
157         int nsrcs;
158         Mat* dst;
159         std::vector<float> coeffs;
160         EltwiseOp op;
161         int nstripes;
162         const ActivationLayer* activ;
163         int channels;
164         size_t planeSize;
165
166         EltwiseInvoker() : nsrcs(0), dst(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0)  {}
167
168         static void run(const Mat* srcs, int nsrcs, Mat& dst,
169                         const std::vector<float>& coeffs, EltwiseOp op,
170                         const ActivationLayer* activ, int nstripes)
171         {
172             CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, ""); CV_CheckTypeEQ(dst.type(), CV_32FC1, ""); CV_Assert(dst.isContinuous());
173             CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs);
174
175             EltwiseInvoker p;
176             p.srcs.resize(nsrcs);
177             p.coeffs = coeffs;
178             for( int i = 0; i < nsrcs; i++ )
179             {
180                 p.srcs[i] = srcs + i;
181                 CV_Assert(srcs[i].type() == dst.type() &&
182                           srcs[i].isContinuous());
183                 // Sort srcs and coefficients in the order by number of channels
184                 for( int j = i; j >= 1 && p.srcs[j - 1]->size[1] < p.srcs[j]->size[1]; j-- )
185                 {
186                     std::swap(p.srcs[j - 1], p.srcs[j]);
187                     if (!p.coeffs.empty())
188                         std::swap(p.coeffs[j - 1], p.coeffs[j]);
189                 }
190             }
191
192             p.nsrcs = nsrcs;
193             p.dst = &dst;
194             p.op = op;
195             p.nstripes = nstripes;
196             p.channels = (dst.dims >= 4 ? dst.size[1] : 1);
197
198             p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
199             CV_Assert(dst.total() == dst.size[0] * p.channels * p.planeSize);
200
201             bool simpleCoeffs = true;
202             if( op == SUM && !coeffs.empty() )
203             {
204                 CV_Assert( coeffs.size() == (size_t)nsrcs );
205
206                 for( size_t i = 0; i < coeffs.size(); i++ )
207                     if( coeffs[i] != 1 )
208                     {
209                         simpleCoeffs = false;
210                         break;
211                     }
212             }
213             if (simpleCoeffs)
214                 p.coeffs.clear();
215             p.activ = activ;
216
217             parallel_for_(Range(0, nstripes), p, nstripes);
218         }
219
220         void operator()(const Range& r) const CV_OVERRIDE
221         {
222             size_t total = dst->size[0]*planeSize;
223             size_t stripeSize = (total + nstripes - 1)/nstripes;
224             size_t stripeStart = r.start*stripeSize;
225             size_t stripeEnd = std::min(r.end*stripeSize, total);
226             int c, j, k, n;
227             const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
228             float* dstptr0 = dst->ptr<float>();
229             int blockSize0 = 1 << 12, blockSize;
230
231             for( size_t ofs = stripeStart; ofs < stripeEnd; ofs += blockSize )
232             {
233                 int sampleIdx = (int)(ofs / planeSize);
234                 int delta = (int)ofs - sampleIdx * planeSize;
235                 blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
236                 if( blockSize <= 0 )
237                     break;
238
239                 for( c = 0; c < channels; c++ )
240                 {
241                     size_t globalDelta = delta + (sampleIdx*channels + c)*planeSize;
242                     const float* srcptr0 = srcs[0]->ptr<float>() + globalDelta;
243                     float* dstptr = dstptr0 + globalDelta;
244
245                     // This code assumes that srcs are sorted in descending order by channels.
246                     for (n = 1; n < nsrcs && c < srcs[n]->size[1]; ++n) {}
247
248                     if (n == 1)
249                     {
250                         if( !coeffsptr )
251                         {
252                             for( j = 0; j < blockSize; j++ )
253                             {
254                                 dstptr[j] = srcptr0[j];
255                             }
256                         }
257                         else
258                         {
259                             float c0 = coeffsptr[0];
260                             for( j = 0; j < blockSize; j++ )
261                             {
262                                 dstptr[j] = c0*srcptr0[j];
263                             }
264                         }
265                     }
266                     else if( op == PROD )
267                     {
268                         for( k = 1; k < n; k++ )
269                         {
270                             const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
271                             for( j = 0; j < blockSize; j++ )
272                             {
273                                 dstptr[j] = srcptr0[j]*srcptr1[j];
274                             }
275                             srcptr0 = (const float*)dstptr;
276                         }
277                     }
278                     else if( op == DIV )
279                     {
280                         for( k = 1; k < n; k++ )
281                         {
282                             const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
283                             for( j = 0; j < blockSize; j++ )
284                             {
285                                 dstptr[j] = srcptr0[j]/srcptr1[j];
286                             }
287                             srcptr0 = (const float*)dstptr;
288                         }
289                     }
290                     else if( op == MAX )
291                     {
292                         for( k = 1; k < n; k++ )
293                         {
294                             const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
295                             for( j = 0; j < blockSize; j++ )
296                             {
297                                 dstptr[j] = std::max(srcptr0[j], srcptr1[j]);
298                             }
299                             srcptr0 = (const float*)dstptr;
300                         }
301                     }
302                     else if( !coeffsptr )
303                     {
304                         for( k = 1; k < n; k++ )
305                         {
306                             const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
307                             for( j = 0; j < blockSize; j++ )
308                             {
309                                 dstptr[j] = srcptr0[j] + srcptr1[j];
310                             }
311                             srcptr0 = (const float*)dstptr;
312                         }
313                     }
314                     else
315                     {
316                         float c0 = coeffsptr[0];
317                         for( k = 1; k < n; k++ )
318                         {
319                             const float* srcptr1 = srcs[k]->ptr<float>() + globalDelta;
320                             float c1 = coeffsptr[k];
321                             for( j = 0; j < blockSize; j++ )
322                             {
323                                 dstptr[j] = c0*srcptr0[j] + c1*srcptr1[j];
324                             }
325                             srcptr0 = (const float*)dstptr;
326                             c0 = 1;
327                         }
328                     }
329                 }
330
331                 if( activ )
332                 {
333                     float* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
334                     activ->forwardSlice(ptr, ptr, blockSize, planeSize, 0, channels);
335                 }
336             }
337         }
338     };
339
340 #ifdef HAVE_OPENCL
341     bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
342     {
343         std::vector<UMat> inputs;
344         std::vector<UMat> outputs;
345
346         if ((inputs_.depth() == CV_16S && op != SUM) || variableChannels)
347             return false;
348
349         inputs_.getUMatVector(inputs);
350         outputs_.getUMatVector(outputs);
351
352         switch (op)
353         {
354             case SUM:
355                 {
356                     int channels = total(shape(outputs[0]), 0, 2);
357                     int plane_size = total(shape(outputs[0]), 2);
358                     if (channels % 4 == 0 && plane_size % 4 == 0)
359                     {
360                         size_t localsize[] = { 128 };
361                         size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
362                         String opts;
363                         if (inputs_.depth() == CV_16S)
364                             opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
365                         else
366                             opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";
367
368                         for (int i = 0; i < (inputs.size() - 1); ++i)
369                         {
370                             String buildopt = format("-DLOOP=%d", i) + opts;
371                             ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
372                             int idx = 0;
373                             UMat inpMat = (i == 0) ? inputs[0] : UMat();
374                             float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
375                             float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
376                             kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
377                             kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1]));
378                             kernel.set(idx++, (int)plane_size);
379                             kernel.set(idx++, (float)coeff1);
380                             kernel.set(idx++, (float)coeff2);
381                             kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
382                             bool ret = kernel.run(1, globalsize, localsize, false);
383                             if (!ret)
384                                 return false;
385                         }
386                     }
387                     else
388                     {
389                         if (inputs_.depth() == CV_16S)
390                             return false;
391
392                         float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
393                         float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
394                         UMat mul0, mul1;
395                         multiply(coeff1, inputs[0], mul0);
396                         multiply(coeff2, inputs[1], mul1);
397                         add(mul0, mul1, outputs[0]);
398                         for (int i = 2; i < inputs.size(); ++i)
399                         {
400                             float coeff = coeffs.empty() ? 1.f : coeffs[i];
401                             multiply(coeff, inputs[i], mul0);
402                             add(mul0, outputs[0], outputs[0]);
403                         }
404                     }
405                 }
406                 break;
407             case PROD:
408                 multiply(inputs[0], inputs[1], outputs[0]);
409                 for (int i = 2; i < inputs.size(); ++i)
410                     multiply(inputs[i], outputs[0], outputs[0]);
411                 break;
412             case DIV:
413                 divide(inputs[0], inputs[1], outputs[0]);
414                 for (int i = 2; i < inputs.size(); ++i)
415                     divide(outputs[0], inputs[i], outputs[0]);
416                 break;
417             case MAX:
418                 max(inputs[0], inputs[1], outputs[0]);
419                 for (int i = 2; i < inputs.size(); ++i)
420                     max(inputs[i], outputs[0], outputs[0]);
421                 break;
422             default:
423                 return false;
424         }
425         return true;
426     }
427 #endif
428
429     void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
430     {
431         CV_TRACE_FUNCTION();
432         CV_TRACE_ARG_VALUE(name, "name", name.c_str());
433
434         CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
435                    forward_ocl(inputs_arr, outputs_arr, internals_arr))
436
437         if (inputs_arr.depth() == CV_16S)
438         {
439             forward_fallback(inputs_arr, outputs_arr, internals_arr);
440             return;
441         }
442
443         std::vector<Mat> inputs, outputs;
444         inputs_arr.getMatVector(inputs);
445         outputs_arr.getMatVector(outputs);
446
447         CV_Assert(outputs.size() == 1);
448         const int nstripes = getNumThreads();
449         EltwiseInvoker::run(&inputs[0], (int)inputs.size(), outputs[0],
450                             coeffs, op, activ.get(), nstripes);
451     }
452
453     virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
454     {
455 #ifdef HAVE_HALIDE
456         Halide::Var x("x"), y("y"), c("c"), n("n");
457         Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
458         Halide::Expr topExpr;
459         std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
460         switch (op)
461         {
462             case SUM:
463                 if (coeffs.empty())
464                 {
465                     topExpr = inputBuffers[0](x, y, c, n) +
466                               inputBuffers[1](x, y, c, n);
467                     for (int i = 2; i < inputBuffers.size(); ++i)
468                         topExpr += inputBuffers[i](x, y, c, n);
469                 }
470                 else
471                 {
472                   topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) +
473                             coeffs[1] * inputBuffers[1](x, y, c, n);
474                   for (int i = 2; i < inputBuffers.size(); ++i)
475                       topExpr += coeffs[i] * inputBuffers[i](x, y, c, n);
476                 }
477                 break;
478             case PROD:
479                 topExpr = inputBuffers[0](x, y, c, n) *
480                           inputBuffers[1](x, y, c, n);
481                 for (int i = 2; i < inputBuffers.size(); ++i)
482                     topExpr *= inputBuffers[i](x, y, c, n);
483                 break;
484             case MAX:
485                 topExpr = max(inputBuffers[0](x, y, c, n),
486                               inputBuffers[1](x, y, c, n));
487                 for (int i = 2; i < inputBuffers.size(); ++i)
488                     topExpr = max(topExpr, inputBuffers[i](x, y, c, n));
489                 break;
490             default:
491                 return Ptr<BackendNode>();
492         }
493         top(x, y, c, n) = topExpr;
494         return Ptr<BackendNode>(new HalideBackendNode(top));
495 #endif  // HAVE_HALIDE
496         return Ptr<BackendNode>();
497     }
498
499 #ifdef HAVE_INF_ENGINE
500     virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
501     {
502         InferenceEngine::Builder::EltwiseLayer ieLayer(name);
503
504         ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));
505
506         if (op == SUM)
507             ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
508         else if (op == PROD)
509             ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
510         else if (op == DIV)
511             ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::DIV);
512         else if (op == MAX)
513             ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
514         else
515             CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
516
517         InferenceEngine::Builder::Layer l = ieLayer;
518         if (!coeffs.empty())
519             l.getParameters()["coeff"] = coeffs;
520
521         return Ptr<BackendNode>(new InfEngineBackendNode(l));
522     }
523 #endif  // HAVE_INF_ENGINE
524
525
526 #ifdef HAVE_DNN_NGRAPH
527     virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
528                                         const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
529     {
530         auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
531         if (!coeffs.empty()) {
532             auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
533             curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
534         }
535
536         for (size_t i = 1; i < nodes.size(); i++)
537         {
538             auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
539             if (!coeffs.empty()) {
540                 auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[i]);
541                 next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
542             }
543             switch (op) {
544                 case SUM:  curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
545                 case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
546                 case DIV:  curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
547                 case MAX:  curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
548                 default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
549             }
550         }
551         return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
552     }
553 #endif  // HAVE_DNN_NGRAPH
554
555     virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
556                            const std::vector<MatShape> &outputs) const CV_OVERRIDE
557     {
558         CV_UNUSED(outputs); // suppress unused variable warning
559         CV_Assert(inputs.size());
560
561         long flops = inputs.size() * total(inputs[0]);
562
563         return flops;
564     }
565
566     bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
567     {
568         if (activ.empty() || layer.empty())
569         {
570             activ = layer;
571             return !activ.empty();
572         }
573         else
574             return false;
575     }
576
577     Ptr<ActivationLayer> activ;
578 };
579
580 Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
581 {
582     return Ptr<EltwiseLayer>(new EltwiseLayerImpl(params));
583 }
584
585 }
586 }