/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif

#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/eltwise.hpp"
using namespace cv::dnn::cuda4dnn;
#endif

namespace cv
{
namespace dnn
{

class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
{
public:
    enum EltwiseOp
    {
        PROD = 0,
        SUM = 1,
        MAX = 2,
        DIV = 3
    } op;
    std::vector<float> coeffs;

    enum OutputChannelsMode
    {
        ELTWISE_CHANNNELS_SAME = 0,              //!< number of channels from inputs must be the same and equal to output's number of channels
        ELTWISE_CHANNNELS_INPUT_0,               //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to number of channels of first input
                                                 //!< number of channels of other inputs must not be greater than number of channels of first input
        ELTWISE_CHANNNELS_INPUT_0_TRUNCATE,      //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to number of channels of first input
                                                 //!< there is no restriction on number of channels of other inputs
                                                 //!< extra channels of other inputs are ignored
        ELTWISE_CHANNNELS_USE_MAX,               //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to maximal number of input channels
                                                 //!< @note supported operation: `SUM`
    } channelsModeInput;
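
    // Illustrative example (not in the original source): given three inputs with
    // 4, 6 and 4 channels respectively, the modes behave as follows:
    //   ELTWISE_CHANNNELS_SAME             -> error (channel counts differ)
    //   ELTWISE_CHANNNELS_INPUT_0          -> error (6 > 4 channels of the first input)
    //   ELTWISE_CHANNNELS_INPUT_0_TRUNCATE -> 4 output channels; the last 2 channels
    //                                         of the 6-channel input are ignored
    //   ELTWISE_CHANNNELS_USE_MAX          -> 6 output channels (SUM operation only)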


    mutable OutputChannelsMode channelsMode;     //!< "optimized" channels mode (switches to ELTWISE_CHANNNELS_SAME when all inputs have the same number of channels)
    mutable /*size_t*/int outputChannels;

    EltwiseLayerImpl(const LayerParams& params)
        : outputChannels(0)
    {
        setParamsFrom(params);
        op = SUM;
        if (params.has("operation"))
        {
            String operation = toLowerCase(params.get<String>("operation"));
            if (operation == "prod")
                op = PROD;
            else if (operation == "sum")
                op = SUM;
            else if (operation == "max")
                op = MAX;
            else if (operation == "div")
                op = DIV;
            else
                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
        }

        if (params.has("coeff"))
        {
            DictValue paramCoeff = params.get("coeff");
            int i, n = paramCoeff.size();
            coeffs.resize(n);
            for (i = 0; i < n; i++)
            {
                coeffs[i] = paramCoeff.get<float>(i);
            }
        }

        channelsModeInput = ELTWISE_CHANNNELS_SAME;
        if (params.has("output_channels_mode"))
        {
            String v = toLowerCase(params.get<String>("output_channels_mode"));
            if (v == "same")
            {
                channelsModeInput = ELTWISE_CHANNNELS_SAME;
            }
            else if (v == "input_0")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
            }
            else if (v == "input_0_truncate")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
            }
            else if (v == "max_input_channels")
            {
                channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
                if (op != SUM)
                    CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
            }
            else
                CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
        }
        channelsMode = channelsModeInput;

        // TODO: add checks for other unknown options
    }
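
    // A minimal sketch (illustrative, not in the original source) of the parameters
    // this constructor consumes:
    //   operation:            "prod" | "sum" | "max" | "div"      (default: "sum")
    //   coeff:                optional per-input scale factors, SUM only, e.g. [0.5, 0.5]
    //   output_channels_mode: "same" (default) | "input_0" | "input_0_truncate"
    //                         | "max_input_channels"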

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && op != DIV) ||  // TODO: not implemented, see PR #15811
               (((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty())) ||
                 backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && channelsMode == ELTWISE_CHANNNELS_SAME);
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() >= 2);
        CV_Assert(inputs[0].size() >= 2);
        CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
        CV_Assert(op == SUM || coeffs.size() == 0);

        int dims = inputs[0].size();
        // The number of output channels is determined by the channels mode;
        // for all modes except ELTWISE_CHANNNELS_USE_MAX it matches the first input.
        bool variableChannels = false;
        int numChannels = inputs[0][1];
        for (size_t i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[0][0] == inputs[i][0]);  // batch sizes are equal

            int input_channels = inputs[i][1];
            if (numChannels != input_channels)
                variableChannels = true;

            if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
            {
                CV_Assert(numChannels == input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
            {
                CV_Assert(numChannels >= input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
            {
                // nothing to check
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
            {
                numChannels = std::max(numChannels, input_channels);
            }
            else
            {
                CV_Assert(0 && "Internal error");
            }

            for (int j = 2; j < dims; j++)
                CV_Assert(inputs[0][j] == inputs[i][j]);
        }

        channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
        outputChannels = numChannels;

        outputs.assign(1, inputs[0]);
        outputs[0][1] = numChannels;
        return false;
    }
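
    // Illustrative example (not in the original source): with channels mode
    // "max_input_channels" and inputs of shapes [1, 4, 16, 16] and [1, 6, 16, 16],
    // the output shape is [1, 6, 16, 16]; under the default "same" mode the same
    // pair of inputs would be rejected by the assertion above.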


    class EltwiseInvoker : public ParallelLoopBody
    {
        EltwiseLayerImpl& self;
        std::vector<const Mat*> srcs;
        std::vector<int> srcNumChannels;
        int nsrcs;
        Mat* dst;
        std::vector<float> coeffs;
        int nstripes;
        const ActivationLayer* activ;
        int channels;
        size_t planeSize;

        EltwiseInvoker(EltwiseLayerImpl& self_)
            : self(self_)
            , nsrcs(0), dst(0), nstripes(0), activ(0), channels(0)
            , planeSize(0)
        {}

    public:
        static void run(EltwiseLayerImpl& self,
                        const Mat* srcs, int nsrcs, Mat& dst,
                        int nstripes)
        {
            const EltwiseOp op = self.op;
            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
            CV_CheckTypeEQ(dst.type(), CV_32FC1, "");
            CV_Assert(dst.isContinuous());
            CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
            CV_CheckGE(nsrcs, 2, "");

            CV_Assert(self.outputChannels == dst.size[1]);

            EltwiseInvoker p(self);
            p.srcs.resize(nsrcs);
            p.srcNumChannels.resize(nsrcs);
            p.coeffs = self.coeffs;  // may be reordered together with srcs below

            bool sortInputs = false;
            for( int i = 0; i < nsrcs; i++ )
            {
                p.srcs[i] = &srcs[i];
                CV_CheckEQ(srcs[i].dims, dst.dims, "");
                CV_Assert(srcs[i].isContinuous());
                CV_Assert(srcs[i].type() == dst.type());
                p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;

                if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
                {
                    CV_Assert(srcs[i].size == dst.size);
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
                {
                    CV_Assert(op == SUM);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else
                {
                    CV_Assert(0 && "Internal error");
                }

                if (sortInputs)
                {
                    // Insertion sort: keep srcs (and their coefficients) in descending
                    // order by effective number of channels, so that for any output
                    // channel the inputs that contain it form a prefix of the list.
                    for (int j = i; j >= 1; j--)
                    {
                        if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
                        {
                            std::swap(p.srcs[j - 1], p.srcs[j]);
                            std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
                            if (!p.coeffs.empty())
                                std::swap(p.coeffs[j - 1], p.coeffs[j]);
                        }
                        else
                            break;
                    }
                }
            }

            p.nsrcs = nsrcs;
            p.dst = &dst;
            p.nstripes = nstripes;
            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
            CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");

            bool simpleCoeffs = true;
            if (op == SUM && !p.coeffs.empty())
            {
                CV_CheckEQ(p.coeffs.size(), (size_t)nsrcs, "");

                for (size_t i = 0; i < p.coeffs.size(); i++)
                {
                    if (p.coeffs[i] != 1)
                    {
                        simpleCoeffs = false;
                        break;
                    }
                }
            }
            if (simpleCoeffs)
                p.coeffs.clear();
            p.activ = self.activ.get();

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range& r) const CV_OVERRIDE
        {
            const EltwiseOp op = self.op;
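            // The total work (batch * planeSize elements per channel set) is split
            // into `nstripes` stripes; this call processes stripes [r.start, r.end).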
            size_t total = dst->size[0]*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
            float* dstptr0 = dst->ptr<float>();
            int blockSize0 = 1 << 12;

            for (size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / planeSize);
                int delta = (int)(ofs - sampleIdx * planeSize);
                int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                if( blockSize <= 0 )
                    break;
                ofs += blockSize;

                for (int c = 0; c < channels; c++)
                {
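                    // NCHW layout: the block for (sampleIdx, c) starts at flat offset
                    // (sampleIdx*channels + c)*planeSize, plus `delta` within the plane.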
                    size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
                    float* dstptr = dstptr0 + dstIdx;

                    // process first two inputs
                    {
                        const float* srcptr0 = srcs[0]->ptr<float>() + dstIdx;

                        const int inputIdx = 1;
                        int src1_channels = srcNumChannels[inputIdx];
                        if (c >= src1_channels)
                        {
                            // no data from second input
                            if (!coeffsptr || coeffsptr[0] == 1.0f)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j];
                                }
                            }
                            else
                            {
                                float c0 = coeffsptr[0];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = c0*srcptr0[j];
                                }
                            }
                        }
                        else
                        {
                            size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
                            const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                            if (op == PROD)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] * srcptrI[j];
                                }
                            }
                            else if (op == DIV)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] / srcptrI[j];
                                }
                            }
                            else if (op == MAX)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
                                }
                            }
                            else if (op == SUM)
                            {
                                if (!coeffsptr || (coeffsptr[0] == 1.0f && coeffsptr[1] == 1.0f))
                                {
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = srcptr0[j] + srcptrI[j];
                                    }
                                }
                                else
                                {
                                    float c0 = coeffsptr[0];
                                    float c1 = coeffsptr[1];
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
                                    }
                                }
                            }
                            else
                                CV_Error(Error::StsInternal, "");
                        }
                    }

                    // aggregate remaining inputs (3rd and beyond)
                    for (int inputIdx = 2; inputIdx < nsrcs; inputIdx++)
                    {
                        int srcI_channels = srcNumChannels[inputIdx];
                        if (c >= srcI_channels)
                            continue;  // no data from this input
                        size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
                        const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                        if (op == PROD)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] *= srcptrI[j];
                            }
                        }
                        else if (op == DIV)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] /= srcptrI[j];
                            }
                        }
                        else if (op == MAX)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] = std::max(dstptr[j], srcptrI[j]);
                            }
                        }
                        else if (op == SUM)
                        {
                            if (!coeffsptr || coeffsptr[inputIdx] == 1.0f)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += srcptrI[j];
                                }
                            }
                            else
                            {
                                float cI = coeffsptr[inputIdx];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += cI * srcptrI[j];
                                }
                            }
                        }
                        else
                            CV_Error(Error::StsInternal, "");
                    }
                }

                if( activ )
                {
                    float* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
                    activ->forwardSlice(ptr, ptr, blockSize, planeSize, 0, channels);
                }
            }
        }
    };

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
            return false;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        switch (op)
        {
            case SUM:
                {
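                    // NB: "channels" below is the product of dims [0, 2), i.e. N*C;
                    // plane_size is the product of the remaining dims (H*W for 4D blobs).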
                    int channels = total(shape(outputs[0]), 0, 2);
                    int plane_size = total(shape(outputs[0]), 2);
                    if (channels % 4 == 0 && plane_size % 4 == 0)
                    {
                        size_t localsize[] = { 128 };
                        size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
                        String opts;
                        if (inputs_.depth() == CV_16S)
                            opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
                        else
                            opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";

                        for (int i = 0; i < (int)inputs.size() - 1; ++i)
                        {
                            String buildopt = format("-DLOOP=%d", i) + opts;
                            ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
                            int idx = 0;
                            float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
                            float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1]));
                            kernel.set(idx++, (int)plane_size);
                            kernel.set(idx++, (float)coeff1);
                            kernel.set(idx++, (float)coeff2);
                            kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
                            bool ret = kernel.run(1, globalsize, localsize, false);
                            if (!ret)
                                return false;
                        }
                    }
                    else
                    {
                        if (inputs_.depth() == CV_16S)
                            return false;

                        float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
                        float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
                        UMat mul0, mul1;
                        multiply(coeff1, inputs[0], mul0);
                        multiply(coeff2, inputs[1], mul1);
                        add(mul0, mul1, outputs[0]);
                        for (size_t i = 2; i < inputs.size(); ++i)
                        {
                            float coeff = coeffs.empty() ? 1.f : coeffs[i];
                            multiply(coeff, inputs[i], mul0);
                            add(mul0, outputs[0], outputs[0]);
                        }
                    }
                }
                break;
            case PROD:
                multiply(inputs[0], inputs[1], outputs[0]);
                for (size_t i = 2; i < inputs.size(); ++i)
                    multiply(inputs[i], outputs[0], outputs[0]);
                break;
            case DIV:
                divide(inputs[0], inputs[1], outputs[0]);
                for (size_t i = 2; i < inputs.size(); ++i)
                    divide(outputs[0], inputs[i], outputs[0]);
                break;
            case MAX:
                max(inputs[0], inputs[1], outputs[0]);
                for (size_t i = 2; i < inputs.size(); ++i)
                    max(inputs[i], outputs[0], outputs[0]);
                break;
            default:
                return false;
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(outputs.size() == 1);
        const int nstripes = getNumThreads();
        EltwiseInvoker::run(*this,
                            &inputs[0], (int)inputs.size(), outputs[0],
                            nstripes);
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto op_ = [this] {
            switch (op) {
            case MAX: return cuda4dnn::EltwiseOpType::MAX;
            case SUM: return cuda4dnn::EltwiseOpType::SUM;
            case PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
            case DIV: return cuda4dnn::EltwiseOpType::DIV;
            }
            return cuda4dnn::EltwiseOpType::SUM;
        }();

        return make_cuda_node<cuda4dnn::EltwiseOp>(preferableTarget, std::move(context->stream), op_, coeffs);
    }
#endif

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Expr topExpr;
        std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
        switch (op)
        {
            case SUM:
                if (coeffs.empty())
                {
                    topExpr = inputBuffers[0](x, y, c, n) +
                              inputBuffers[1](x, y, c, n);
                    for (size_t i = 2; i < inputBuffers.size(); ++i)
                        topExpr += inputBuffers[i](x, y, c, n);
                }
                else
                {
                    topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) +
                              coeffs[1] * inputBuffers[1](x, y, c, n);
                    for (size_t i = 2; i < inputBuffers.size(); ++i)
                        topExpr += coeffs[i] * inputBuffers[i](x, y, c, n);
                }
                break;
            case PROD:
                topExpr = inputBuffers[0](x, y, c, n) *
                          inputBuffers[1](x, y, c, n);
                for (size_t i = 2; i < inputBuffers.size(); ++i)
                    topExpr *= inputBuffers[i](x, y, c, n);
                break;
            case DIV:
                topExpr = inputBuffers[0](x, y, c, n) /
                          inputBuffers[1](x, y, c, n);
                for (size_t i = 2; i < inputBuffers.size(); ++i)
                    topExpr /= inputBuffers[i](x, y, c, n);
                break;
            case MAX:
                topExpr = max(inputBuffers[0](x, y, c, n),
                              inputBuffers[1](x, y, c, n));
                for (size_t i = 2; i < inputBuffers.size(); ++i)
                    topExpr = max(topExpr, inputBuffers[i](x, y, c, n));
                break;
            default:
                return Ptr<BackendNode>();
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::Builder::EltwiseLayer ieLayer(name);

        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));

        if (op == SUM)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
        else if (op == PROD)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
        else if (op == DIV)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::DIV);
        else if (op == MAX)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
        else
            CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");

        InferenceEngine::Builder::Layer l = ieLayer;
        if (!coeffs.empty())
            l.getParameters()["coeff"] = coeffs;

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE


#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        if (!coeffs.empty()) {
            auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
            curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
        }

        for (size_t i = 1; i < nodes.size(); i++)
        {
            auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
            if (!coeffs.empty()) {
                auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[i]);
                next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
            }
            switch (op) {
                case SUM:  curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
                case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
                case DIV:  curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
                case MAX:  curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
                default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
            }
        }
        return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
    }
#endif  // HAVE_DNN_NGRAPH

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        CV_Assert(inputs.size());

        // FIXIT: handle inputs with different number of channels
        int64 flops = (int64)inputs.size() * total(inputs[0]);

        return flops;
    }

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

    Ptr<ActivationLayer> activ;
};

Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
{
    return Ptr<EltwiseLayer>(new EltwiseLayerImpl(params));
}
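
/*  Usage sketch (illustrative only; names and values here are hypothetical and not
    part of the original file):

        cv::dnn::LayerParams lp;
        lp.type = "Eltwise";
        lp.name = "sum_layer";
        lp.set("operation", "sum");   // one of "prod", "sum", "max", "div"
        double coeffsData[2] = {0.5, 0.5};
        lp.set("coeff", cv::dnn::DictValue::arrayReal(coeffsData, 2));  // SUM only
        cv::Ptr<cv::dnn::EltwiseLayer> layer = cv::dnn::EltwiseLayer::create(lp);
*/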

}
}