/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif

#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/eltwise.hpp"
using namespace cv::dnn::cuda4dnn;
#endif

namespace cv
{
namespace dnn
{

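// Element-wise layer: combines two or more input blobs of identical shape into a
// single output by taking the per-element product, (optionally weighted) sum, or
// maximum of the inputs. A following activation layer may be fused in through
// setActivation() and is applied by the CPU path right after each block is computed.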
class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
{
public:
    enum EltwiseOp
    {
        PROD = 0,
        SUM = 1,
        MAX = 2,
    } op;
    std::vector<float> coeffs;

    EltwiseLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        op = SUM;
        if (params.has("operation"))
        {
            String operation = toLowerCase(params.get<String>("operation"));
            if (operation == "prod")
                op = PROD;
            else if (operation == "sum")
                op = SUM;
            else if (operation == "max")
                op = MAX;
            else
                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
        }

        if (params.has("coeff"))
        {
            DictValue paramCoeff = params.get("coeff");
            int i, n = paramCoeff.size();
            coeffs.resize(n);
            for (i = 0; i < n; i++)
            {
                coeffs[i] = paramCoeff.get<float>(i);
            }
        }
    }

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               backendId == DNN_BACKEND_HALIDE ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE &&
                (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()));
    }

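    // All inputs must agree in shape, and the output has that same shape. When
    // coefficients are present they are only valid for SUM and must provide
    // exactly one weight per input.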
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() >= 2);
        CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
        CV_Assert(op == SUM || coeffs.size() == 0);

        for (int i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[0] == inputs[i]);
        }

        outputs.assign(1, inputs[0]);

        return false;
    }

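    // Parallel worker that evaluates the element-wise operation over stripes of the
    // output blob (see ParallelLoopBody). The source mats, the optional SUM
    // coefficients, and an optional fused activation are shared, read-only state.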
    class EltwiseInvoker : public ParallelLoopBody
    {
    public:
        const Mat* srcs;
        int nsrcs;
        Mat* dst;
        const std::vector<float>* coeffs;
        EltwiseOp op;
        int nstripes;
        const ActivationLayer* activ;
        int channels;
        size_t planeSize;

        EltwiseInvoker() : srcs(0), nsrcs(0), dst(0), coeffs(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0) {}

        static void run(const Mat* srcs, int nsrcs, Mat& dst,
                        const std::vector<float>& coeffs, EltwiseOp op,
                        const ActivationLayer* activ, int nstripes)
        {
            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
            CV_CheckTypeEQ(dst.type(), CV_32FC1, "");
            CV_Assert(dst.isContinuous());
            CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs);

            for( int i = 0; i < nsrcs; i++ )
            {
                CV_Assert(srcs[i].size == dst.size &&
                          srcs[i].type() == dst.type() &&
                          srcs[i].isContinuous());
            }

            EltwiseInvoker p;
            p.srcs = srcs;
            p.nsrcs = nsrcs;
            p.dst = &dst;
            p.op = op;
            p.nstripes = nstripes;
            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
            CV_Assert(dst.total() == dst.size[0] * p.channels * p.planeSize);

            // Coefficients that are all 1 degenerate to a plain sum; drop them so
            // the hot loop can take the faster coefficient-free path.
            bool simpleCoeffs = true;
            if( op == SUM && !coeffs.empty() )
            {
                CV_Assert( coeffs.size() == (size_t)nsrcs );

                for( size_t i = 0; i < coeffs.size(); i++ )
                    if( coeffs[i] != 1 )
                    {
                        simpleCoeffs = false;
                        break;
                    }
            }
            p.coeffs = simpleCoeffs ? 0 : &coeffs;
            p.activ = activ;

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

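        // Processes one stripe: a contiguous range of (sample, pixel) positions.
        // Work is split into blocks, each confined to a single plane of a single
        // sample, and the fused activation (if any) is applied to all channels of
        // a block at once.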
        void operator()(const Range& r) const CV_OVERRIDE
        {
            size_t total = dst->size[0]*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int c, j, k, n = nsrcs;
            const float* coeffsptr = coeffs && !coeffs->empty() ? &coeffs->at(0) : 0;
            float* dstptr0 = dst->ptr<float>();
            int blockSize0 = 1 << 12, blockSize;

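            // Walk the stripe in blocks of at most 4096 elements; each block stays
            // within one plane of one sample so the per-channel offsets remain valid.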
            for( size_t ofs = stripeStart; ofs < stripeEnd; ofs += blockSize )
            {
                int sampleIdx = (int)(ofs / planeSize);
                int delta = (int)(ofs - sampleIdx * planeSize);
                blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                if( blockSize <= 0 )
                    break;

                for( c = 0; c < channels; c++ )
                {
                    size_t globalDelta = delta + (sampleIdx*channels + c)*planeSize;
                    const float* srcptr0 = srcs[0].ptr<float>() + globalDelta;
                    float* dstptr = dstptr0 + globalDelta;

                    // After the first pass dst holds the partial result and is
                    // rebound as the left operand (srcptr0) for the remaining inputs.
                    if( op == PROD )
                    {
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = srcptr0[j]*srcptr1[j];
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else if( op == MAX )
                    {
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = std::max(srcptr0[j], srcptr1[j]);
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else if( !coeffsptr )
                    {
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = srcptr0[j] + srcptr1[j];
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else
                    {
                        // Weighted SUM: c0 scales the left operand only on the first
                        // pass; afterwards the accumulated result is taken as-is.
                        float c0 = coeffsptr[0];
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            float c1 = coeffsptr[k];
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = c0*srcptr0[j] + c1*srcptr1[j];
                            }
                            srcptr0 = (const float*)dstptr;
                            c0 = 1;
                        }
                    }
                }

                if( activ )
                {
                    float* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
                    activ->forwardSlice(ptr, ptr, blockSize, planeSize, 0, channels);
                }
            }
        }
    };

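    // OpenCL path. SUM uses a vectorized kernel chain when the batch*channel extent
    // and the plane size are both multiples of 4, and otherwise falls back to
    // generic UMat arithmetic; PROD and MAX are computed with multiply()/max().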
#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        if (inputs_.depth() == CV_16S && op != SUM)
            return false;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        switch (op)
        {
            case SUM:
                {
                    int channels = total(shape(outputs[0]), 0, 2);
                    int plane_size = total(shape(outputs[0]), 2);
                    if (channels % 4 == 0 && plane_size % 4 == 0)
                    {
                        size_t localsize[] = { 128 };
                        size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
                        String opts;
                        if (inputs_.depth() == CV_16S)
                            opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
                        else
                            opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";

                        // Chain of op_sum4 launches: the first (LOOP=0) computes
                        // coeff1*inputs[0] + coeff2*inputs[1]; each later launch
                        // (LOOP>0) accumulates coeff2*inputs[i+1] into outputs[0],
                        // so the second operand must advance with i.
                        for (int i = 0; i < (int)inputs.size() - 1; ++i)
                        {
                            String buildopt = format("-DLOOP=%d", i) + opts;
                            ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
                            int idx = 0;
                            float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
                            float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[i + 1]));
                            kernel.set(idx++, (int)plane_size);
                            kernel.set(idx++, coeff1);
                            kernel.set(idx++, coeff2);
                            kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
                            if (!kernel.run(1, globalsize, localsize, false))
                                return false;
                        }
                    }
                    else
                    {
                        if (inputs_.depth() == CV_16S)
                            return false;

                        float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
                        float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
                        UMat mul0, mul1;
                        multiply(coeff1, inputs[0], mul0);
                        multiply(coeff2, inputs[1], mul1);
                        add(mul0, mul1, outputs[0]);
                        for (int i = 2; i < inputs.size(); ++i)
                        {
                            float coeff = coeffs.empty() ? 1.f : coeffs[i];
                            multiply(coeff, inputs[i], mul0);
                            add(mul0, outputs[0], outputs[0]);
                        }
                    }
                }
                break;
            case PROD:
                multiply(inputs[0], inputs[1], outputs[0]);
                for (int i = 2; i < inputs.size(); ++i)
                    multiply(inputs[i], outputs[0], outputs[0]);
                break;
            case MAX:
                max(inputs[0], inputs[1], outputs[0]);
                for (int i = 2; i < inputs.size(); ++i)
                    max(inputs[i], outputs[0], outputs[0]);
                break;
            default:
                return false;
        }
        return true;
    }
#endif

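    // The fp32 CPU path runs the parallel invoker directly; fp16 inputs (stored as
    // CV_16S) are routed through the generic fallback, which converts to fp32.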
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(outputs.size() == 1);
        const int nstripes = getNumThreads();
        EltwiseInvoker::run(&inputs[0], (int)inputs.size(), outputs[0],
                            coeffs, op, activ.get(), nstripes);
    }

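    // CUDA backend: maps the op onto the cuda4dnn eltwise primitive; the SUM
    // coefficients are forwarded so the kernel can apply the weighted form directly.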
#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto op_ = [this] {
            switch (op) {
            case MAX: return cuda4dnn::EltwiseOpType::MAX;
            case SUM: return cuda4dnn::EltwiseOpType::SUM;
            case PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
            }
            return cuda4dnn::EltwiseOpType::SUM;
        }();

        return make_cuda_node<cuda4dnn::EltwiseOp>(preferableTarget, std::move(context->stream), op_, coeffs);
    }
#endif

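    // Halide backend: builds a single Halide expression over all input buffers;
    // SUM coefficients are folded into the expression as compile-time constants.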
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Expr topExpr;
        std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
        switch (op)
        {
            case SUM:
                if (coeffs.empty())
                {
                    topExpr = inputBuffers[0](x, y, c, n) +
                              inputBuffers[1](x, y, c, n);
                    for (int i = 2; i < inputBuffers.size(); ++i)
                        topExpr += inputBuffers[i](x, y, c, n);
                }
                else
                {
                    topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) +
                              coeffs[1] * inputBuffers[1](x, y, c, n);
                    for (int i = 2; i < inputBuffers.size(); ++i)
                        topExpr += coeffs[i] * inputBuffers[i](x, y, c, n);
                }
                break;
            case PROD:
                topExpr = inputBuffers[0](x, y, c, n) *
                          inputBuffers[1](x, y, c, n);
                for (int i = 2; i < inputBuffers.size(); ++i)
                    topExpr *= inputBuffers[i](x, y, c, n);
                break;
            case MAX:
                topExpr = max(inputBuffers[0](x, y, c, n),
                              inputBuffers[1](x, y, c, n));
                for (int i = 2; i < inputBuffers.size(); ++i)
                    topExpr = max(topExpr, inputBuffers[i](x, y, c, n));
                break;
            default:
                return Ptr<BackendNode>();
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

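    // Inference Engine backend: expressed through the IE Builder API; any SUM
    // coefficients travel as the layer's "coeff" parameter.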
#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::Builder::EltwiseLayer ieLayer(name);

        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));

        if (op == SUM)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
        else if (op == PROD)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
        else if (op == MAX)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
        else
            CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");

        InferenceEngine::Builder::Layer l = ieLayer;
        if (!coeffs.empty())
            l.getParameters()["coeff"] = coeffs;

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        CV_Assert(inputs.size());

        // One pass over every element of every input; accumulate in int64 so large
        // blobs cannot overflow a 32-bit long.
        int64 flops = (int64)inputs.size() * total(inputs[0]);

        return flops;
    }

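    // Fuses a following activation into this layer; a second non-empty activation
    // is refused (returns false), and passing an empty layer detaches the current one.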
    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

    Ptr<ActivationLayer> activ;
};

Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
{
    return Ptr<EltwiseLayer>(new EltwiseLayerImpl(params));
}
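
// Usage sketch (illustrative only, not part of this file): a model importer would
// typically configure the layer through LayerParams, e.g.
//
//     LayerParams lp;
//     lp.set("operation", "sum");                // "prod", "sum" or "max"
//     double w[] = { 0.5, 0.5 };                 // hypothetical per-input weights
//     lp.set("coeff", DictValue::arrayReal(w, 2)); // valid for SUM only
//     Ptr<EltwiseLayer> layer = EltwiseLayer::create(lp);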

}  // namespace dnn
}  // namespace cv