47430680f32d418400d9b299100fbeeebd9aa50a
[platform/upstream/opencv.git] / modules/dnn/src/layers/eltwise_layer.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif

namespace cv
{
namespace dnn
{

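// Element-wise layer: combines two or more input blobs of identical shape element by
// element using product, (optionally weighted) sum, or maximum; an activation layer can
// be fused into the result.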
class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
{
public:
    enum EltwiseOp
    {
        PROD = 0,
        SUM = 1,
        MAX = 2,
    } op;
    std::vector<float> coeffs;

    EltwiseLayerImpl(const LayerParams& params)
    {
        setParamsFrom(params);
        op = SUM;
        if (params.has("operation"))
        {
            String operation = toLowerCase(params.get<String>("operation"));
            if (operation == "prod")
                op = PROD;
            else if (operation == "sum")
                op = SUM;
            else if (operation == "max")
                op = MAX;
            else
                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
        }

        if (params.has("coeff"))
        {
            DictValue paramCoeff = params.get("coeff");
            int i, n = paramCoeff.size();
            coeffs.resize(n);
            for (i = 0; i < n; i++)
            {
                coeffs[i] = paramCoeff.get<float>(i);
            }
        }
    }

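    // Supported backends: OpenCV, Halide, and Inference Engine. With Inference Engine,
    // custom coefficients are rejected when the preferable target is DNN_TARGET_OPENCL.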
    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_HALIDE ||
               (backendId == DNN_BACKEND_INFERENCE_ENGINE &&
                (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty()));
    }

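    // All inputs must share exactly the same shape, which also becomes the output shape.
    // Coefficients are allowed only for SUM and, when present, there must be one per input.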
    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() >= 2);
        CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
        CV_Assert(op == SUM || coeffs.size() == 0);

        for (int i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[0] == inputs[i]);
        }

        outputs.assign(1, inputs[0]);

        return false;
    }

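    // Parallel CPU implementation: the work domain of batch * planeSize positions is split
    // into nstripes stripes; each stripe folds the element-wise operation across all inputs
    // for every channel and optionally applies the fused activation.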
    class EltwiseInvoker : public ParallelLoopBody
    {
    public:
        const Mat* srcs;
        int nsrcs;
        Mat* dst;
        const std::vector<float>* coeffs;
        EltwiseOp op;
        int nstripes;
        const ActivationLayer* activ;
        int channels;
        size_t planeSize;

        EltwiseInvoker() : srcs(0), nsrcs(0), dst(0), coeffs(0), op(PROD), nstripes(0), activ(0), channels(0), planeSize(0)  {}

        static void run(const Mat* srcs, int nsrcs, Mat& dst,
                        const std::vector<float>& coeffs, EltwiseOp op,
                        const ActivationLayer* activ, int nstripes)
        {
            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
            CV_CheckTypeEQ(dst.type(), CV_32FC1, "");
            CV_Assert(dst.isContinuous());
            CV_Assert(coeffs.empty() || coeffs.size() == (size_t)nsrcs);

            for( int i = 0; i < nsrcs; i++ )
            {
                CV_Assert(srcs[i].size == dst.size &&
                          srcs[i].type() == dst.type() &&
                          srcs[i].isContinuous());
            }

            EltwiseInvoker p;
            p.srcs = srcs;
            p.nsrcs = nsrcs;
            p.dst = &dst;
            p.op = op;
            p.nstripes = nstripes;
            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
            CV_Assert(dst.total() == dst.size[0] * p.channels * p.planeSize);

            bool simpleCoeffs = true;
            if( op == SUM && !coeffs.empty() )
            {
                CV_Assert( coeffs.size() == (size_t)nsrcs );

                for( size_t i = 0; i < coeffs.size(); i++ )
                    if( coeffs[i] != 1 )
                    {
                        simpleCoeffs = false;
                        break;
                    }
            }
            p.coeffs = simpleCoeffs ? 0 : &coeffs;
            p.activ = activ;

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

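        // Each stripe is walked in blocks of at most 1 << 12 plane elements; for every block
        // and channel the first input is combined with each remaining input in turn, writing
        // into dst, and the optional fused activation is applied to the completed block.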
        void operator()(const Range& r) const CV_OVERRIDE
        {
            size_t total = dst->size[0]*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            int c, j, k, n = nsrcs;
            const float* coeffsptr = coeffs && !coeffs->empty() ? &coeffs->at(0) : 0;
            float* dstptr0 = dst->ptr<float>();
            int blockSize0 = 1 << 12, blockSize;

            for( size_t ofs = stripeStart; ofs < stripeEnd; ofs += blockSize )
            {
                int sampleIdx = (int)(ofs / planeSize);
                int delta = (int)ofs - sampleIdx * planeSize;
                blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                if( blockSize <= 0 )
                    break;

                for( c = 0; c < channels; c++ )
                {
                    size_t globalDelta = delta + (sampleIdx*channels + c)*planeSize;
                    const float* srcptr0 = srcs[0].ptr<float>() + globalDelta;
                    float* dstptr = dstptr0 + globalDelta;

                    if( op == PROD )
                    {
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = srcptr0[j]*srcptr1[j];
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else if( op == MAX )
                    {
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = std::max(srcptr0[j], srcptr1[j]);
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else if( !coeffsptr )
                    {
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = srcptr0[j] + srcptr1[j];
                            }
                            srcptr0 = (const float*)dstptr;
                        }
                    }
                    else
                    {
                        float c0 = coeffsptr[0];
                        for( k = 1; k < n; k++ )
                        {
                            const float* srcptr1 = srcs[k].ptr<float>() + globalDelta;
                            float c1 = coeffsptr[k];
                            for( j = 0; j < blockSize; j++ )
                            {
                                dstptr[j] = c0*srcptr0[j] + c1*srcptr1[j];
                            }
                            srcptr0 = (const float*)dstptr;
                            c0 = 1;
                        }
                    }
                }

                if( activ )
                {
                    float* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
                    activ->forwardSlice(ptr, ptr, blockSize, planeSize, 0, channels);
                }
            }
        }
    };

#ifdef HAVE_OPENCL
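    // OpenCL path. SUM uses the vectorized op_sum4 kernel when the batch*channel count and
    // the plane size are both multiples of 4, and otherwise falls back to generic UMat
    // arithmetic (float only); PROD and MAX are expressed directly with multiply()/max().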
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        if (inputs_.depth() == CV_16S && op != SUM)
            return false;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        switch (op)
        {
            case SUM:
                {
                    int channels = total(shape(outputs[0]), 0, 2);
                    int plane_size = total(shape(outputs[0]), 2);
                    if (channels % 4 == 0 && plane_size % 4 == 0)
                    {
                        size_t localsize[] = { 128 };
                        size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
                        String opts;
                        if (inputs_.depth() == CV_16S)
                            opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
                        else
                            opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";

                        for (int i = 0; i < (inputs.size() - 1); ++i)
                        {
                            String buildopt = format("-DLOOP=%d", i) + opts;
                            ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
                            int idx = 0;
                            UMat inpMat = (i == 0) ? inputs[0] : UMat();
                            float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
                            float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[i + 1]));
                            kernel.set(idx++, (int)plane_size);
                            kernel.set(idx++, (float)coeff1);
                            kernel.set(idx++, (float)coeff2);
                            kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
                            bool ret = kernel.run(1, globalsize, localsize, false);
                            if (!ret)
                                return false;
                        }
                    }
                    else
                    {
                        if (inputs_.depth() == CV_16S)
                            return false;

                        float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
                        float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
                        UMat mul0, mul1;
                        multiply(coeff1, inputs[0], mul0);
                        multiply(coeff2, inputs[1], mul1);
                        add(mul0, mul1, outputs[0]);
                        for (int i = 2; i < inputs.size(); ++i)
                        {
                            float coeff = coeffs.empty() ? 1.f : coeffs[i];
                            multiply(coeff, inputs[i], mul0);
                            add(mul0, outputs[0], outputs[0]);
                        }
                    }
                }
                break;
            case PROD:
                multiply(inputs[0], inputs[1], outputs[0]);
                for (int i = 2; i < inputs.size(); ++i)
                    multiply(inputs[i], outputs[0], outputs[0]);
                break;
            case MAX:
                max(inputs[0], inputs[1], outputs[0]);
                for (int i = 2; i < inputs.size(); ++i)
                    max(inputs[i], outputs[0], outputs[0]);
                break;
            default:
                return false;
        }
        return true;
    }
#endif

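    // Dispatch: the OpenCL path is tried first on OpenCL targets; half-precision inputs
    // that it cannot handle go through forward_fallback; everything else runs the parallel
    // CPU invoker.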
    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(outputs.size() == 1);
        const int nstripes = getNumThreads();
        EltwiseInvoker::run(&inputs[0], (int)inputs.size(), outputs[0],
                            coeffs, op, activ.get(), nstripes);
    }

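    // Halide backend: folds all inputs into a single expression with the selected
    // operation (a weighted sum when coefficients are present).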
    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Expr topExpr;
        std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
        switch (op)
        {
            case SUM:
                if (coeffs.empty())
                {
                    topExpr = inputBuffers[0](x, y, c, n) +
                              inputBuffers[1](x, y, c, n);
                    for (int i = 2; i < inputBuffers.size(); ++i)
                        topExpr += inputBuffers[i](x, y, c, n);
                }
                else
                {
                    topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) +
                              coeffs[1] * inputBuffers[1](x, y, c, n);
                    for (int i = 2; i < inputBuffers.size(); ++i)
                        topExpr += coeffs[i] * inputBuffers[i](x, y, c, n);
                }
                break;
            case PROD:
                topExpr = inputBuffers[0](x, y, c, n) *
                          inputBuffers[1](x, y, c, n);
                for (int i = 2; i < inputBuffers.size(); ++i)
                    topExpr *= inputBuffers[i](x, y, c, n);
                break;
            case MAX:
                topExpr = max(inputBuffers[0](x, y, c, n),
                              inputBuffers[1](x, y, c, n));
                for (int i = 2; i < inputBuffers.size(); ++i)
                    topExpr = max(topExpr, inputBuffers[i](x, y, c, n));
                break;
            default:
                return Ptr<BackendNode>();
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_INF_ENGINE
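    // Inference Engine backend: maps the operation to the corresponding IE eltwise type
    // and forwards any coefficients through the layer parameters.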
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::Builder::EltwiseLayer ieLayer(name);

        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));

        if (op == SUM)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
        else if (op == PROD)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
        else if (op == MAX)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
        else
            CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");

        InferenceEngine::Builder::Layer l = ieLayer;
        if (!coeffs.empty())
            l.getParameters()["coeff"] = coeffs;

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE

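    // Rough cost estimate: one operation per element of every input blob.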
    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        CV_Assert(inputs.size());

        long flops = inputs.size() * total(inputs[0]);

        return flops;
    }

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

    Ptr<ActivationLayer> activ;
};

Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
{
    return Ptr<EltwiseLayer>(new EltwiseLayerImpl(params));
}
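
// A minimal usage sketch (not part of the library sources): building the layer from
// LayerParams with the parameter names parsed by the constructor above. The layer name
// and the coefficient values are hypothetical.
//
//     LayerParams lp;
//     lp.name = "eltwise_sum";                       // hypothetical layer name
//     lp.type = "Eltwise";
//     lp.set("operation", "sum");                    // "prod", "sum" or "max"
//     double w[] = {1.0, 0.5};                       // optional per-input weights (SUM only)
//     lp.set("coeff", DictValue::arrayReal(w, 2));
//     Ptr<EltwiseLayer> layer = EltwiseLayer::create(lp);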

}
}