/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Copyright (C) 2017, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#include "../precomp.hpp"
#include "layers_common.hpp"
#include "../op_cuda.hpp"
#include "../op_halide.hpp"
#include "../op_inf_engine.hpp"
#include "../ie_ngraph.hpp"

#ifdef HAVE_OPENCL
#include "opencl_kernels_dnn.hpp"
#endif

#ifdef HAVE_CUDA
#include "../cuda4dnn/primitives/eltwise.hpp"
using namespace cv::dnn::cuda4dnn;
#endif

namespace cv
{
namespace dnn
{

class EltwiseLayerImpl CV_FINAL : public EltwiseLayer
{
public:
    enum EltwiseOp
    {
        PROD = 0,
        SUM = 1,
        MAX = 2,
        DIV = 3
    } op;
    std::vector<float> coeffs;

    enum OutputChannelsMode
    {
        ELTWISE_CHANNNELS_SAME = 0,              //!< number of channels from inputs must be the same and equal to output's number of channels
        ELTWISE_CHANNNELS_INPUT_0,               //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to number of channels of first input
                                                 //!< number of channels of other inputs must not be greater than number of channels of first input
        ELTWISE_CHANNNELS_INPUT_0_TRUNCATE,      //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to number of channels of first input
                                                 //!< there is no restriction on number of channels of other inputs
                                                 //!< extra channels of other inputs are ignored
        ELTWISE_CHANNNELS_USE_MAX,               //!< number of channels from inputs may be different,
                                                 //!< output's number of channels is equal to maximal number of input channels
                                                 //!< @note supported operation: `SUM`
    } channelsModeInput;
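
    // Illustrative example (not in the original source): given three inputs with
    // 4, 6 and 4 channels respectively, the modes behave as follows:
    //   ELTWISE_CHANNNELS_SAME             -> error (channel counts differ)
    //   ELTWISE_CHANNNELS_INPUT_0          -> error (6 > 4 channels of the first input)
    //   ELTWISE_CHANNNELS_INPUT_0_TRUNCATE -> 4 output channels; the last 2 channels
    //                                         of the 6-channel input are ignored
    //   ELTWISE_CHANNNELS_USE_MAX          -> 6 output channels (SUM operation only)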


    mutable OutputChannelsMode channelsMode;     //!< "optimized" channels mode (switches to ELTWISE_CHANNNELS_SAME when all inputs have the same number of channels)
    mutable /*size_t*/int outputChannels;

    EltwiseLayerImpl(const LayerParams& params)
        : outputChannels(0)
    {
        setParamsFrom(params);
        op = SUM;
        if (params.has("operation"))
        {
            String operation = toLowerCase(params.get<String>("operation"));
            if (operation == "prod")
                op = PROD;
            else if (operation == "sum")
                op = SUM;
            else if (operation == "max")
                op = MAX;
            else if (operation == "div")
                op = DIV;
            else
                CV_Error(cv::Error::StsBadArg, "Unknown operation type \"" + operation + "\"");
        }

        if (params.has("coeff"))
        {
            DictValue paramCoeff = params.get("coeff");
            int i, n = paramCoeff.size();
            coeffs.resize(n);
            for (i = 0; i < n; i++)
            {
                coeffs[i] = paramCoeff.get<float>(i);
            }
        }

        channelsModeInput = ELTWISE_CHANNNELS_SAME;
        if (params.has("output_channels_mode"))
        {
            String v = toLowerCase(params.get<String>("output_channels_mode"));
            if (v == "same")
            {
                channelsModeInput = ELTWISE_CHANNNELS_SAME;
            }
            else if (v == "input_0")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0;
            }
            else if (v == "input_0_truncate")
            {
                channelsModeInput = ELTWISE_CHANNNELS_INPUT_0_TRUNCATE;
            }
            else if (v == "max_input_channels")
            {
                channelsModeInput = ELTWISE_CHANNNELS_USE_MAX;
                if (op != SUM)
                    CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") 'max' channels mode is limited to SUM operation only");
            }
            else
                CV_Error(cv::Error::StsBadArg, "[" + type + "]:(" + name + ") unknown channels mode: \"" + v + "\"");
        }
        channelsMode = channelsModeInput;

        // TODO: add checks for other unknown options
    }
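
    // A minimal sketch (illustrative, not in the original source) of the parameters
    // this constructor consumes:
    //   operation:            "prod" | "sum" | "max" | "div"      (default: "sum")
    //   coeff:                optional per-input scale factors, SUM only, e.g. [0.5, 0.5]
    //   output_channels_mode: "same" (default) | "input_0" | "input_0_truncate"
    //                         | "max_input_channels"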

    virtual bool supportBackend(int backendId) CV_OVERRIDE
    {
        return backendId == DNN_BACKEND_OPENCV ||
               backendId == DNN_BACKEND_CUDA ||
               (backendId == DNN_BACKEND_HALIDE && op != DIV) ||  // TODO: not implemented, see PR #15811
               (((backendId == DNN_BACKEND_INFERENCE_ENGINE_NN_BUILDER_2019 && (preferableTarget != DNN_TARGET_OPENCL || coeffs.empty())) ||
                 backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH) && channelsMode == ELTWISE_CHANNNELS_SAME);
    }

    bool getMemoryShapes(const std::vector<MatShape> &inputs,
                         const int requiredOutputs,
                         std::vector<MatShape> &outputs,
                         std::vector<MatShape> &internals) const CV_OVERRIDE
    {
        CV_Assert(inputs.size() >= 2);
        CV_Assert(inputs[0].size() >= 2);
        CV_Assert(coeffs.size() == 0 || coeffs.size() == inputs.size());
        CV_Assert(op == SUM || coeffs.size() == 0);

        int dims = inputs[0].size();
        // The number of output channels is determined by the channels mode;
        // for all modes except ELTWISE_CHANNNELS_USE_MAX it matches the first input.
        bool variableChannels = false;
        int numChannels = inputs[0][1];
        for (size_t i = 1; i < inputs.size(); i++)
        {
            CV_Assert(inputs[0][0] == inputs[i][0]);  // batch sizes are equal

            int input_channels = inputs[i][1];
            if (numChannels != input_channels)
                variableChannels = true;

            if (channelsModeInput == ELTWISE_CHANNNELS_SAME)
            {
                CV_Assert(numChannels == input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0)
            {
                CV_Assert(numChannels >= input_channels);
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
            {
                // nothing to check
            }
            else if (channelsModeInput == ELTWISE_CHANNNELS_USE_MAX)
            {
                numChannels = std::max(numChannels, input_channels);
            }
            else
            {
                CV_Assert(0 && "Internal error");
            }

            for (int j = 2; j < dims; j++)
                CV_Assert(inputs[0][j] == inputs[i][j]);
        }

        channelsMode = variableChannels ? channelsModeInput : ELTWISE_CHANNNELS_SAME;
        outputChannels = numChannels;

        outputs.assign(1, inputs[0]);
        outputs[0][1] = numChannels;
        return false;
    }
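
    // Illustrative example (not in the original source): with channels mode
    // "max_input_channels" and inputs of shapes [1, 4, 16, 16] and [1, 6, 16, 16],
    // the output shape is [1, 6, 16, 16]; under the default "same" mode the same
    // pair of inputs would be rejected by the assertion above.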


    class EltwiseInvoker : public ParallelLoopBody
    {
        EltwiseLayerImpl& self;
        std::vector<const Mat*> srcs;
        std::vector<int> srcNumChannels;
        int nsrcs;
        Mat* dst;
        std::vector<float> coeffs;
        int nstripes;
        const ActivationLayer* activ;
        int channels;
        size_t planeSize;

        EltwiseInvoker(EltwiseLayerImpl& self_)
            : self(self_)
            , nsrcs(0), dst(0), nstripes(0), activ(0), channels(0)
            , planeSize(0)
        {}

    public:
        static void run(EltwiseLayerImpl& self,
                        const Mat* srcs, int nsrcs, Mat& dst,
                        int nstripes)
        {
            const EltwiseOp op = self.op;
            CV_Check(dst.dims, 1 < dst.dims && dst.dims <= 5, "");
            CV_CheckTypeEQ(dst.type(), CV_32FC1, "");
            CV_Assert(dst.isContinuous());
            CV_Assert(self.coeffs.empty() || self.coeffs.size() == (size_t)nsrcs);
            CV_CheckGE(nsrcs, 2, "");

            CV_Assert(self.outputChannels == dst.size[1]);

            EltwiseInvoker p(self);
            p.srcs.resize(nsrcs);
            p.srcNumChannels.resize(nsrcs);
            p.coeffs = self.coeffs;  // may be reordered together with srcs below

            bool sortInputs = false;
            for( int i = 0; i < nsrcs; i++ )
            {
                p.srcs[i] = &srcs[i];
                CV_CheckEQ(srcs[i].dims, dst.dims, "");
                CV_Assert(srcs[i].isContinuous());
                CV_Assert(srcs[i].type() == dst.type());
                p.srcNumChannels[i] = (srcs[i].dims >= 4) ? srcs[i].size[1] : 1;

                if (self.channelsMode == ELTWISE_CHANNNELS_SAME)
                {
                    CV_Assert(srcs[i].size == dst.size);
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_INPUT_0_TRUNCATE)
                {
                    if (i == 0)
                        CV_Assert(srcs[0].size == dst.size);
                    sortInputs = true;
                }
                else if (self.channelsMode == ELTWISE_CHANNNELS_USE_MAX)
                {
                    CV_Assert(op == SUM);
                    CV_Assert(self.outputChannels >= p.srcNumChannels[i]);
                    sortInputs = true;
                }
                else
                {
                    CV_Assert(0 && "Internal error");
                }

                if (sortInputs)
                {
                    // Insertion sort: keep srcs (and their coefficients) in descending
                    // order by effective number of channels, so that for any output
                    // channel the inputs that contain it form a prefix of the list.
                    for (int j = i; j >= 1; j--)
                    {
                        if (std::min(self.outputChannels, p.srcs[j - 1]->size[1]) < std::min(self.outputChannels, p.srcs[j]->size[1]))
                        {
                            std::swap(p.srcs[j - 1], p.srcs[j]);
                            std::swap(p.srcNumChannels[j - 1], p.srcNumChannels[j]);
                            if (!p.coeffs.empty())
                                std::swap(p.coeffs[j - 1], p.coeffs[j]);
                        }
                        else
                            break;
                    }
                }
            }

            p.nsrcs = nsrcs;
            p.dst = &dst;
            p.nstripes = nstripes;
            p.channels = (dst.dims >= 4 ? dst.size[1] : 1);

            p.planeSize = dst.total(dst.dims >= 4 ? 2 : 1);
            CV_CheckEQ(dst.total(), dst.size[0] * p.channels * p.planeSize, "");

            bool simpleCoeffs = true;
            if (op == SUM && !p.coeffs.empty())
            {
                CV_CheckEQ(p.coeffs.size(), (size_t)nsrcs, "");

                for (size_t i = 0; i < p.coeffs.size(); i++)
                {
                    if (p.coeffs[i] != 1)
                    {
                        simpleCoeffs = false;
                        break;
                    }
                }
            }
            if (simpleCoeffs)
                p.coeffs.clear();
            p.activ = self.activ.get();

            parallel_for_(Range(0, nstripes), p, nstripes);
        }

        void operator()(const Range& r) const CV_OVERRIDE
        {
            const EltwiseOp op = self.op;
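            // The total work (batch * planeSize elements per channel set) is split
            // into `nstripes` stripes; this call processes stripes [r.start, r.end).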
            size_t total = dst->size[0]*planeSize;
            size_t stripeSize = (total + nstripes - 1)/nstripes;
            size_t stripeStart = r.start*stripeSize;
            size_t stripeEnd = std::min(r.end*stripeSize, total);
            const float* coeffsptr = !coeffs.empty() ? &coeffs[0] : 0;
            float* dstptr0 = dst->ptr<float>();
            int blockSize0 = 1 << 12;

            for (size_t ofs = stripeStart; ofs < stripeEnd; )
            {
                int sampleIdx = (int)(ofs / planeSize);
                int delta = (int)(ofs - sampleIdx * planeSize);
                int blockSize = std::min(blockSize0, std::min((int)(stripeEnd - ofs), (int)planeSize - delta));
                if( blockSize <= 0 )
                    break;
                ofs += blockSize;

                for (int c = 0; c < channels; c++)
                {
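                    // NCHW layout: the block for (sampleIdx, c) starts at flat offset
                    // (sampleIdx*channels + c)*planeSize, plus `delta` within the plane.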
                    size_t dstIdx = delta + (sampleIdx*channels + c)*planeSize;
                    float* dstptr = dstptr0 + dstIdx;

                    // process first two inputs
                    {
                        const float* srcptr0 = srcs[0]->ptr<float>() + dstIdx;

                        const int inputIdx = 1;
                        int src1_channels = srcNumChannels[inputIdx];
                        if (c >= src1_channels)
                        {
                            // no data from second input
                            if (!coeffsptr || coeffsptr[0] == 1.0f)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j];
                                }
                            }
                            else
                            {
                                float c0 = coeffsptr[0];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = c0*srcptr0[j];
                                }
                            }
                        }
                        else
                        {
                            size_t srcIdx = delta + (sampleIdx * src1_channels + c) * planeSize;
                            const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                            if (op == PROD)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] * srcptrI[j];
                                }
                            }
                            else if (op == DIV)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = srcptr0[j] / srcptrI[j];
                                }
                            }
                            else if (op == MAX)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] = std::max(srcptr0[j], srcptrI[j]);
                                }
                            }
                            else if (op == SUM)
                            {
                                if (!coeffsptr || (coeffsptr[0] == 1.0f && coeffsptr[1] == 1.0f))
                                {
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = srcptr0[j] + srcptrI[j];
                                    }
                                }
                                else
                                {
                                    float c0 = coeffsptr[0];
                                    float c1 = coeffsptr[1];
                                    for (int j = 0; j < blockSize; j++)
                                    {
                                        dstptr[j] = c0*srcptr0[j] + c1*srcptrI[j];
                                    }
                                }
                            }
                            else
                                CV_Error(Error::StsInternal, "");
                        }
                    }

                    // aggregate remaining inputs (3rd and beyond)
                    for (int inputIdx = 2; inputIdx < nsrcs; inputIdx++)
                    {
                        int srcI_channels = srcNumChannels[inputIdx];
                        if (c >= srcI_channels)
                            continue;  // no data from this input
                        size_t srcIdx = delta + (sampleIdx * srcI_channels + c) * planeSize;
                        const float* srcptrI = srcs[inputIdx]->ptr<float>() + srcIdx;

                        if (op == PROD)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] *= srcptrI[j];
                            }
                        }
                        else if (op == DIV)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] /= srcptrI[j];
                            }
                        }
                        else if (op == MAX)
                        {
                            for (int j = 0; j < blockSize; j++)
                            {
                                dstptr[j] = std::max(dstptr[j], srcptrI[j]);
                            }
                        }
                        else if (op == SUM)
                        {
                            if (!coeffsptr || coeffsptr[inputIdx] == 1.0f)
                            {
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += srcptrI[j];
                                }
                            }
                            else
                            {
                                float cI = coeffsptr[inputIdx];
                                for (int j = 0; j < blockSize; j++)
                                {
                                    dstptr[j] += cI * srcptrI[j];
                                }
                            }
                        }
                        else
                            CV_Error(Error::StsInternal, "");
                    }
                }

                if( activ )
                {
                    float* ptr = dstptr0 + delta + sampleIdx*channels*planeSize;
                    activ->forwardSlice(ptr, ptr, blockSize, planeSize, 0, channels);
                }
            }
        }
    };

#ifdef HAVE_OPENCL
    bool forward_ocl(InputArrayOfArrays inputs_, OutputArrayOfArrays outputs_, OutputArrayOfArrays internals_)
    {
        std::vector<UMat> inputs;
        std::vector<UMat> outputs;

        if ((inputs_.depth() == CV_16S && op != SUM) || (channelsMode != ELTWISE_CHANNNELS_SAME))
            return false;

        inputs_.getUMatVector(inputs);
        outputs_.getUMatVector(outputs);

        switch (op)
        {
            case SUM:
                {
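                    // NB: "channels" below is the product of dims [0, 2), i.e. N*C;
                    // plane_size is the product of the remaining dims (H*W for 4D blobs).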
                    int channels = total(shape(outputs[0]), 0, 2);
                    int plane_size = total(shape(outputs[0]), 2);
                    if (channels % 4 == 0 && plane_size % 4 == 0)
                    {
                        size_t localsize[] = { 128 };
                        size_t globalsize[] = { (size_t)channels / 4 * localsize[0] };
                        String opts;
                        if (inputs_.depth() == CV_16S)
                            opts = " -DDtype=half -DDtype4=half4 -DDtype8=half8";
                        else
                            opts = " -DDtype=float -DDtype4=float4 -DDtype8=float8";

                        for (int i = 0; i < (int)inputs.size() - 1; ++i)
                        {
                            String buildopt = format("-DLOOP=%d", i) + opts;
                            ocl::Kernel kernel("op_sum4", ocl::dnn::eltwise_oclsrc, buildopt);
                            int idx = 0;
                            float coeff1 = (coeffs.empty() || i > 0) ? 1.0f : coeffs[i];
                            float coeff2 = coeffs.empty() ? 1.0f : coeffs[i + 1];
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[0]));
                            kernel.set(idx++, ocl::KernelArg::PtrReadOnly(inputs[1]));
                            kernel.set(idx++, (int)plane_size);
                            kernel.set(idx++, (float)coeff1);
                            kernel.set(idx++, (float)coeff2);
                            kernel.set(idx++, ocl::KernelArg::PtrReadWrite(outputs[0]));
                            bool ret = kernel.run(1, globalsize, localsize, false);
                            if (!ret)
                                return false;
                        }
                    }
                    else
                    {
                        if (inputs_.depth() == CV_16S)
                            return false;

                        float coeff1 = coeffs.empty() ? 1.f : coeffs[0];
                        float coeff2 = coeffs.empty() ? 1.f : coeffs[1];
                        UMat mul0, mul1;
                        multiply(coeff1, inputs[0], mul0);
                        multiply(coeff2, inputs[1], mul1);
                        add(mul0, mul1, outputs[0]);
                        for (size_t i = 2; i < inputs.size(); ++i)
                        {
                            float coeff = coeffs.empty() ? 1.f : coeffs[i];
                            multiply(coeff, inputs[i], mul0);
                            add(mul0, outputs[0], outputs[0]);
                        }
                    }
                }
                break;
            case PROD:
                multiply(inputs[0], inputs[1], outputs[0]);
                for (size_t i = 2; i < inputs.size(); ++i)
                    multiply(inputs[i], outputs[0], outputs[0]);
                break;
            case DIV:
                divide(inputs[0], inputs[1], outputs[0]);
                for (size_t i = 2; i < inputs.size(); ++i)
                    divide(outputs[0], inputs[i], outputs[0]);
                break;
            case MAX:
                max(inputs[0], inputs[1], outputs[0]);
                for (size_t i = 2; i < inputs.size(); ++i)
                    max(inputs[i], outputs[0], outputs[0]);
                break;
            default:
                return false;
        }
        return true;
    }
#endif

    void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
    {
        CV_TRACE_FUNCTION();
        CV_TRACE_ARG_VALUE(name, "name", name.c_str());

        CV_OCL_RUN(IS_DNN_OPENCL_TARGET(preferableTarget),
                   forward_ocl(inputs_arr, outputs_arr, internals_arr))

        if (inputs_arr.depth() == CV_16S)
        {
            forward_fallback(inputs_arr, outputs_arr, internals_arr);
            return;
        }

        std::vector<Mat> inputs, outputs;
        inputs_arr.getMatVector(inputs);
        outputs_arr.getMatVector(outputs);

        CV_Assert(outputs.size() == 1);
        const int nstripes = getNumThreads();
        EltwiseInvoker::run(*this,
                            &inputs[0], (int)inputs.size(), outputs[0],
                            nstripes);
    }

#ifdef HAVE_CUDA
    Ptr<BackendNode> initCUDA(
        void *context_,
        const std::vector<Ptr<BackendWrapper>>& inputs,
        const std::vector<Ptr<BackendWrapper>>& outputs
    ) override
    {
        auto context = reinterpret_cast<csl::CSLContext*>(context_);

        auto op_ = [this] {
            switch (op) {
            case MAX: return cuda4dnn::EltwiseOpType::MAX;
            case SUM: return cuda4dnn::EltwiseOpType::SUM;
            case PROD: return cuda4dnn::EltwiseOpType::PRODUCT;
            case DIV: return cuda4dnn::EltwiseOpType::DIV;
            }
            return cuda4dnn::EltwiseOpType::SUM;
        }();

        return make_cuda_node<cuda4dnn::EltwiseOp>(preferableTarget, std::move(context->stream), op_, coeffs);
    }
#endif

    virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &input) CV_OVERRIDE
    {
#ifdef HAVE_HALIDE
        Halide::Var x("x"), y("y"), c("c"), n("n");
        Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
        Halide::Expr topExpr;
        std::vector<Halide::Buffer<> > inputBuffers = halideBuffers(input);
        switch (op)
        {
            case SUM:
                if (coeffs.empty())
                {
                    topExpr = inputBuffers[0](x, y, c, n) +
                              inputBuffers[1](x, y, c, n);
                    for (size_t i = 2; i < inputBuffers.size(); ++i)
                        topExpr += inputBuffers[i](x, y, c, n);
                }
                else
                {
                    topExpr = coeffs[0] * inputBuffers[0](x, y, c, n) +
                              coeffs[1] * inputBuffers[1](x, y, c, n);
                    for (size_t i = 2; i < inputBuffers.size(); ++i)
                        topExpr += coeffs[i] * inputBuffers[i](x, y, c, n);
                }
                break;
            case PROD:
                topExpr = inputBuffers[0](x, y, c, n) *
                          inputBuffers[1](x, y, c, n);
                for (size_t i = 2; i < inputBuffers.size(); ++i)
                    topExpr *= inputBuffers[i](x, y, c, n);
                break;
            case DIV:
                topExpr = inputBuffers[0](x, y, c, n) /
                          inputBuffers[1](x, y, c, n);
                for (size_t i = 2; i < inputBuffers.size(); ++i)
                    topExpr /= inputBuffers[i](x, y, c, n);
                break;
            case MAX:
                topExpr = max(inputBuffers[0](x, y, c, n),
                              inputBuffers[1](x, y, c, n));
                for (size_t i = 2; i < inputBuffers.size(); ++i)
                    topExpr = max(topExpr, inputBuffers[i](x, y, c, n));
                break;
            default:
                return Ptr<BackendNode>();
        }
        top(x, y, c, n) = topExpr;
        return Ptr<BackendNode>(new HalideBackendNode(top));
#endif  // HAVE_HALIDE
        return Ptr<BackendNode>();
    }

#ifdef HAVE_INF_ENGINE
    virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
    {
        InferenceEngine::Builder::EltwiseLayer ieLayer(name);

        ieLayer.setInputPorts(std::vector<InferenceEngine::Port>(inputs.size()));

        if (op == SUM)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::SUM);
        else if (op == PROD)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MUL);
        else if (op == DIV)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::DIV);
        else if (op == MAX)
            ieLayer.setEltwiseType(InferenceEngine::Builder::EltwiseLayer::EltwiseType::MAX);
        else
            CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");

        InferenceEngine::Builder::Layer l = ieLayer;
        if (!coeffs.empty())
            l.getParameters()["coeff"] = coeffs;

        return Ptr<BackendNode>(new InfEngineBackendNode(l));
    }
#endif  // HAVE_INF_ENGINE


#ifdef HAVE_DNN_NGRAPH
    virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs,
                                        const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
    {
        auto curr_node = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
        if (!coeffs.empty()) {
            auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[0]);
            curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
        }

        for (size_t i = 1; i < nodes.size(); i++)
        {
            auto next_node = nodes[i].dynamicCast<InfEngineNgraphNode>()->node;
            if (!coeffs.empty()) {
                auto coeff = std::make_shared<ngraph::op::Constant>(ngraph::element::f32, ngraph::Shape{1}, &coeffs[i]);
                next_node = std::make_shared<ngraph::op::v1::Multiply>(next_node, coeff, ngraph::op::AutoBroadcastType::NUMPY);
            }
            switch (op) {
                case SUM:  curr_node = std::make_shared<ngraph::op::v1::Add>(curr_node, next_node); break;
                case PROD: curr_node = std::make_shared<ngraph::op::v1::Multiply>(curr_node, next_node); break;
                case DIV:  curr_node = std::make_shared<ngraph::op::v1::Divide>(curr_node, next_node); break;
                case MAX:  curr_node = std::make_shared<ngraph::op::v1::Maximum>(curr_node, next_node); break;
                default: CV_Error(Error::StsNotImplemented, "Unsupported eltwise operation");
            }
        }
        return Ptr<BackendNode>(new InfEngineNgraphNode(curr_node));
    }
#endif  // HAVE_DNN_NGRAPH

    virtual int64 getFLOPS(const std::vector<MatShape> &inputs,
                           const std::vector<MatShape> &outputs) const CV_OVERRIDE
    {
        CV_UNUSED(outputs); // suppress unused variable warning
        CV_Assert(inputs.size());

        // FIXIT: handle inputs with different number of channels
        int64 flops = (int64)inputs.size() * total(inputs[0]);

        return flops;
    }

    bool setActivation(const Ptr<ActivationLayer>& layer) CV_OVERRIDE
    {
        if (activ.empty() || layer.empty())
        {
            activ = layer;
            return !activ.empty();
        }
        else
            return false;
    }

    Ptr<ActivationLayer> activ;
};

Ptr<EltwiseLayer> EltwiseLayer::create(const LayerParams& params)
{
    return Ptr<EltwiseLayer>(new EltwiseLayerImpl(params));
}
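
/*  Usage sketch (illustrative only; names and values here are hypothetical and not
    part of the original file):

        cv::dnn::LayerParams lp;
        lp.type = "Eltwise";
        lp.name = "sum_layer";
        lp.set("operation", "sum");   // one of "prod", "sum", "max", "div"
        double coeffsData[2] = {0.5, 0.5};
        lp.set("coeff", cv::dnn::DictValue::arrayReal(coeffsData, 2));  // SUM only
        cv::Ptr<cv::dnn::EltwiseLayer> layer = cv::dnn::EltwiseLayer::create(lp);
*/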

}
}