Merge remote-tracking branch 'upstream/3.4' into merge-3.4
author    Alexander Alekhin <alexander.a.alekhin@gmail.com>
          Fri, 9 Oct 2020 20:08:00 +0000 (20:08 +0000)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
          Fri, 9 Oct 2020 20:09:26 +0000 (20:09 +0000)
modules/dnn/src/dnn.cpp
modules/dnn/src/layers/convolution_layer.cpp
modules/dnn/test/test_common.impl.hpp
modules/dnn/test/test_darknet_importer.cpp
modules/dnn/test/test_layers.cpp
modules/dnn/test/test_torch_importer.cpp

diff --combined modules/dnn/src/dnn.cpp
@@@ -2652,30 -2412,43 +2652,63 @@@ struct Net::Impl : public detail::NetIm
                          break;
                  }
  
 -                // fuse convolution layer followed by eltwise + relu
 -                while (nextData && IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution")  // semantic of 'if'
 +                // OpenCL: fuse convolution layer followed by eltwise + relu
 +                // CUDA: fuse convolution layer followed by eltwise (and optional activation)
-                 if ((IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
-                     ld.layerInstance->type == "Convolution" )
++                while (nextData &&
++                    (IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
++                    ld.layerInstance->type == "Convolution"
++                )  // semantic of 'if'
                  {
-                     Ptr<EltwiseLayer> nextEltwiseLayer;
-                     if( nextData )
-                         nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
+                     Ptr<EltwiseLayer> nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
+                     if (nextEltwiseLayer.empty())
+                         break;
 +#ifdef HAVE_CUDA
 +                    // CUDA backend supports fusion with eltwise sum (without variable channels)
 +                    // `nextEltwiseLayer` is reset if the eltwise layer doesn't have a compatible configuration for fusion
 +                    if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
 +                    {
 +                        // we create a temporary backend node for the eltwise layer to obtain the eltwise configuration
 +                        cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
 +                        const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
 +                        const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
 +                        // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
 +                        // Hence, a successful cast to EltwiseOp implies that the number of channels is the same in all operand tensors.
 +                        if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
 +                            nextEltwiseLayer = Ptr<EltwiseLayer>();
 +                    }
 +#endif
-                     if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
++
+                     if (pinsToKeep.count(lpNext) != 0)
+                         break;
+                     if (nextData->inputBlobsId.size() != 2)
+                         break;
 -                    if (!nextData->params.has("operation") || nextData->params.get<String>("operation").toLowerCase() == "sum")
++                    if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) == "sum")
+                     {
+                         if (nextData->params.has("coeff"))
+                         {
+                             DictValue paramCoeff = nextData->params.get("coeff");
+                             int n = paramCoeff.size();
+                             bool isCoeffOneOne = (n == 2);
+                             for (int i = 0; isCoeffOneOne && i < n; i++)
+                             {
+                                 float c = paramCoeff.get<float>(i);
+                                 isCoeffOneOne &= (c == 1.0f);
+                             }
+                             if (!isCoeffOneOne)
+                             {
+                                 CV_LOG_DEBUG(NULL, "DNN/OpenCL: only fusion of 'Sum' without coeffs (or with coeffs {1.0, 1.0}) is supported");
+                                 break;
+                             }
+                         }
+                     }
+                     else
+                     {
+                         CV_LOG_DEBUG(NULL, "DNN/OpenCL: fusion with eltwise operation is not supported: " << nextData->params.get<String>("operation"));
+                         break;
+                     }
                      {
                          LayerData *eltwiseData = nextData;
  
                                  {
                                      nextData = &layers[eltwiseData->consumers[0].lid];
                                      lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
 -                                    Ptr<ActivationLayer> nextActivLayer;
 -                                    if( nextData )
 -                                        nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
 -
 -                                    Ptr<PowerLayer> activ_power;
 -                                    if( !nextActivLayer.empty() &&
 -                                            (!nextData->type.compare("ReLU") ||
 -                                             !nextData->type.compare("ChannelsPReLU") ||
 -                                             (!nextData->type.compare("Power") && (activ_power = nextActivLayer.dynamicCast<PowerLayer>()) && activ_power->scale == 1.0f)
 -                                            ) &&
 -                                            currLayer->setActivation(nextActivLayer) )
 +                                    CV_Assert(nextData);
 +                                    if (nextData->outputBlobs.size() == 1)
 +                                        nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
 +                                }
 +                                else
 +                                {
 +                                    // the OCL backend cannot fuse in this case, but the CUDA backend can still continue with just the eltwise fusion
 +                                    nextData = 0;
 +                                }
 +
 +                                // the requirements of the OCV OpenCL backend and the CUDA backend are different,
 +                                // so we check them separately; hence the separate fuse_* flags
 +                                bool fuse_eltwise = false, fuse_activation = false;
 +
++                                Ptr<PowerLayer> activ_power;
 +                                if (IS_DNN_OPENCL_TARGET(preferableTarget) && !nextFusabeleActivLayer.empty() &&
 +                                    nextData &&
 +                                    (!nextData->type.compare("ReLU") ||
 +                                     !nextData->type.compare("ChannelsPReLU") ||
-                                      !nextData->type.compare("Power")) &&
++                                     (!nextData->type.compare("Power") && (activ_power = nextFusabeleActivLayer.dynamicCast<PowerLayer>()) && activ_power->scale == 1.0f)
++                                    ) &&
 +                                    currLayer->setActivation(nextFusabeleActivLayer))
 +                                {
 +                                    fuse_eltwise = true;
 +                                    fuse_activation = true;
 +                                }
 +
 +                                if (IS_DNN_CUDA_TARGET(preferableTarget))
 +                                {
 +                                    /* supported fusion options:
 +                                     * => convolution + eltwise
 +                                     * => activation(convolution) + eltwise
 +                                     *    > convolution + activation would have been fused already; we have to fuse eltwise
 +                                     * => activation(convolution + eltwise)
 +                                     *    > fuse eltwise and then activation
 +                                     */
 +                                    auto layer = nextEltwiseLayer.staticCast<Layer>();
 +                                    if (currLayer->tryFuse(layer))
 +                                    {
 +                                        fuse_eltwise = true; /* eltwise was successfully fused */
 +                                        if (!nextFusabeleActivLayer.empty() && nextData)
 +                                        {
 +                                            if ((!nextData->type.compare("ReLU") ||
 +                                                 !nextData->type.compare("ReLU6") ||
 +                                                 !nextData->type.compare("Power") ||
 +                                                 !nextData->type.compare("TanH") ||
 +                                                 !nextData->type.compare("Sigmoid") ||
 +                                                 !nextData->type.compare("Swish") ||
 +                                                 !nextData->type.compare("Mish")) &&
 +                                                currLayer->setActivation(nextFusabeleActivLayer))
 +                                            {
 +                                                // activation was fused
 +                                                fuse_activation = true;
 +                                            }
 +                                        }
 +                                    }
 +                                }
 +
 +                                CV_Assert(!fuse_activation || fuse_eltwise); /* cannot fuse activation without eltwise */
 +                                if (fuse_eltwise && fuse_activation)
 +                                {
 +                                    CV_Assert(nextData);
 +                                    CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
 +                                    ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
 +                                    printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
 +                                    printf_(("\tfused with %s\n", nextFusabeleActivLayer->name.c_str()));
 +                                    eltwiseData->skip = true;
 +                                    nextData->skip = true;
 +                                    // This optimization is for cases like
 +                                    // some_layer   conv
 +                                    //   |             |
 +                                    //   +-- eltwise --+
 +                                    //          |
 +                                    //        activ
 +                                    // This way all the element-wise computations
 +                                    // (i.e. some_layer+conv or some_layer*conv)
 +                                    // would be done at the [conv] layer. So we need to
 +                                    // replace [conv]'s output blob with [eltwise]'s one,
 +                                    // considering that [activ] is an in-place layer.
 +                                    // Also we need to move all the consumers' references.
 +                                    // To prevent memory collisions (i.e. when input of
 +                                    // [conv] and output of [eltwise] is the same blob)
 +                                    // we allocate a new blob.
 +                                    CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
 +                                    ld.outputBlobs[0] = ld.outputBlobs[0].clone();
 +                                    ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
 +
 +                                    eltwiseData->outputBlobs = ld.outputBlobs;
 +                                    nextData->outputBlobs = ld.outputBlobs;
 +                                    eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
 +                                    nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
 +
 +                                    // Move references of [activ] layer consumers to the newly allocated blob.
 +                                    for (int i = 0; i < nextData->consumers.size(); ++i)
                                      {
 -                                        CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
 -                                        ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
 -                                        printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
 -                                        printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
 -                                        eltwiseData->skip = true;
 -                                        nextData->skip = true;
 -                                        // This optimization for cases like
 -                                        // some_layer   conv
 -                                        //   |             |
 -                                        //   +-- eltwise --+
 -                                        //          |
 -                                        //        activ
 -                                        // This way all the element-wise computations
 -                                        // (i.e. some_layer+conv or some_layer*conv)
 -                                        // would be done at [conv] layer. So we need to
 -                                        // replace [conv]'s output blob to [eltwise]'s one
 -                                        // considering that [activ] is an in-place layer.
 -                                        // Also we need to move all the consumers' references.
 -                                        // To prevent memory collisions (i.e. when input of
 -                                        // [conv] and output of [eltwise] is the same blob)
 -                                        // we allocate a new blob.
 -                                        CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
 -                                        ld.outputBlobs[0] = ld.outputBlobs[0].clone();
 -                                        ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
 -
 -                                        eltwiseData->outputBlobs = ld.outputBlobs;
 -                                        nextData->outputBlobs = ld.outputBlobs;
 -                                        eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
 -                                        nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
 -
 -                                        // Move references of [activ] layer consumers to the newly allocated blob.
 -                                        for (int i = 0; i < nextData->consumers.size(); ++i)
 +                                        LayerData& consumer = layers[nextData->consumers[i].lid];
 +                                        for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
                                          {
 -                                            LayerData& consumer = layers[nextData->consumers[i].lid];
 -                                            for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
 +                                            if (consumer.inputBlobsId[j].lid == lpNext.lid)
                                              {
 -                                                if (consumer.inputBlobsId[j].lid == lpNext.lid)
 -                                                {
 -                                                    consumer.inputBlobs[j] = &ld.outputBlobs[0];
 -                                                    consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
 -                                                    break;
 -                                                }
 +                                                consumer.inputBlobs[j] = &ld.outputBlobs[0];
 +                                                consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
 +                                                break;
 +                                            }
 +                                        }
 +                                    }
 +                                }
 +                                else if (fuse_eltwise) // conv + eltwise (note: conv could have fused activations before eltwise)
 +                                {
 +                                    CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
 +                                    CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
 +                                    ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
 +                                    printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
 +                                    eltwiseData->skip = true;
 +                                    // This optimization is for cases like
 +                                    // some_layer   conv (maybe fused with activ)
 +                                    //   |             |
 +                                    //   +-- eltwise --+
 +                                    //
 +                                    // This way all the element-wise computations
 +                                    // (i.e. some_layer+conv or some_layer*conv)
 +                                    // would be done at the [conv] layer. So we need to
 +                                    // replace [conv]'s output blob with [eltwise]'s one.
 +                                    // Also we need to move all the consumers' references.
 +                                    // To prevent memory collisions (i.e. when input of
 +                                    // [conv] and output of [eltwise] is the same blob)
 +                                    // we allocate a new blob.
 +                                    CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
 +                                    ld.outputBlobs[0] = ld.outputBlobs[0].clone();
 +                                    ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
 +
 +                                    eltwiseData->outputBlobs = ld.outputBlobs;
 +                                    eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
 +
 +                                    // Move references of [eltwise] layer consumers to the newly allocated blob.
 +                                    for (int i = 0; i < eltwiseData->consumers.size(); ++i)
 +                                    {
 +                                        LayerData& consumer = layers[eltwiseData->consumers[i].lid];
 +                                        for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
 +                                        {
 +                                            if (consumer.inputBlobsId[j].lid == eltwiseData->id)
 +                                            {
 +                                                consumer.inputBlobs[j] = &ld.outputBlobs[0];
 +                                                consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
 +                                                break;
                                              }
                                          }
                                      }
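
The CUDA branch of the fusion pass above only runs when the network is configured for the CUDA backend and target. A minimal sketch of such a configuration, assuming OpenCV was built with the CUDA backend (HAVE_CUDA); the model path "model.onnx" and the 1x3x224x224 input shape are placeholders, not part of this change:

    #include <opencv2/dnn.hpp>
    #include <vector>

    int main()
    {
        cv::dnn::Net net = cv::dnn::readNetFromONNX("model.onnx");  // placeholder model path
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_CUDA);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_CUDA);          // or DNN_TARGET_CUDA_FP16

        std::vector<int> shape = {1, 3, 224, 224};                  // dummy NCHW blob
        cv::Mat input(shape, CV_32F, cv::Scalar(0));
        net.setInput(input);
        cv::Mat out = net.forward();  // layer fusion (incl. conv + eltwise) is decided while the net is set up for this call
        return 0;
    }
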
  #include "../op_halide.hpp"
  #include "../op_inf_engine.hpp"
  #include "../ie_ngraph.hpp"
 +#include "../op_vkcom.hpp"
  
+ #include <opencv2/core/utils/logger.hpp>
  #include "opencv2/core/hal/hal.hpp"
  #include "opencv2/core/hal/intrin.hpp"
  #include <iostream>
Simple merge
Simple merge
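
The CV_LOG_DEBUG diagnostics used in the fusion code above (and supported by the logger header added to convolution_layer.cpp) are only printed when the runtime log level includes debug output. A short sketch, assuming the standard cv::utils::logging API:

    #include <opencv2/core/utils/logger.hpp>

    int main()
    {
        // raise the log level so the DNN fusion CV_LOG_DEBUG messages are emitted
        cv::utils::logging::setLogLevel(cv::utils::logging::LOG_LEVEL_DEBUG);
        // ... load and run the network as usual ...
        return 0;
    }

Setting the OPENCV_LOG_LEVEL=DEBUG environment variable should select the same level without code changes.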