break;
}
- // fuse convolution layer followed by eltwise + relu
- while (nextData && IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution") // semantic of 'if'
+ // OpenCL: fuse convolution layer followed by eltwise + relu
+ // CUDA: fuse convolution layer followed by eltwise (and optional activation)
- if ((IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
- ld.layerInstance->type == "Convolution" )
++ while (nextData &&
++ (IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
++ ld.layerInstance->type == "Convolution"
++ ) // semantic of 'if' (the loop body runs at most once)
{
- Ptr<EltwiseLayer> nextEltwiseLayer;
- if( nextData )
- nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
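+ // bail out unless the next layer is an Eltwise layer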
+ Ptr<EltwiseLayer> nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
+ if (nextEltwiseLayer.empty())
+ break;
+
+#ifdef HAVE_CUDA
+ // CUDA backend supports fusion with an eltwise sum (all operands must have the same number of channels)
+ // `nextEltwiseLayer` is reset if the eltwise layer doesn't have a configuration compatible with fusion
+ if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
+ {
+ // we create a temporary backend node for the eltwise layer to obtain its configuration
+ cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
+ const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
+ const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
+ // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+ // Hence, a successful cast to EltwiseOp implies that the number of channels is the same in all operand tensors.
+ if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+ nextEltwiseLayer = Ptr<EltwiseLayer>();
+ }
+#endif
- if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
++
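+ // give up if the eltwise output must be kept, or if the eltwise layer does not take exactly two inputs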
+ if (pinsToKeep.count(lpNext) != 0)
+ break;
+ if (nextData->inputBlobsId.size() != 2)
+ break;
+
- if (!nextData->params.has("operation") || nextData->params.get<String>("operation").toLowerCase() == "sum")
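+ // only fusion with an (unweighted) eltwise sum is supported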
++ if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) == "sum")
+ {
+ if (nextData->params.has("coeff"))
+ {
+ DictValue paramCoeff = nextData->params.get("coeff");
+ int n = paramCoeff.size();
+ bool isCoeffOneOne = (n == 2);
+ for (int i = 0; isCoeffOneOne && i < n; i++)
+ {
+ float c = paramCoeff.get<float>(i);
+ isCoeffOneOne &= (c == 1.0f);
+ }
+ if (!isCoeffOneOne)
+ {
+ CV_LOG_DEBUG(NULL, "DNN/OpenCL: fusion of 'Sum' is supported only without coeffs (or with coeffs {1.0, 1.0})");
+ break;
+ }
+ }
+ }
+ else
+ {
+ CV_LOG_DEBUG(NULL, "DNN/OpenCL: fusion with eltwise operation is not supported: " << nextData->params.get<String>("operation"));
+ break;
+ }
+
{
LayerData *eltwiseData = nextData;
{
nextData = &layers[eltwiseData->consumers[0].lid];
lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
- Ptr<ActivationLayer> nextActivLayer;
- if( nextData )
- nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
- Ptr<PowerLayer> activ_power;
- if( !nextActivLayer.empty() &&
- (!nextData->type.compare("ReLU") ||
- !nextData->type.compare("ChannelsPReLU") ||
- (!nextData->type.compare("Power") && (activ_power = nextActivLayer.dynamicCast<PowerLayer>()) && activ_power->scale == 1.0f)
- ) &&
- currLayer->setActivation(nextActivLayer) )
+ CV_Assert(nextData);
+ if (nextData->outputBlobs.size() == 1)
+ nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+ }
+ else
+ {
+ // OCL backend cannot fuse in this case but the CUDA backend can continue with just eltwise
+ nextData = 0;
+ }
+
+ // the requirements of the OCV OpenCL backend and the CUDA backend are different;
+ // we need to check them separately, hence the separate fuse flags
+ bool fuse_eltwise = false, fuse_activation = false;
+
++ Ptr<PowerLayer> activ_power;
+ if (IS_DNN_OPENCL_TARGET(preferableTarget) && !nextFusabeleActivLayer.empty() &&
+ nextData &&
+ (!nextData->type.compare("ReLU") ||
+ !nextData->type.compare("ChannelsPReLU") ||
- !nextData->type.compare("Power")) &&
++ (!nextData->type.compare("Power") && (activ_power = nextFusabeleActivLayer.dynamicCast<PowerLayer>()) && activ_power->scale == 1.0f)
++ ) &&
+ currLayer->setActivation(nextFusabeleActivLayer))
+ {
+ fuse_eltwise = true;
+ fuse_activation = true;
+ }
+
+ if (IS_DNN_CUDA_TARGET(preferableTarget))
+ {
+ /* supported fusion options:
+ * => convolution + eltwise
+ * => activation(convolution) + eltwise
+ * > convolution + activation would have been fused already; we have to fuse eltwise
+ * => activation(convolution + eltwise)
+ * > fuse eltwise and then activation
+ */
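+ // try to fuse the eltwise first; activation fusion is attempted only if the eltwise fusion succeeds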
+ auto layer = nextEltwiseLayer.staticCast<Layer>();
+ if (currLayer->tryFuse(layer))
+ {
+ fuse_eltwise = true; /* eltwise was successfully fused */
+ if (!nextFusabeleActivLayer.empty() && nextData)
+ {
+ if ((!nextData->type.compare("ReLU") ||
+ !nextData->type.compare("ReLU6") ||
+ !nextData->type.compare("Power") ||
+ !nextData->type.compare("TanH") ||
+ !nextData->type.compare("Sigmoid") ||
+ !nextData->type.compare("Swish") ||
+ !nextData->type.compare("Mish")) &&
+ currLayer->setActivation(nextFusabeleActivLayer))
+ {
+ // activation was fused
+ fuse_activation = true;
+ }
+ }
+ }
+ }
+
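+ // two successful outcomes are possible: eltwise + activation fused, or (CUDA only) eltwise alone fused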
+ CV_Assert(!fuse_activation || fuse_eltwise); /* cannot fuse activation without eltwise */
+ if (fuse_eltwise && fuse_activation)
+ {
+ CV_Assert(nextData);
+ CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
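+ // the other eltwise operand (the output of biasLayerData) becomes an extra input of the fused convolution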
+ ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
+ printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+ printf_(("\tfused with %s\n", nextFusabeleActivLayer->name.c_str()));
+ eltwiseData->skip = true;
+ nextData->skip = true;
+ // This optimization is for cases like
+ // some_layer conv
+ // | |
+ // +-- eltwise --+
+ // |
+ // activ
+ // This way all the element-wise computations
+ // (i.e. some_layer+conv or some_layer*conv)
+ // would be done at [conv] layer. So we need to
+ // replace [conv]'s output blob with [eltwise]'s one
+ // considering that [activ] is an in-place layer.
+ // Also we need to move all the consumers' references.
+ // To prevent memory collisions (i.e. when input of
+ // [conv] and output of [eltwise] is the same blob)
+ // we allocate a new blob.
+ CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
+ ld.outputBlobs[0] = ld.outputBlobs[0].clone();
+ ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
+
+ eltwiseData->outputBlobs = ld.outputBlobs;
+ nextData->outputBlobs = ld.outputBlobs;
+ eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
+ nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
+
+ // Move references of [activ] layer consumers to the newly allocated blob.
+ for (int i = 0; i < nextData->consumers.size(); ++i)
{
- CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
- ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
- printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
- printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
- eltwiseData->skip = true;
- nextData->skip = true;
- // This optimization for cases like
- // some_layer conv
- // | |
- // +-- eltwise --+
- // |
- // activ
- // This way all the element-wise computations
- // (i.e. some_layer+conv or some_layer*conv)
- // would be done at [conv] layer. So we need to
- // replace [conv]'s output blob to [eltwise]'s one
- // considering that [activ] is an in-place layer.
- // Also we need to move all the consumers' references.
- // To prevent memory collisions (i.e. when input of
- // [conv] and output of [eltwise] is the same blob)
- // we allocate a new blob.
- CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
- ld.outputBlobs[0] = ld.outputBlobs[0].clone();
- ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
-
- eltwiseData->outputBlobs = ld.outputBlobs;
- nextData->outputBlobs = ld.outputBlobs;
- eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
- nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
-
- // Move references of [activ] layer consumers to the newly allocated blob.
- for (int i = 0; i < nextData->consumers.size(); ++i)
+ LayerData& consumer = layers[nextData->consumers[i].lid];
+ for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
{
- LayerData& consumer = layers[nextData->consumers[i].lid];
- for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
+ if (consumer.inputBlobsId[j].lid == lpNext.lid)
{
- if (consumer.inputBlobsId[j].lid == lpNext.lid)
- {
- consumer.inputBlobs[j] = &ld.outputBlobs[0];
- consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
- break;
- }
+ consumer.inputBlobs[j] = &ld.outputBlobs[0];
+ consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+ break;
+ }
+ }
+ }
+ }
+ else if (fuse_eltwise) // conv + eltwise (note: conv could have fused activations before eltwise)
+ {
+ CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
+ CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
+ ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
+ printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+ eltwiseData->skip = true;
+ // This optimization is for cases like
+ // some_layer conv (maybe fused with activ)
+ // | |
+ // +-- eltwise --+
+ //
+ // This way all the element-wise computations
+ // (i.e. some_layer+conv or some_layer*conv)
+ // would be done at [conv] layer. So we need to
+ // replace [conv]'s output blob with [eltwise]'s one.
+ // Also we need to move all the consumers' references.
+ // To prevent memory collisions (i.e. when input of
+ // [conv] and output of [eltwise] is the same blob)
+ // we allocate a new blob.
+ CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
+ ld.outputBlobs[0] = ld.outputBlobs[0].clone();
+ ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
+
+ eltwiseData->outputBlobs = ld.outputBlobs;
+ eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
+
+ // Move references of [eltwise] layer consumers to the newly allocated blob.
+ for (int i = 0; i < eltwiseData->consumers.size(); ++i)
+ {
+ LayerData& consumer = layers[eltwiseData->consumers[i].lid];
+ for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
+ {
+ if (consumer.inputBlobsId[j].lid == eltwiseData->id)
+ {
+ consumer.inputBlobs[j] = &ld.outputBlobs[0];
+ consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+ break;
}
}
}