break;
}
- // fuse convolution layer followed by eltwise + relu
- while (nextData && IS_DNN_OPENCL_TARGET(preferableTarget) && ld.layerInstance->type == "Convolution") // semantic of 'if'
+ // OpenCL: fuse convolution layer followed by eltwise + relu
+ // CUDA: fuse convolution layer followed by eltwise (and optional activation)
- if ((IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
- ld.layerInstance->type == "Convolution" )
++ while (nextData &&
++ (IS_DNN_OPENCL_TARGET(preferableTarget) || IS_DNN_CUDA_TARGET(preferableTarget)) &&
++ ld.layerInstance->type == "Convolution"
++ ) // semantic of 'if' (the loop body runs at most once)
{
- Ptr<EltwiseLayer> nextEltwiseLayer;
- if( nextData )
- nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
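+ // bail out unless the next layer is an Eltwise layer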
+ Ptr<EltwiseLayer> nextEltwiseLayer = nextData->layerInstance.dynamicCast<EltwiseLayer>();
+ if (nextEltwiseLayer.empty())
+ break;
+
+#ifdef HAVE_CUDA
+ // CUDA backend supports fusion with an eltwise sum (all operands must have the same number of channels)
+ // `nextEltwiseLayer` is reset if the eltwise layer doesn't have a configuration compatible with fusion
+ if (IS_DNN_CUDA_TARGET(preferableTarget) && !nextEltwiseLayer.empty())
+ {
+ // we create a temporary backend node for the eltwise layer to obtain its configuration
+ cuda4dnn::csl::CSLContext context; // assume that initCUDA and EltwiseOp do not use the context during init
+ const auto node = nextData->layerInstance->initCUDA(&context, nextData->inputBlobsWrappers, nextData->outputBlobsWrappers);
+ const auto eltwiseNode = node.dynamicCast<cuda4dnn::EltwiseOpBase>();
+ // CUDA backend uses EltwiseOp when all operands have the same number of channels; otherwise, ShortcutOp is used.
+ // Hence, a successful cast to EltwiseOp implies that the number of channels is the same in all operand tensors.
+ if (eltwiseNode.empty() || eltwiseNode->op != cuda4dnn::EltwiseOpType::SUM || !eltwiseNode->coeffs.empty())
+ nextEltwiseLayer = Ptr<EltwiseLayer>();
+ }
+#endif
- if (!nextEltwiseLayer.empty() && nextData && nextData->inputBlobsId.size() == 2)
++
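+ // give up if the eltwise output must be kept, or if the eltwise layer does not take exactly two inputs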
+ if (pinsToKeep.count(lpNext) != 0)
+ break;
+ if (nextData->inputBlobsId.size() != 2)
+ break;
+
- if (!nextData->params.has("operation") || nextData->params.get<String>("operation").toLowerCase() == "sum")
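+ // only fusion with an (unweighted) eltwise sum is supported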
++ if (!nextData->params.has("operation") || toLowerCase(nextData->params.get<String>("operation")) == "sum")
+ {
+ if (nextData->params.has("coeff"))
+ {
+ DictValue paramCoeff = nextData->params.get("coeff");
+ int n = paramCoeff.size();
+ bool isCoeffOneOne = (n == 2);
+ for (int i = 0; isCoeffOneOne && i < n; i++)
+ {
+ float c = paramCoeff.get<float>(i);
+ isCoeffOneOne &= (c == 1.0f);
+ }
+ if (!isCoeffOneOne)
+ {
+ CV_LOG_DEBUG(NULL, "DNN/OpenCL: fusion of 'Sum' is supported only without coeffs (or with coeffs {1.0, 1.0})");
+ break;
+ }
+ }
+ }
+ else
+ {
+ CV_LOG_DEBUG(NULL, "DNN/OpenCL: fusion with eltwise operation is not supported: " << nextData->params.get<String>("operation"));
+ break;
+ }
+
{
LayerData *eltwiseData = nextData;
{
nextData = &layers[eltwiseData->consumers[0].lid];
lpNext = LayerPin(eltwiseData->consumers[0].lid, 0);
- Ptr<ActivationLayer> nextActivLayer;
- if( nextData )
- nextActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
-
- Ptr<PowerLayer> activ_power;
- if( !nextActivLayer.empty() &&
- (!nextData->type.compare("ReLU") ||
- !nextData->type.compare("ChannelsPReLU") ||
- (!nextData->type.compare("Power") && (activ_power = nextActivLayer.dynamicCast<PowerLayer>()) && activ_power->scale == 1.0f)
- ) &&
- currLayer->setActivation(nextActivLayer) )
+ CV_Assert(nextData);
+ if (nextData->outputBlobs.size() == 1)
+ nextFusabeleActivLayer = nextData->layerInstance.dynamicCast<ActivationLayer>();
+ }
+ else
+ {
+ // OCL backend cannot fuse in this case but the CUDA backend can continue with just eltwise
+ nextData = 0;
+ }
+
+ // the requirements of the OCV OpenCL backend and the CUDA backend are different;
+ // we need to check them separately, hence the separate fuse flags
+ bool fuse_eltwise = false, fuse_activation = false;
+
++ Ptr<PowerLayer> activ_power;
+ if (IS_DNN_OPENCL_TARGET(preferableTarget) && !nextFusabeleActivLayer.empty() &&
+ nextData &&
+ (!nextData->type.compare("ReLU") ||
+ !nextData->type.compare("ChannelsPReLU") ||
- !nextData->type.compare("Power")) &&
++ (!nextData->type.compare("Power") && (activ_power = nextFusabeleActivLayer.dynamicCast<PowerLayer>()) && activ_power->scale == 1.0f)
++ ) &&
+ currLayer->setActivation(nextFusabeleActivLayer))
+ {
+ fuse_eltwise = true;
+ fuse_activation = true;
+ }
+
+ if (IS_DNN_CUDA_TARGET(preferableTarget))
+ {
+ /* supported fusion options:
+ * => convolution + eltwise
+ * => activation(convolution) + eltwise
+ * > convolution + activation would have been fused already; we have to fuse eltwise
+ * => activation(convolution + eltwise)
+ * > fuse eltwise and then activation
+ */
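+ // try to fuse the eltwise first; activation fusion is attempted only if the eltwise fusion succeeds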
+ auto layer = nextEltwiseLayer.staticCast<Layer>();
+ if (currLayer->tryFuse(layer))
+ {
+ fuse_eltwise = true; /* eltwise was successfully fused */
+ if (!nextFusabeleActivLayer.empty() && nextData)
+ {
+ if ((!nextData->type.compare("ReLU") ||
+ !nextData->type.compare("ReLU6") ||
+ !nextData->type.compare("Power") ||
+ !nextData->type.compare("TanH") ||
+ !nextData->type.compare("Sigmoid") ||
+ !nextData->type.compare("Swish") ||
+ !nextData->type.compare("Mish")) &&
+ currLayer->setActivation(nextFusabeleActivLayer))
+ {
+ // activation was fused
+ fuse_activation = true;
+ }
+ }
+ }
+ }
+
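+ // two successful outcomes are possible: eltwise + activation fused, or (CUDA only) eltwise alone fused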
+ CV_Assert(!fuse_activation || fuse_eltwise); /* cannot fuse activation without eltwise */
+ if (fuse_eltwise && fuse_activation)
+ {
+ CV_Assert(nextData);
+ CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
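+ // the other eltwise operand (the output of biasLayerData) becomes an extra input of the fused convolution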
+ ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
+ printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+ printf_(("\tfused with %s\n", nextFusabeleActivLayer->name.c_str()));
+ eltwiseData->skip = true;
+ nextData->skip = true;
+ // This optimization is for cases like
+ // some_layer conv
+ // | |
+ // +-- eltwise --+
+ // |
+ // activ
+ // This way all the element-wise computations
+ // (i.e. some_layer+conv or some_layer*conv)
+ // would be done at [conv] layer. So we need to
+ // replace [conv]'s output blob with [eltwise]'s one
+ // considering that [activ] is an in-place layer.
+ // Also we need to move all the consumers' references.
+ // To prevent memory collisions (i.e. when input of
+ // [conv] and output of [eltwise] is the same blob)
+ // we allocate a new blob.
+ CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
+ ld.outputBlobs[0] = ld.outputBlobs[0].clone();
+ ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
+
+ eltwiseData->outputBlobs = ld.outputBlobs;
+ nextData->outputBlobs = ld.outputBlobs;
+ eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
+ nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
+
+ // Move references of [activ] layer consumers to the newly allocated blob.
+ for (int i = 0; i < nextData->consumers.size(); ++i)
{
- CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
- ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
- printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
- printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
- eltwiseData->skip = true;
- nextData->skip = true;
- // This optimization for cases like
- // some_layer conv
- // | |
- // +-- eltwise --+
- // |
- // activ
- // This way all the element-wise computations
- // (i.e. some_layer+conv or some_layer*conv)
- // would be done at [conv] layer. So we need to
- // replace [conv]'s output blob to [eltwise]'s one
- // considering that [activ] is an in-place layer.
- // Also we need to move all the consumers' references.
- // To prevent memory collisions (i.e. when input of
- // [conv] and output of [eltwise] is the same blob)
- // we allocate a new blob.
- CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
- ld.outputBlobs[0] = ld.outputBlobs[0].clone();
- ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
-
- eltwiseData->outputBlobs = ld.outputBlobs;
- nextData->outputBlobs = ld.outputBlobs;
- eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
- nextData->outputBlobsWrappers = ld.outputBlobsWrappers;
-
- // Move references of [activ] layer consumers to the newly allocated blob.
- for (int i = 0; i < nextData->consumers.size(); ++i)
+ LayerData& consumer = layers[nextData->consumers[i].lid];
+ for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
{
- LayerData& consumer = layers[nextData->consumers[i].lid];
- for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
+ if (consumer.inputBlobsId[j].lid == lpNext.lid)
{
- if (consumer.inputBlobsId[j].lid == lpNext.lid)
- {
- consumer.inputBlobs[j] = &ld.outputBlobs[0];
- consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
- break;
- }
+ consumer.inputBlobs[j] = &ld.outputBlobs[0];
+ consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+ break;
+ }
+ }
+ }
+ }
+ else if (fuse_eltwise) // conv + eltwise (note: conv could have fused activations before eltwise)
+ {
+ CV_Assert(IS_DNN_CUDA_TARGET(preferableTarget));
+ CV_Assert_N(biasLayerData->outputBlobsWrappers.size() == 1, ld.inputBlobsWrappers.size() == 1);
+ ld.inputBlobsWrappers.push_back(biasLayerData->outputBlobsWrappers[0]);
+ printf_(("\tfused with %s\n", nextEltwiseLayer->name.c_str()));
+ eltwiseData->skip = true;
+ // This optimization is for cases like
+ // some_layer conv (maybe fused with activ)
+ // | |
+ // +-- eltwise --+
+ //
+ // This way all the element-wise computations
+ // (i.e. some_layer+conv or some_layer*conv)
+ // would be done at [conv] layer. So we need to
+ // replace [conv]'s output blob with [eltwise]'s one.
+ // Also we need to move all the consumers' references.
+ // To prevent memory collisions (i.e. when input of
+ // [conv] and output of [eltwise] is the same blob)
+ // we allocate a new blob.
+ CV_Assert_N(ld.outputBlobs.size() == 1, ld.outputBlobsWrappers.size() == 1);
+ ld.outputBlobs[0] = ld.outputBlobs[0].clone();
+ ld.outputBlobsWrappers[0] = wrap(ld.outputBlobs[0]);
+
+ eltwiseData->outputBlobs = ld.outputBlobs;
+ eltwiseData->outputBlobsWrappers = ld.outputBlobsWrappers;
+
+ // Move references of [eltwise] layer consumers to the newly allocated blob.
+ for (int i = 0; i < eltwiseData->consumers.size(); ++i)
+ {
+ LayerData& consumer = layers[eltwiseData->consumers[i].lid];
+ for (int j = 0; j < consumer.inputBlobsId.size(); ++j)
+ {
+ if (consumer.inputBlobsId[j].lid == eltwiseData->id)
+ {
+ consumer.inputBlobs[j] = &ld.outputBlobs[0];
+ consumer.inputBlobsWrappers[j] = ld.outputBlobsWrappers[0];
+ break;
}
}
}