class CV_EXPORTS ActivationLayer;
class CV_EXPORTS BatchNormLayer;
+ class CV_EXPORTS ScaleLayer;
/** @brief This interface class allows one to build new Layers, which are the building blocks of networks.
*
*/
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
+ /**
+ * @brief Tries to attach the subsequent scaling layer to this layer, i.e. performs a partial layer fusion.
+ * @param[in] layer The subsequent scaling layer.
+ *
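+ * The base class implementation simply returns false; layers that can fuse
+ * the scaling coefficients (e.g. convolution, see below) override this method.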
+ * @returns true if the scaling layer has been attached successfully.
+ */
+ virtual bool setScale(const Ptr<ScaleLayer>& layer);
+
+ /**
+ * @brief "Deattaches" all the layers, attached to particular layer.
+ */
+ virtual void unsetAttached();
+
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
- const int layerId,
- std::vector<MatShape>* inLayerShapes,
- std::vector<MatShape>* outLayerShapes) const;
+ const int layerId,
+ std::vector<MatShape>* inLayerShapes,
+ std::vector<MatShape>* outLayerShapes) const;
+
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
- const std::vector<MatShape>& netInputShapes) const;
+ const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
- const MatShape& netInputShape) const;
+ const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
- private:
+ /** @brief Enables or disables layer fusion in the network.
+ * @param fusion true to enable fusion, false to disable it. Fusion is enabled by default.
+ */
+ CV_WRAP void enableFusion(bool fusion);
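+ // Minimal usage sketch (the model file names are placeholders):
+ // Net net = readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
+ // net.enableFusion(false); // keep BatchNorm/Scale/ReLU as separate layers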
+
+ private:
struct Impl;
Ptr<Impl> impl;
};
}
}
- void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
+ void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force)
{
- std::map<LayerPin, Mat>::iterator hostIt;
- std::map<LayerPin, int>::iterator refIt;
-
- const int targetTotal = total(shape);
Mat bestBlob;
- int bestBlobTotal = INT_MAX;
LayerPin bestBlobPin;
- for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
+
+ if( !force )
{
- refIt = refCounter.find(hostIt->first);
- // Use only blobs that had references before because if not,
- // it might be used as output.
- if (refIt != refCounter.end() && refIt->second == 0)
+ std::map<LayerPin, Mat>::iterator hostIt;
+ std::map<LayerPin, int>::iterator refIt;
+
+ const int targetTotal = total(shape);
+ int bestBlobTotal = INT_MAX;
+
+ for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
{
- Mat& unusedBlob = hostIt->second;
- if (unusedBlob.total() >= targetTotal &&
- unusedBlob.total() < bestBlobTotal)
+ refIt = refCounter.find(hostIt->first);
+ // Use only blobs that have been reference-counted before; a blob
+ // without any recorded references might be a network output.
+ if (refIt != refCounter.end() && refIt->second == 0)
{
- bestBlobPin = hostIt->first;
- bestBlob = unusedBlob;
- bestBlobTotal = unusedBlob.total();
+ Mat& unusedBlob = hostIt->second;
+ if (unusedBlob.total() >= targetTotal &&
+ unusedBlob.total() < bestBlobTotal)
+ {
+ bestBlobPin = hostIt->first;
+ bestBlob = unusedBlob;
+ bestBlobTotal = unusedBlob.total();
+ }
}
}
}
}
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
- std::vector<LayerPin>& pinsForInternalBlobs)
+ std::vector<LayerPin>& pinsForInternalBlobs,
+ bool maximizeReuse)
{
CV_TRACE_FUNCTION();
}
std::map<int, std::vector<int> >::reverse_iterator it;
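+ // unless reuse is maximized (as for the Halide backend), layers with several
+ // inputs always get a freshly allocated output buffer rather than a reused one.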
+ bool force = !maximizeReuse && ld.inputBlobsId.size() > 1;
for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
{
for(int j = 0; j < it->second.size(); j++)
if (total(shapes[index]))
{
LayerPin blobPin(ld.id, index);
- if (index < outShapes.size() && inPlace)
+ if (index < outShapes.size() && inPlace && !force)
{
CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
}
else
{
- reuseOrCreate(shapes[index], blobPin, *blobs[index]);
+ reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
}
}
}
lastLayerId = 1;
netWasAllocated = false;
+ fusion = true;
preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
}
int lastLayerId;
bool netWasAllocated;
+ bool fusion;
void compileHalide()
{
if( currLayer.empty() )
continue;
- currLayer->setActivation(Ptr<ActivationLayer>());
- currLayer->setBatchNorm(Ptr<BatchNormLayer>());
+ currLayer->unsetAttached();
Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
if( !poolingLayer.empty() )
poolingLayer->computeMaxIdx = true;
}
}
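+ // layer #0 is the special network-input pseudo-layer; it performs no real
+ // computation, so it is always marked as skipped.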
+ it = layers.find(0);
+ CV_Assert(it != layers.end());
+ it->second.skipFlags[DNN_BACKEND_DEFAULT] = true;
}
-
void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
{
CV_TRACE_FUNCTION();
LayerData& getLayerData(const DictValue &layerDesc)
{
+ CV_Assert(layerDesc.isInt() || layerDesc.isString());
if (layerDesc.isInt())
return getLayerData(layerDesc.get<int>());
- else if (layerDesc.isString())
+ else /*if (layerDesc.isString())*/
return getLayerData(layerDesc.get<String>());
-
- CV_Assert(layerDesc.isInt() || layerDesc.isString());
- return *((LayerData*)NULL);
}
static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
CV_Assert(layerShapesIt != layersShapes.end());
std::vector<LayerPin> pinsForInternalBlobs;
- blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
+ bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE;
+ blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse);
Ptr<Layer> layerPtr = ld.getLayerInstance();
{
ld.flag = 1;
}
+#if 0
+#define printf_(args) printf args
+#else
+#define printf_(args)
+#endif
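+// flip the "#if 0" above to "#if 1" to enable the fusion debug trace below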
+
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
+ if( !fusion || preferableBackend == DNN_BACKEND_HALIDE )
+ return;
+
CV_TRACE_FUNCTION();
// scan through all the layers. If there is convolution layer followed by the activation layer,
LayerData& ld = layers[lid];
if( ld.skipFlags[DNN_BACKEND_DEFAULT] )
{
+ printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
continue;
}
+ printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
if( ld.consumers.size() == 0 )
outnames.push_back(ld.layerInstance->name);
+ // the optimization #1. try to fuse batch norm, scaling and/or activation layers
+ // with the current layer if they follow it. Normally, they are fused with the convolution layer,
+ // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
+ // some other layers.
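+ // For example, Conv -> BatchNorm -> Scale -> ReLU collapses into a single
+ // convolution with re-computed weights and biases (see the weight folding
+ // in the convolution layer further below).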
Ptr<Layer>& currLayer = ld.layerInstance;
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
{
nextData = 0;
if( currLayer->setBatchNorm(nextBNormLayer) )
{
+ printf_(("\tfused with %s\n", nextBNormLayer->name.c_str()));
bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
if( bnormData->consumers.size() == 1 )
nextData = &layers[bnormData->consumers[0].lid];
+ lpNext = LayerPin(bnormData->consumers[0].lid, 0);
+ }
+ }
+
+ Ptr<ScaleLayer> nextScaleLayer;
+ if( nextData )
+ nextScaleLayer = nextData->layerInstance.dynamicCast<ScaleLayer>();
+ if( !nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0 )
+ {
+ LayerData* scaleData = nextData;
+ nextData = 0;
+ if( currLayer->setScale(nextScaleLayer) )
+ {
+ printf_(("\tfused with %s\n", nextScaleLayer->name.c_str()));
+ scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+ ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+ if( scaleData->consumers.size() == 1 )
+ nextData = &layers[scaleData->consumers[0].lid];
}
}
if( !nextActivLayer.empty() && currLayer->setActivation(nextActivLayer) )
{
- //printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
+ printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
}
}
+
+ // the optimization #2. if there is no layer that takes max pooling layer's computed
+ // max indices (and only some semantic segmentation networks might need this;
+ // many others only take the maximum values), then we switch the max pooling
+ // layer to the faster operating mode.
Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
if( !poolingLayer.empty() && !ld.consumers.empty() )
{
if( i >= nconsumers )
{
poolingLayer->computeMaxIdx = false;
- //printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
+ printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
+ }
+ }
+
+ // the optimization #3. if there is concat layer that concatenates channels
+ // from the inputs together (i.e. axis == 1) then we make the inputs of
+ // the concat layer write directly into the concatenation output buffer
+ // (and so we eliminate the concatenation layer, because the channels
+ // are concatenated implicitly).
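+ // For example, the parallel branches of an Inception block can then write
+ // straight into their channel ranges of the shared concat output.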
+ Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
+ if( !concatLayer.empty() && concatLayer->axis == 1 &&
+ ld.outputBlobs.size() == 1 )
+ {
+ Mat& output = ld.outputBlobs[0];
+
+ // TODO: in general, this optimization can always be done, but
+ // many layers currently check that the input/output blobs are
+ // continuous arrays. Unfortunately, this is not true when
+ // the concatenation optimization is applied with batch_size > 1.
+ // So, for now, we only apply this optimization in the most popular
+ // case batch_size == 1.
+ if( output.dims == 4 && output.size[0] == 1 )
+ {
+ size_t i, ninputs = ld.inputBlobsId.size();
+ std::vector<LayerPin> realinputs(ninputs);
+ for( i = 0; i < ninputs; i++ )
+ {
+ LayerPin pin = ld.inputBlobsId[i];
+ LayerData* inp_i_data = &layers[pin.lid];
+ while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] &&
+ inp_i_data->inputBlobsId.size() == 1)
+ {
+ pin = inp_i_data->inputBlobsId[0];
+ inp_i_data = &layers[pin.lid];
+ }
+ printf_(("\treal input for %s is %s\n",
+ layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
+ inp_i_data->getLayerInstance()->name.c_str()));
+
+ if(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT])
+ break;
+ realinputs[i] = pin;
+ }
+
+ if( i >= ninputs )
+ {
+ Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
+ int ofs = 0;
+ for( i = 0; i < ninputs; i++ )
+ {
+ LayerPin pin = realinputs[i];
+ LayerData* inp_i_data = &layers[pin.lid];
+ int channels_i = ld.inputBlobs[i]->size[1];
+ chrange[1] = Range(ofs, ofs + channels_i);
+ printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
+ pin.oid, ofs, ofs + channels_i));
+ ofs += channels_i;
+ Mat output_slice = output(chrange);
+ Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
+ CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
+ curr_output = output_slice;
+ }
+ ld.skipFlags[DNN_BACKEND_DEFAULT] = true;
+ printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
+ }
}
}
}
CV_TRACE_FUNCTION();
CV_TRACE_ARG(backendId);
- impl->netWasAllocated = impl->netWasAllocated &&
- impl->preferableBackend == backendId;
- impl->preferableBackend = backendId;
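+ // changing the backend invalidates the current allocation, so the network
+ // is set up (and fused) again before the next forward pass.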
+ if( impl->preferableBackend != backendId )
+ {
+ impl->preferableBackend = backendId;
+ impl->netWasAllocated = false;
+ impl->clear();
+ }
}
void Net::setPreferableTarget(int targetId)
CV_TRACE_FUNCTION();
CV_TRACE_ARG(targetId);
- impl->netWasAllocated = impl->netWasAllocated &&
- impl->preferableTarget == targetId;
- impl->preferableTarget = targetId;
+ if( impl->preferableTarget != targetId )
+ {
+ impl->preferableTarget = targetId;
+ impl->netWasAllocated = false;
+ impl->clear();
+ }
}
void Net::setInputsNames(const std::vector<String> &inputBlobNames)
weights, blobs);
}
+void Net::enableFusion(bool fusion)
+{
+ if( impl->fusion != fusion )
+ {
+ impl->fusion = fusion;
+ impl->netWasAllocated = false;
+ impl->clear();
+ }
+}
+
void Net::setHalideScheduler(const String& scheduler)
{
CV_TRACE_FUNCTION();
bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
+bool Layer::setScale(const Ptr<ScaleLayer>&) { return false; }
+void Layer::unsetAttached()
+{
+ setActivation(Ptr<ActivationLayer>());
+ setBatchNorm(Ptr<BatchNormLayer>());
+ setScale(Ptr<ScaleLayer>());
+}
template <typename T>
static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels
}
+ class ChannelConcatInvoker : public ParallelLoopBody
+ {
+ public:
+ std::vector<Mat*>* inputs;
+ Mat* output;
+ int nstripes;
+ std::vector<const float*> chptrs;
+
+ static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
+ {
+ ChannelConcatInvoker cc;
+ cc.inputs = &inputs;
+ cc.output = &output;
+ cc.nstripes = nstripes;
+
+ size_t i, ninputs = inputs.size();
+ int nchannels = 0, batchsz = output.size[0];
+ for( i = 0; i < ninputs; i++ )
+ {
+ Mat& inp = *inputs[i];
+ CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
+ inp.dims == 4 && inp.size[0] == output.size[0] &&
+ inp.size[2] == output.size[2] &&
+ inp.size[3] == output.size[3] );
+ nchannels += inp.size[1];
+ }
+ CV_Assert( nchannels == output.size[1] );
+ CV_Assert( output.isContinuous() && output.type() == CV_32F );
+
+ cc.chptrs.resize(nchannels*batchsz);
+
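+ // chptrs[j*nchannels + c] points at the input plane that becomes output
+ // channel c of sample j; operator() then fills the output by plain memcpy.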
+ int ofs = 0;
+ for( i = 0; i < ninputs; i++)
+ {
+ Mat& inp = *inputs[i];
+ for( int j = 0; j < batchsz; j++ )
+ for( int k = 0; k < inp.size[1]; k++ )
+ {
+ const float* ptr = inp.ptr<float>(j, k);
+ cc.chptrs[ofs + j*nchannels + k] = ptr;
+ }
+ ofs += inp.size[1];
+ }
+
+ parallel_for_(Range(0, nstripes), cc, nstripes);
+ }
+
+ ChannelConcatInvoker() : inputs(0), output(0), nstripes(0) {}
+
+ void operator()(const Range& r) const
+ {
+ size_t planeSize = (size_t)output->size[2]*output->size[3];
+ size_t nch = chptrs.size();
+ size_t total = nch*planeSize;
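+ // view the output as chptrs.size() contiguous planes of planeSize floats,
+ // flattened and split into nstripes nearly equal stripes.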
+ size_t stripeSize = (total + nstripes - 1)/nstripes;
+ size_t stripeStart = r.start*stripeSize;
+ size_t stripeEnd = std::min(total, r.end*stripeSize);
+ const float** ptrs = (const float**)&chptrs[0];
+ float* outptr = output->ptr<float>();
+ size_t blockSize0 = 1 << 16;
+
+ for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
+ {
+ size_t ch = ofs0/planeSize;
+ size_t ofs = ofs0 - ch*planeSize;
+ // copy at most blockSize0 floats at a time, never crossing a source plane
+ // boundary or the stripe end (so concurrent stripes never overlap).
+ size_t blockSize = std::min(stripeEnd - ofs0, std::min(blockSize0, planeSize - ofs));
+ memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
+ ofs0 += blockSize;
+ }
+ }
+ };
+
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
int cAxis = clamp(axis, inputs[0]->dims);
Mat& outMat = outputs[0];
- std::vector<Range> ranges(outputs[0].dims, Range::all());
- ranges[cAxis].start = 0;
- for (size_t i = 0; i < inputs.size(); i++)
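+ // fast path for the common case: NCHW blobs concatenated along channels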
+ if( cAxis == 1 && outMat.dims == 4 )
+ {
+ int nstripes = getNumThreads();
+ ChannelConcatInvoker::run(inputs, outMat, nstripes);
+ }
+ else
{
- ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
- inputs[i]->copyTo(outMat(&ranges[0]));
- ranges[cAxis].start = ranges[cAxis].end;
+ std::vector<Range> ranges(outputs[0].dims, Range::all());
+
+ ranges[cAxis].start = 0;
+ for (size_t i = 0; i < inputs.size(); i++)
+ {
+ ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
+ inputs[i]->copyTo(outMat(&ranges[0]));
+ ranges[cAxis].start = ranges[cAxis].end;
+ }
}
}
std::vector<float> reluslope;
Ptr<ActivationLayer> activ;
Ptr<BatchNormLayer> bnorm;
+ Ptr<ScaleLayer> scaleLayer;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
{
+ // for now, a scale layer followed by a batch norm layer cannot be fused; only the reverse order is supported.
+ if( !scaleLayer.empty() )
+ return false;
bnorm = layer;
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
return !bnorm.empty();
}
+ bool setScale(const Ptr<ScaleLayer>& layer)
+ {
+ scaleLayer = layer;
+ // we will need to re-compute the weights with the scaling
+ // coefficients taken into account
+ weightsMat.release();
+ return !scaleLayer.empty();
+ }
+
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
biasvec[k] = biasMat.at<float>(k);
}
- if( !bnorm.empty() )
+ if( !bnorm.empty() || !scaleLayer.empty() )
{
- Mat scale, shift;
- bnorm->getScaleShift(scale, shift);
+ Mat scale, shift, scale2, shift2;
+ const float *scaleptr = 0, *shiftptr = 0;
+ const float *scaleptr2 = 0, *shiftptr2 = 0;
- CV_Assert( scale.isContinuous() && shift.isContinuous() &&
- scale.type() == CV_32F && shift.type() == CV_32F &&
- scale.total() == (size_t)outCn &&
- shift.total() == (size_t)outCn );
+ if( !bnorm.empty() )
+ {
+ bnorm->getScaleShift(scale, shift);
+ CV_Assert( scale.isContinuous() && shift.isContinuous() &&
+ scale.type() == CV_32F && shift.type() == CV_32F &&
+ scale.total() == (size_t)outCn &&
+ shift.total() == (size_t)outCn );
+ scaleptr = scale.ptr<float>();
+ shiftptr = shift.ptr<float>();
+ }
+ if( !scaleLayer.empty() )
+ {
+ scale2 = scaleLayer->blobs[0];
+ CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
+ scale2.total() == (size_t)outCn );
+ scaleptr2 = scale2.ptr<float>();
+ if( scaleLayer->hasBias )
+ {
+ shift2 = scaleLayer->blobs[1];
+ CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
+ shift2.total() == (size_t)outCn );
+ shiftptr2 = shift2.ptr<float>();
+ }
+ }
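+
+ // fold both affine transforms into the convolution:
+ // conv: y = w*x + b
+ // batch norm: y1 = s1*y + delta1
+ // scale: y2 = s2*y1 + delta2 = (s1*s2)*(w*x + b) + (delta1*s2 + delta2)
+ // hence w *= s1*s2 and b := b*(s1*s2) + delta1*s2 + delta2.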
for( int i = 0; i < outCn; i++ )
{
- float s = scale.at<float>(i);
- float delta = shift.at<float>(i);
+ float s1 = scaleptr ? scaleptr[i] : 1.f;
+ float delta1 = shiftptr ? shiftptr[i] : 0.f;
+ float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
+ float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
float* w_i = weightsMat.ptr<float>(i);
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
- w_i[j] *= s;
+ w_i[j] *= (s1*s2);
- biasvec[i] = biasvec[i]*s + delta;
+ biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
}
}
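+ // the two extra elements at the end of biasvec replicate the last real bias,
+ // presumably so vectorized kernels can safely read slightly past outCn.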
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
}
+ reluslope.clear();
if( activ )
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();