class CV_EXPORTS ActivationLayer;
class CV_EXPORTS BatchNormLayer;
+ class CV_EXPORTS ScaleLayer;
/** @brief This interface class allows one to build new Layers, which are the building blocks of networks.
*
*/
virtual bool setBatchNorm(const Ptr<BatchNormLayer>& layer);
+ /**
+ * @brief Tries to attach the subsequent scaling layer to this layer, i.e. performs a partial layer fusion.
+ * @param[in] layer The subsequent scaling layer.
+ *
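+ * The base class implementation simply returns false; layers that can fuse
+ * the scaling coefficients (e.g. convolution, see below) override this method.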
+ * @returns true if the scaling layer has been attached successfully.
+ */
+ virtual bool setScale(const Ptr<ScaleLayer>& layer);
+
+ /**
+ * @brief "Deattaches" all the layers, attached to particular layer.
+ */
+ virtual void unsetAttached();
+
virtual bool getMemoryShapes(const std::vector<MatShape> &inputs,
const int requiredOutputs,
std::vector<MatShape> &outputs,
/** @overload */
CV_WRAP void getLayerShapes(const std::vector<MatShape>& netInputShapes,
- const int layerId,
- std::vector<MatShape>* inLayerShapes,
- std::vector<MatShape>* outLayerShapes) const;
+ const int layerId,
+ std::vector<MatShape>* inLayerShapes,
+ std::vector<MatShape>* outLayerShapes) const;
+
/** @brief Computes FLOP for whole loaded model with specified input shapes.
* @param netInputShapes vector of shapes for all net inputs.
* @returns computed FLOP.
CV_WRAP int64 getFLOPS(const MatShape& netInputShape) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
- const std::vector<MatShape>& netInputShapes) const;
+ const std::vector<MatShape>& netInputShapes) const;
/** @overload */
CV_WRAP int64 getFLOPS(const int layerId,
- const MatShape& netInputShape) const;
+ const MatShape& netInputShape) const;
/** @brief Returns list of types for layer used in model.
* @param layersTypes output parameter for returning types.
CV_WRAP void getMemoryConsumption(const MatShape& netInputShape,
CV_OUT std::vector<int>& layerIds, CV_OUT std::vector<size_t>& weights,
CV_OUT std::vector<size_t>& blobs) const;
- private:
+ /** @brief Enables or disables layer fusion in the network.
+ * @param fusion true to enable fusion, false to disable it. Fusion is enabled by default.
+ */
+ CV_WRAP void enableFusion(bool fusion);
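+ // Minimal usage sketch (the model file names are placeholders):
+ // Net net = readNetFromCaffe("deploy.prototxt", "weights.caffemodel");
+ // net.enableFusion(false); // keep BatchNorm/Scale/ReLU as separate layers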
+
+ private:
struct Impl;
Ptr<Impl> impl;
};
}
}
- void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst)
+ void reuseOrCreate(const MatShape& shape, const LayerPin& lp, Mat& dst, bool force)
{
- std::map<LayerPin, Mat>::iterator hostIt;
- std::map<LayerPin, int>::iterator refIt;
-
- const int targetTotal = total(shape);
Mat bestBlob;
- int bestBlobTotal = INT_MAX;
LayerPin bestBlobPin;
- for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
+
+ if( !force )
{
- refIt = refCounter.find(hostIt->first);
- // Use only blobs that had references before because if not,
- // it might be used as output.
- if (refIt != refCounter.end() && refIt->second == 0)
+ std::map<LayerPin, Mat>::iterator hostIt;
+ std::map<LayerPin, int>::iterator refIt;
+
+ const int targetTotal = total(shape);
+ int bestBlobTotal = INT_MAX;
+
+ for (hostIt = memHosts.begin(); hostIt != memHosts.end(); ++hostIt)
{
- Mat& unusedBlob = hostIt->second;
- if (unusedBlob.total() >= targetTotal &&
- unusedBlob.total() < bestBlobTotal)
+ refIt = refCounter.find(hostIt->first);
+ // Use only blobs that have been reference-counted before; a blob
+ // without any recorded references might be a network output.
+ if (refIt != refCounter.end() && refIt->second == 0)
{
- bestBlobPin = hostIt->first;
- bestBlob = unusedBlob;
- bestBlobTotal = unusedBlob.total();
+ Mat& unusedBlob = hostIt->second;
+ if (unusedBlob.total() >= targetTotal &&
+ unusedBlob.total() < bestBlobTotal)
+ {
+ bestBlobPin = hostIt->first;
+ bestBlob = unusedBlob;
+ bestBlobTotal = unusedBlob.total();
+ }
}
}
}
}
void allocateBlobsForLayer(LayerData &ld, const LayerShapes& layerShapes,
- std::vector<LayerPin>& pinsForInternalBlobs)
+ std::vector<LayerPin>& pinsForInternalBlobs,
+ bool maximizeReuse)
{
CV_TRACE_FUNCTION();
}
std::map<int, std::vector<int> >::reverse_iterator it;
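+ // unless reuse is maximized (as for the Halide backend), layers with several
+ // inputs always get a freshly allocated output buffer rather than a reused one.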
+ bool force = !maximizeReuse && ld.inputBlobsId.size() > 1;
for(it = idxSizes.rbegin(); it != idxSizes.rend(); it++)
{
for(int j = 0; j < it->second.size(); j++)
if (total(shapes[index]))
{
LayerPin blobPin(ld.id, index);
- if (index < outShapes.size() && inPlace)
+ if (index < outShapes.size() && inPlace && !force)
{
CV_Assert(ld.inputBlobs[0]->total() == total(shapes[index]));
ld.outputBlobs[index] = ld.inputBlobs[0]->reshape(1, shapes[index]);
}
else
{
- reuseOrCreate(shapes[index], blobPin, *blobs[index]);
+ reuseOrCreate(shapes[index], blobPin, *blobs[index], force);
}
}
}
lastLayerId = 1;
netWasAllocated = false;
+ fusion = true;
preferableBackend = DNN_BACKEND_DEFAULT;
preferableTarget = DNN_TARGET_CPU;
}
int lastLayerId;
bool netWasAllocated;
+ bool fusion;
void compileHalide()
{
if( currLayer.empty() )
continue;
- currLayer->setActivation(Ptr<ActivationLayer>());
- currLayer->setBatchNorm(Ptr<BatchNormLayer>());
+ currLayer->unsetAttached();
Ptr<PoolingLayer> poolingLayer = currLayer.dynamicCast<PoolingLayer>();
if( !poolingLayer.empty() )
poolingLayer->computeMaxIdx = true;
}
}
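+ // layer #0 is the special network-input pseudo-layer; it performs no real
+ // computation, so it is always marked as skipped.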
+ it = layers.find(0);
+ CV_Assert(it != layers.end());
+ it->second.skipFlags[DNN_BACKEND_DEFAULT] = true;
}
-
void setUpNet(const std::vector<LayerPin>& blobsToKeep_ = std::vector<LayerPin>())
{
CV_TRACE_FUNCTION();
LayerData& getLayerData(const DictValue &layerDesc)
{
+ CV_Assert(layerDesc.isInt() || layerDesc.isString());
if (layerDesc.isInt())
return getLayerData(layerDesc.get<int>());
- else if (layerDesc.isString())
+ else /*if (layerDesc.isString())*/
return getLayerData(layerDesc.get<String>());
-
- CV_Assert(layerDesc.isInt() || layerDesc.isString());
- return *((LayerData*)NULL);
}
static void addLayerInput(LayerData &ld, int inNum, LayerPin from)
CV_Assert(layerShapesIt != layersShapes.end());
std::vector<LayerPin> pinsForInternalBlobs;
- blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs);
+ bool maximizeReuse = preferableBackend == DNN_BACKEND_HALIDE;
+ blobManager.allocateBlobsForLayer(ld, layerShapesIt->second, pinsForInternalBlobs, maximizeReuse);
Ptr<Layer> layerPtr = ld.getLayerInstance();
{
ld.flag = 1;
}
+#if 0
+#define printf_(args) printf args
+#else
+#define printf_(args)
+#endif
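+// flip the "#if 0" above to "#if 1" to enable the fusion debug trace below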
+
void fuseLayers(const std::vector<LayerPin>& blobsToKeep_)
{
+ if( !fusion || preferableBackend == DNN_BACKEND_HALIDE )
+ return;
+
CV_TRACE_FUNCTION();
// scan through all the layers. If there is convolution layer followed by the activation layer,
LayerData& ld = layers[lid];
if( ld.skipFlags[DNN_BACKEND_DEFAULT] )
{
+ printf_(("skipped %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
continue;
}
+ printf_(("analyzing %s: %s\n", ld.layerInstance->name.c_str(), ld.layerInstance->type.c_str()));
if( ld.consumers.size() == 0 )
outnames.push_back(ld.layerInstance->name);
+ // the optimization #1. try to fuse batch norm, scaling and/or activation layers
+ // with the current layer if they follow it. Normally, they are fused with the convolution layer,
+ // but some of them (like activation) may be fused with fully-connected, elemwise (+) and
+ // some other layers.
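+ // For example, Conv -> BatchNorm -> Scale -> ReLU collapses into a single
+ // convolution with re-computed weights and biases (see the weight folding
+ // in the convolution layer further below).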
Ptr<Layer>& currLayer = ld.layerInstance;
if( ld.consumers.size() == 1 && pinsToKeep.count(LayerPin(lid, 0)) == 0 )
{
nextData = 0;
if( currLayer->setBatchNorm(nextBNormLayer) )
{
+ printf_(("\tfused with %s\n", nextBNormLayer->name.c_str()));
bnormData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
if( bnormData->consumers.size() == 1 )
nextData = &layers[bnormData->consumers[0].lid];
+ lpNext = LayerPin(bnormData->consumers[0].lid, 0);
+ }
+ }
+
+ Ptr<ScaleLayer> nextScaleLayer;
+ if( nextData )
+ nextScaleLayer = nextData->layerInstance.dynamicCast<ScaleLayer>();
+ if( !nextScaleLayer.empty() && pinsToKeep.count(lpNext) == 0 )
+ {
+ LayerData* scaleData = nextData;
+ nextData = 0;
+ if( currLayer->setScale(nextScaleLayer) )
+ {
+ printf_(("\tfused with %s\n", nextScaleLayer->name.c_str()));
+ scaleData->skipFlags[DNN_BACKEND_DEFAULT] = true;
+ ld.outputBlobs = layers[lpNext.lid].outputBlobs;
+ if( scaleData->consumers.size() == 1 )
+ nextData = &layers[scaleData->consumers[0].lid];
}
}
if( !nextActivLayer.empty() && currLayer->setActivation(nextActivLayer) )
{
- //printf("successfully merged %s and %s\n", currLayer->name.c_str(), nextActivLayer->name.c_str());
+ printf_(("\tfused with %s\n", nextActivLayer->name.c_str()));
nextData->skipFlags[DNN_BACKEND_DEFAULT] = true;
ld.outputBlobs = layers[lpNext.lid].outputBlobs;
}
}
+
+ // the optimization #2. if there is no layer that takes max pooling layer's computed
+ // max indices (and only some semantic segmentation networks might need this;
+ // many others only take the maximum values), then we switch the max pooling
+ // layer to the faster operating mode.
Ptr<PoolingLayer> poolingLayer = ld.layerInstance.dynamicCast<PoolingLayer>();
if( !poolingLayer.empty() && !ld.consumers.empty() )
{
if( i >= nconsumers )
{
poolingLayer->computeMaxIdx = false;
- //printf("simplified pooling layer %s\n", poolingLayer->name.c_str());
+ printf_(("\tsimplified pooling layer %s\n", poolingLayer->name.c_str()));
+ }
+ }
+
+ // the optimization #3. if there is concat layer that concatenates channels
+ // from the inputs together (i.e. axis == 1) then we make the inputs of
+ // the concat layer write directly into the concatenation output buffer
+ // (and so we eliminate the concatenation layer, because the channels
+ // are concatenated implicitly).
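+ // For example, the parallel branches of an Inception block can then write
+ // straight into their channel ranges of the shared concat output.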
+ Ptr<ConcatLayer> concatLayer = ld.layerInstance.dynamicCast<ConcatLayer>();
+ if( !concatLayer.empty() && concatLayer->axis == 1 &&
+ ld.outputBlobs.size() == 1 )
+ {
+ Mat& output = ld.outputBlobs[0];
+
+ // TODO: in general, this optimization can always be done, but
+ // many layers currently check that the input/output blobs are
+ // continuous arrays. Unfortunately, this is not true when
+ // the concatenation optimization is applied with batch_size > 1.
+ // So, for now, we only apply this optimization in the most popular
+ // case batch_size == 1.
+ if( output.dims == 4 && output.size[0] == 1 )
+ {
+ size_t i, ninputs = ld.inputBlobsId.size();
+ std::vector<LayerPin> realinputs(ninputs);
+ for( i = 0; i < ninputs; i++ )
+ {
+ LayerPin pin = ld.inputBlobsId[i];
+ LayerData* inp_i_data = &layers[pin.lid];
+ while(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT] &&
+ inp_i_data->inputBlobsId.size() == 1)
+ {
+ pin = inp_i_data->inputBlobsId[0];
+ inp_i_data = &layers[pin.lid];
+ }
+ printf_(("\treal input for %s is %s\n",
+ layers[ld.inputBlobsId[i].lid].getLayerInstance()->name.c_str(),
+ inp_i_data->getLayerInstance()->name.c_str()));
+
+ if(inp_i_data->skipFlags[DNN_BACKEND_DEFAULT])
+ break;
+ realinputs[i] = pin;
+ }
+
+ if( i >= ninputs )
+ {
+ Range chrange[] = { Range::all(), Range::all(), Range::all(), Range::all() };
+ int ofs = 0;
+ for( i = 0; i < ninputs; i++ )
+ {
+ LayerPin pin = realinputs[i];
+ LayerData* inp_i_data = &layers[pin.lid];
+ int channels_i = ld.inputBlobs[i]->size[1];
+ chrange[1] = Range(ofs, ofs + channels_i);
+ printf_(("\toutput %s(%d) to channels (%d, %d)\n", inp_i_data->layerInstance->name.c_str(),
+ pin.oid, ofs, ofs + channels_i));
+ ofs += channels_i;
+ Mat output_slice = output(chrange);
+ Mat& curr_output = inp_i_data->outputBlobs[pin.oid];
+ CV_Assert(output_slice.isContinuous() && output_slice.size == curr_output.size);
+ curr_output = output_slice;
+ }
+ ld.skipFlags[DNN_BACKEND_DEFAULT] = true;
+ printf_(("\toptimized out Concat layer %s\n", concatLayer->name.c_str()));
+ }
}
}
}
CV_TRACE_FUNCTION();
CV_TRACE_ARG(backendId);
- impl->netWasAllocated = impl->netWasAllocated &&
- impl->preferableBackend == backendId;
- impl->preferableBackend = backendId;
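+ // changing the backend invalidates the current allocation, so the network
+ // is set up (and fused) again before the next forward pass.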
+ if( impl->preferableBackend != backendId )
+ {
+ impl->preferableBackend = backendId;
+ impl->netWasAllocated = false;
+ impl->clear();
+ }
}
void Net::setPreferableTarget(int targetId)
CV_TRACE_FUNCTION();
CV_TRACE_ARG(targetId);
- impl->netWasAllocated = impl->netWasAllocated &&
- impl->preferableTarget == targetId;
- impl->preferableTarget = targetId;
+ if( impl->preferableTarget != targetId )
+ {
+ impl->preferableTarget = targetId;
+ impl->netWasAllocated = false;
+ impl->clear();
+ }
}
void Net::setInputsNames(const std::vector<String> &inputBlobNames)
weights, blobs);
}
+void Net::enableFusion(bool fusion)
+{
+ if( impl->fusion != fusion )
+ {
+ impl->fusion = fusion;
+ impl->netWasAllocated = false;
+ impl->clear();
+ }
+}
+
void Net::setHalideScheduler(const String& scheduler)
{
CV_TRACE_FUNCTION();
bool Layer::setActivation(const Ptr<ActivationLayer>&) { return false; }
bool Layer::setBatchNorm(const Ptr<BatchNormLayer>&) { return false; }
+bool Layer::setScale(const Ptr<ScaleLayer>&) { return false; }
+void Layer::unsetAttached()
+{
+ setActivation(Ptr<ActivationLayer>());
+ setBatchNorm(Ptr<BatchNormLayer>());
+ setScale(Ptr<ScaleLayer>());
+}
template <typename T>
static void vecToPVec(const std::vector<T> &v, std::vector<T*> &pv)
backendId == DNN_BACKEND_HALIDE && haveHalide() && axis == 1; // By channels
}
+ class ChannelConcatInvoker : public ParallelLoopBody
+ {
+ public:
+ std::vector<Mat*>* inputs;
+ Mat* output;
+ int nstripes;
+ std::vector<const float*> chptrs;
+
+ static void run(std::vector<Mat*>& inputs, Mat& output, int nstripes)
+ {
+ ChannelConcatInvoker cc;
+ cc.inputs = &inputs;
+ cc.output = &output;
+ cc.nstripes = nstripes;
+
+ size_t i, ninputs = inputs.size();
+ int nchannels = 0, batchsz = output.size[0];
+ for( i = 0; i < ninputs; i++ )
+ {
+ Mat& inp = *inputs[i];
+ CV_Assert( inp.isContinuous() && inp.type() == CV_32F &&
+ inp.dims == 4 && inp.size[0] == output.size[0] &&
+ inp.size[2] == output.size[2] &&
+ inp.size[3] == output.size[3] );
+ nchannels += inp.size[1];
+ }
+ CV_Assert( nchannels == output.size[1] );
+ CV_Assert( output.isContinuous() && output.type() == CV_32F );
+
+ cc.chptrs.resize(nchannels*batchsz);
+
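+ // chptrs[j*nchannels + c] points at the input plane that becomes output
+ // channel c of sample j; operator() then fills the output by plain memcpy.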
+ int ofs = 0;
+ for( i = 0; i < ninputs; i++)
+ {
+ Mat& inp = *inputs[i];
+ for( int j = 0; j < batchsz; j++ )
+ for( int k = 0; k < inp.size[1]; k++ )
+ {
+ const float* ptr = inp.ptr<float>(j, k);
+ cc.chptrs[ofs + j*nchannels + k] = ptr;
+ }
+ ofs += inp.size[1];
+ }
+
+ parallel_for_(Range(0, nstripes), cc, nstripes);
+ }
+
+ ChannelConcatInvoker() : inputs(0), output(0), nstripes(0) {}
+
+ void operator()(const Range& r) const
+ {
+ size_t planeSize = (size_t)output->size[2]*output->size[3];
+ size_t nch = chptrs.size();
+ size_t total = nch*planeSize;
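+ // view the output as chptrs.size() contiguous planes of planeSize floats,
+ // flattened and split into nstripes nearly equal stripes.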
+ size_t stripeSize = (total + nstripes - 1)/nstripes;
+ size_t stripeStart = r.start*stripeSize;
+ size_t stripeEnd = std::min(total, r.end*stripeSize);
+ const float** ptrs = (const float**)&chptrs[0];
+ float* outptr = output->ptr<float>();
+ size_t blockSize0 = 1 << 16;
+
+ for( size_t ofs0 = stripeStart; ofs0 < stripeEnd; )
+ {
+ size_t ch = ofs0/planeSize;
+ size_t ofs = ofs0 - ch*planeSize;
+ // copy at most blockSize0 floats at a time, never crossing a source plane
+ // boundary or the stripe end (so concurrent stripes never overlap).
+ size_t blockSize = std::min(stripeEnd - ofs0, std::min(blockSize0, planeSize - ofs));
+ memcpy(outptr + ofs0, ptrs[ch] + ofs, blockSize*sizeof(outptr[0]));
+ ofs0 += blockSize;
+ }
+ }
+ };
+
void forward(std::vector<Mat*> &inputs, std::vector<Mat> &outputs, std::vector<Mat> &internals)
{
CV_TRACE_FUNCTION();
int cAxis = clamp(axis, inputs[0]->dims);
Mat& outMat = outputs[0];
- std::vector<Range> ranges(outputs[0].dims, Range::all());
- ranges[cAxis].start = 0;
- for (size_t i = 0; i < inputs.size(); i++)
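+ // fast path for the common case: NCHW blobs concatenated along channels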
+ if( cAxis == 1 && outMat.dims == 4 )
+ {
+ int nstripes = getNumThreads();
+ ChannelConcatInvoker::run(inputs, outMat, nstripes);
+ }
+ else
{
- ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
- inputs[i]->copyTo(outMat(&ranges[0]));
- ranges[cAxis].start = ranges[cAxis].end;
+ std::vector<Range> ranges(outputs[0].dims, Range::all());
+
+ ranges[cAxis].start = 0;
+ for (size_t i = 0; i < inputs.size(); i++)
+ {
+ ranges[cAxis].end = ranges[cAxis].start + inputs[i]->size[cAxis];
+ inputs[i]->copyTo(outMat(&ranges[0]));
+ ranges[cAxis].start = ranges[cAxis].end;
+ }
}
}
std::vector<float> reluslope;
Ptr<ActivationLayer> activ;
Ptr<BatchNormLayer> bnorm;
+ Ptr<ScaleLayer> scaleLayer;
MatShape computeColRowShape(const MatShape &inpShape, const MatShape &outShape) const
{
bool setBatchNorm(const Ptr<BatchNormLayer>& layer )
{
+ // for now, a scale layer followed by a batch norm layer cannot be fused; only the reverse order is supported.
+ if( !scaleLayer.empty() )
+ return false;
bnorm = layer;
// we will need to re-compute the weights with the batch
// norm coefficients taken into account
return !bnorm.empty();
}
+ bool setScale(const Ptr<ScaleLayer>& layer)
+ {
+ scaleLayer = layer;
+ // we will need to re-compute the weights with the scaling
+ // coefficients taken into account
+ weightsMat.release();
+ return !scaleLayer.empty();
+ }
+
virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> > &inputs)
{
#ifdef HAVE_HALIDE
biasvec[k] = biasMat.at<float>(k);
}
- if( !bnorm.empty() )
+ if( !bnorm.empty() || !scaleLayer.empty() )
{
- Mat scale, shift;
- bnorm->getScaleShift(scale, shift);
+ Mat scale, shift, scale2, shift2;
+ const float *scaleptr = 0, *shiftptr = 0;
+ const float *scaleptr2 = 0, *shiftptr2 = 0;
- CV_Assert( scale.isContinuous() && shift.isContinuous() &&
- scale.type() == CV_32F && shift.type() == CV_32F &&
- scale.total() == (size_t)outCn &&
- shift.total() == (size_t)outCn );
+ if( !bnorm.empty() )
+ {
+ bnorm->getScaleShift(scale, shift);
+ CV_Assert( scale.isContinuous() && shift.isContinuous() &&
+ scale.type() == CV_32F && shift.type() == CV_32F &&
+ scale.total() == (size_t)outCn &&
+ shift.total() == (size_t)outCn );
+ scaleptr = scale.ptr<float>();
+ shiftptr = shift.ptr<float>();
+ }
+ if( !scaleLayer.empty() )
+ {
+ scale2 = scaleLayer->blobs[0];
+ CV_Assert( scale2.isContinuous() && scale2.type() == CV_32F &&
+ scale2.total() == (size_t)outCn );
+ scaleptr2 = scale2.ptr<float>();
+ if( scaleLayer->hasBias )
+ {
+ shift2 = scaleLayer->blobs[1];
+ CV_Assert( shift2.isContinuous() && shift2.type() == CV_32F &&
+ shift2.total() == (size_t)outCn );
+ shiftptr2 = shift2.ptr<float>();
+ }
+ }
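+
+ // fold both affine transforms into the convolution:
+ // conv: y = w*x + b
+ // batch norm: y1 = s1*y + delta1
+ // scale: y2 = s2*y1 + delta2 = (s1*s2)*(w*x + b) + (delta1*s2 + delta2)
+ // hence w *= s1*s2 and b := b*(s1*s2) + delta1*s2 + delta2.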
for( int i = 0; i < outCn; i++ )
{
- float s = scale.at<float>(i);
- float delta = shift.at<float>(i);
+ float s1 = scaleptr ? scaleptr[i] : 1.f;
+ float delta1 = shiftptr ? shiftptr[i] : 0.f;
+ float s2 = scaleptr2 ? scaleptr2[i] : 1.f;
+ float delta2 = shiftptr2 ? shiftptr2[i] : 0.f;
float* w_i = weightsMat.ptr<float>(i);
int j, wcols = weightsMat.cols;
for( j = 0; j < wcols; j++ )
- w_i[j] *= s;
+ w_i[j] *= (s1*s2);
- biasvec[i] = biasvec[i]*s + delta;
+ biasvec[i] = biasvec[i]*(s1*s2) + (delta1*s2 + delta2);
}
}
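+ // the two extra elements at the end of biasvec replicate the last real bias,
+ // presumably so vectorized kernels can safely read slightly past outCn.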
biasvec[outCn] = biasvec[outCn+1] = biasvec[outCn-1];
}
+ reluslope.clear();
if( activ )
{
Ptr<ReLULayer> activ_relu = activ.dynamicCast<ReLULayer>();