Merge pull request #22840 from zihaomu:optimze_conv_memory_usage

author Zihao Mu <zihaomu@outlook.com>

Thu, 8 Dec 2022 12:57:13 +0000 (20:57 +0800)

committer GitHub <noreply@github.com>

Thu, 8 Dec 2022 12:57:13 +0000 (12:57 +0000)
author Zihao Mu <zihaomu@outlook.com>
Thu, 8 Dec 2022 12:57:13 +0000 (20:57 +0800)
committer GitHub <noreply@github.com>
Thu, 8 Dec 2022 12:57:13 +0000 (12:57 +0000)
diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp

index 46db47bc4c8f93dd648c739a4574968666893b79..2fb150bb3598f993d095be7ce89ef9dadd5eb1ef 100644 (file)
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -198,6 +198,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
  
  PERF_TEST_P_(DNNTestNetwork, YOLOv3)
  {
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
      if (backend == DNN_BACKEND_HALIDE)
          throw SkipTestException("");
  #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
@@ -220,6 +221,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
  
  PERF_TEST_P_(DNNTestNetwork, YOLOv4)
  {
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
      if (backend == DNN_BACKEND_HALIDE)
          throw SkipTestException("");
      if (target == DNN_TARGET_MYRIAD)  // not enough resources
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp

index bc1acd0f72b48b98ef4ffb3c95eca94824e0cd26..cc395932324c3aecae6b4b58175440d4501f0847 100644 (file)
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -2112,8 +2112,11 @@ public:
                  int dilation_h = dilations[dilations.size() - 2];
                  int dilation_w = dilations.back();
  
+                // Winograd only works well when both input h and w are >= 12.
+                bool canUseWinograd = useWinograd && inputs[0].size[2] >= 12 && inputs[0].size[3] >= 12;
+
                  fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h, dilation_w,
-                                                dilation_h, pads_begin, pads_end, weightsMat, &biasvec[0], useWinograd);
+                                                dilation_h, pads_begin, pads_end, weightsMat, &biasvec[0], canUseWinograd);
              }
  
              if (fastConv2dImpl)
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp

index 8002f396f133be2c95a74adff9e94df435b3748c..fba57e7ee0e7ae18bbe5f3507e2c1ae32779a895 100644 (file)
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -83,86 +83,85 @@ Ptr<FastConv2d> initFastConv2d(
                  weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k];
          }});
      }
-    else
+    else if(conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
      {
-        if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
+        static const float ktm[8][3] = {
+                {1.0f,      0.0f,      0.0f},
+                {-2.0f / 9, -2.0f / 9, -2.0f / 9},
+                {-2.0f / 9, 2.0f / 9, -2.0f / 9},
+                {1.0f / 90, 1.0f / 45, 2.0f / 45},
+                {1.0f / 90, -1.0f / 45, 2.0f / 45},
+                {32.f/45, 16.f/45, 8.f/45},
+                {32.f/45, -16.f/45, 8.f/45},
+                {0.0f, 0.0f, 1.0f}
+        };
+
+        // the weights are packed as 6-dim tensor:
+        // ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
+        // where W is the size of Winograd-transformed kernel (8x8),
+        // ATOM_SIZE is number of lanes in SIMD register (4 for NEON and FP32),
+        // KBLOCK is some platform-dependent constant dependent on the number of SIMD registers.
+        int ksize = _FX_WINO_KSIZE * _FX_WINO_KSIZE;
+        int Cg = C/ngroups;
+        int Kg = K/ngroups;
+        int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK;
+        size_t nweights = ngroups*Kg_nblocks*Cg*_FX_WINO_KBLOCK*_FX_WINO_AREA;
+        conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN);
+        conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN);
+        float* wptrWino = conv->weightsWinoBufPtr;
+        memset(wptrWino, 0, nweights * sizeof(wptrWino[0]));
+
+        parallel_for_(Range(0, K), [&](const Range& r0){
+        float kernelTm[_FX_WINO_AREA];
+        for (int k = r0.start; k < r0.end; k++)
          {
-            static const float ktm[8][3] = {
-                    {1.0f,      0.0f,      0.0f},
-                    {-2.0f / 9, -2.0f / 9, -2.0f / 9},
-                    {-2.0f / 9, 2.0f / 9, -2.0f / 9},
-                    {1.0f / 90, 1.0f / 45, 2.0f / 45},
-                    {1.0f / 90, -1.0f / 45, 2.0f / 45},
-                    {32.f/45, 16.f/45, 8.f/45},
-                    {32.f/45, -16.f/45, 8.f/45},
-                    {0.0f, 0.0f, 1.0f}
-            };
-
-            // the weights are packed as 6-dim tensor:
-            // ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
-            // where W is the size of Winograd-transformed kernel (8x8),
-            // ATOM_SIZE is number of lanes in SIMD register (4 for NEON and FP32),
-            // KBLOCK is some platform-dependent constant dependent on the number of SIMD registers.
-            int ksize = _FX_WINO_KSIZE * _FX_WINO_KSIZE;
-            int Cg = C/ngroups;
-            int Kg = K/ngroups;
-            int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK;
-            size_t nweights = ngroups*Kg_nblocks*Cg*_FX_WINO_KBLOCK*_FX_WINO_AREA;
-            conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN);
-            conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN);
-            float* wptrWino = conv->weightsWinoBufPtr;
-            memset(wptrWino, 0, nweights * sizeof(wptrWino[0]));
-
-            parallel_for_(Range(0, K), [&](const Range& r0){
-            float kernelTm[_FX_WINO_AREA];
-            for (int k = r0.start; k < r0.end; k++)
+            int g = k / Kg;
+            int k_ = k - g*Kg;
+            int ki = k_ / _FX_WINO_KBLOCK;
+            int dk = k_ - ki*_FX_WINO_KBLOCK;
+
+            for (int c = 0; c < Cg; c++)
              {
-                int g = k / Kg;
-                int k_ = k - g*Kg;
-                int ki = k_ / _FX_WINO_KBLOCK;
-                int dk = k_ - ki*_FX_WINO_KBLOCK;
+                // wstep = Hk*Wk*Cg
+                const float *kernel0 = srcWeights + k * wstep + c * ksize;
+
+                // transform kernel, transposed
+                const float *k0 = kernel0;
+                const float *k1 = kernel0 + 3;
+                const float *k2 = kernel0 + 6;
  
-                for (int c = 0; c < Cg; c++)
+                // h
+                float tmp[8][3];
+                for (int i = 0; i < 8; i++)
                  {
-                    // wstep = Hk*Wk*Cg
-                    const float *kernel0 = srcWeights + k * wstep + c * ksize;
+                    tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
+                    tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
+                    tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
+                }
  
-                    // transform kernel, transposed
-                    const float *k0 = kernel0;
-                    const float *k1 = kernel0 + 3;
-                    const float *k2 = kernel0 + 6;
+                // v
+                for (int j = 0; j < 8; j++)
+                {
+                    float *tmpp = &tmp[j][0];
  
-                    // h
-                    float tmp[8][3];
                      for (int i = 0; i < 8; i++)
-                    {
-                        tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
-                        tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
-                        tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
-                    }
-
-                    // v
-                    for (int j = 0; j < 8; j++)
-                    {
-                        float *tmpp = &tmp[j][0];
-
-                        for (int i = 0; i < 8; i++)
-                            kernelTm[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
-                    }
-
-                    // repack the data.
-                    float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *_FX_WINO_KBLOCK*_FX_WINO_AREA +
-                                  (c*_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32;
-                    for (int i = 0; i < _FX_WINO_NATOMS_F32; i++,
-                            wptr += Cg * _FX_WINO_KBLOCK * _FX_WINO_ATOM_F32)
-                    {
-                        CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + _FX_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
-                        memcpy(wptr, kernelTm + i * _FX_WINO_ATOM_F32, _FX_WINO_ATOM_F32*sizeof (wptr[0]));
-                    }
+                        kernelTm[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                  }
-            }});
-        }
  
+                // repack the data.
+                float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *_FX_WINO_KBLOCK*_FX_WINO_AREA +
+                              (c*_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32;
+                for (int i = 0; i < _FX_WINO_NATOMS_F32; i++,
+                        wptr += Cg * _FX_WINO_KBLOCK * _FX_WINO_ATOM_F32)
+                {
+                    CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + _FX_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
+                    memcpy(wptr, kernelTm + i * _FX_WINO_ATOM_F32, _FX_WINO_ATOM_F32*sizeof (wptr[0]));
+                }
+            }
+        }});
+    }
+    else if (conv->conv_type == _FX_CONV_TYPE_GENERIC)
+    {
          // The weights are packed as
          // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk) x CONV_MR tensor
          int Kg = K/ngroups, Cg = max(C/ngroups, 1);
@@ -202,6 +201,8 @@ Ptr<FastConv2d> initFastConv2d(
              }
          }});
      }
+    else
+        CV_Error(CV_StsUnsupportedFormat, "Unknown convolution type.");
  
      // store bias; append some zero's to make sure that
      // we can always read MR elements starting from any valid index
@@ -271,7 +272,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>
          CV_Assert(fusedAddMat.empty()); // Depthwise-Convolution layer should not be followed by Add layer.
          return runDepthwise(input, output, conv, minval, maxval, activ, ifMinMaxAct);
      }
-    else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
+    else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
      {
          CV_Assert(conv->weightsWinoBufPtr);
          if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp

index bab49a970800b873d90aaf556c2659ad4f1d7a43..9b8765b92dab5478027fbb1a38773daef5908abe 100644 (file)
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -29,7 +29,7 @@ public:
      void processNet(std::string weights, std::string proto,
                      Mat inp, const std::string& outputLayer = "",
                      std::string halideScheduler = "",
-                    double l1 = 0.0, double lInf = 0.0, double detectionConfThresh = 0.2)
+                    double l1 = 0.0, double lInf = 0.0, double detectionConfThresh = 0.2, bool useWinograd = true)
      {
          checkBackend();
          l1 = l1 ? l1 : default_l1;
@@ -49,6 +49,7 @@ public:
          net.setInput(inp);
          net.setPreferableBackend(backend);
          net.setPreferableTarget(target);
+        net.enableWinograd(useWinograd);
          if (backend == DNN_BACKEND_HALIDE && !halideScheduler.empty())
          {
              halideScheduler = findDataFile(halideScheduler);
@@ -347,7 +348,8 @@ TEST_P(DNNTestNetwork, SSD_VGG16)
      }
  
      processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",
-               "dnn/ssd_vgg16.prototxt", inp, "detection_out", "", scoreDiff, iouDiff);
+               "dnn/ssd_vgg16.prototxt", inp, "detection_out", "", scoreDiff,
+               iouDiff, 0.2, false);
      expectNoFallbacksFromIE(net);
  }
  
diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp

index 27265f880ae7a374f068c2cdd692ab22706e89be..4d11193d9611bc2e6bd16e58e27bd13b1dceb4e6 100644 (file)
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -81,6 +81,7 @@ TEST(Test_Darknet, read_yolo_voc_stream)
          Net net = readNetFromDarknet(cfgFile, weightsFile);
          net.setInput(inp);
          net.setPreferableBackend(DNN_BACKEND_OPENCV);
+        net.enableWinograd(false);
          ref = net.forward();
      }
      // Import from bytes array.
@@ -92,6 +93,7 @@ TEST(Test_Darknet, read_yolo_voc_stream)
          Net net = readNetFromDarknet(cfg.data(), cfg.size(), weights.data(), weights.size());
          net.setInput(inp);
          net.setPreferableBackend(DNN_BACKEND_OPENCV);
+        net.enableWinograd(false);
          Mat out = net.forward();
          normAssert(ref, out);
      }
@@ -178,7 +180,8 @@ public:
                            const std::vector<std::vector<int> >& refClassIds,
                            const std::vector<std::vector<float> >& refConfidences,
                            const std::vector<std::vector<Rect2d> >& refBoxes,
-                          double scoreDiff, double iouDiff, float confThreshold = 0.24, float nmsThreshold = 0.4)
+                          double scoreDiff, double iouDiff, float confThreshold = 0.24,
+                          float nmsThreshold = 0.4, bool useWinograd = true)
      {
          checkBackend();
  
@@ -198,6 +201,7 @@ public:
                            findDataFile("dnn/" + weights, false));
          net.setPreferableBackend(backend);
          net.setPreferableTarget(target);
+        net.enableWinograd(useWinograd);
          net.setInput(inp);
          std::vector<Mat> outs;
          net.forward(outs, net.getUnconnectedOutLayersNames());
@@ -280,18 +284,19 @@ public:
                            const std::vector<int>& refClassIds,
                            const std::vector<float>& refConfidences,
                            const std::vector<Rect2d>& refBoxes,
-                          double scoreDiff, double iouDiff, float confThreshold = 0.24, float nmsThreshold = 0.4)
+                          double scoreDiff, double iouDiff, float confThreshold = 0.24,
+                          float nmsThreshold = 0.4, bool useWinograd = true)
      {
          testDarknetModel(cfg, weights,
                           std::vector<std::vector<int> >(1, refClassIds),
                           std::vector<std::vector<float> >(1, refConfidences),
                           std::vector<std::vector<Rect2d> >(1, refBoxes),
-                         scoreDiff, iouDiff, confThreshold, nmsThreshold);
+                         scoreDiff, iouDiff, confThreshold, nmsThreshold, useWinograd);
      }
  
      void testDarknetModel(const std::string& cfg, const std::string& weights,
                            const cv::Mat& ref, double scoreDiff, double iouDiff,
-                          float confThreshold = 0.24, float nmsThreshold = 0.4)
+                          float confThreshold = 0.24, float nmsThreshold = 0.4, bool useWinograd = true)
      {
          CV_Assert(ref.cols == 7);
          std::vector<std::vector<int> > refClassIds;
@@ -318,7 +323,7 @@ public:
              refBoxes[batchId].push_back(box);
          }
          testDarknetModel(cfg, weights, refClassIds, refScores, refBoxes,
-                         scoreDiff, iouDiff, confThreshold, nmsThreshold);
+                         scoreDiff, iouDiff, confThreshold, nmsThreshold, useWinograd);
      }
  };
  
@@ -396,7 +401,7 @@ TEST_P(Test_Darknet_nets, YoloVoc)
  
      {
      SCOPED_TRACE("batch size 1");
-    testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff);
+    testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff, 0.24, 0.4, false);
      }
  
  #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -410,7 +415,7 @@ TEST_P(Test_Darknet_nets, YoloVoc)
  #endif
      {
      SCOPED_TRACE("batch size 2");
-    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, nmsThreshold);
+    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, nmsThreshold, false);
      }
  
  #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -599,7 +604,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
  {
      applyTestTag(
              CV_TEST_TAG_LONG,
-            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_MEMORY_2GB,
              CV_TEST_TAG_DEBUG_VERYLONG
      );
  
@@ -656,7 +661,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
  
      {
          SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, 0.24, 0.4, false);
      }
  
  #if defined(INF_ENGINE_RELEASE)
@@ -674,7 +679,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
  
      {
          SCOPED_TRACE("batch size 2");
-        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, 0.4, false);
      }
  }
  
@@ -682,7 +687,7 @@ TEST_P(Test_Darknet_nets, YOLOv4)
  {
      applyTestTag(
              CV_TEST_TAG_LONG,
-            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_MEMORY_2GB,
              CV_TEST_TAG_DEBUG_VERYLONG
              );
  
@@ -756,7 +761,7 @@ TEST_P(Test_Darknet_nets, YOLOv4)
  
      {
          SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, 0.24, 0.4, false);
      }
  
      {
@@ -792,7 +797,7 @@ TEST_P(Test_Darknet_nets, YOLOv4)
          }
  #endif
  
-        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, 0.4, false);
      }
  
  #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -877,7 +882,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
  {
      applyTestTag(
              CV_TEST_TAG_LONG,
-            (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+            CV_TEST_TAG_MEMORY_2GB,
              CV_TEST_TAG_DEBUG_VERYLONG
              );
  
@@ -939,7 +944,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
  
      {
          SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, 0.24, 0.4, false);
      }
  
      {
@@ -958,7 +963,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
          }
  #endif
  
-        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, 0.4, false);
      }
  }
author	Zihao Mu <zihaomu@outlook.com>
	Thu, 8 Dec 2022 12:57:13 +0000 (20:57 +0800)
committer	GitHub <noreply@github.com>
	Thu, 8 Dec 2022 12:57:13 +0000 (12:57 +0000)
modules/dnn/perf/perf_net.cpp		patch \| blob \| history
modules/dnn/src/layers/convolution_layer.cpp		patch \| blob \| history
modules/dnn/src/layers/fast_convolution/fast_convolution.cpp		patch \| blob \| history
modules/dnn/test/test_backends.cpp		patch \| blob \| history
modules/dnn/test/test_darknet_importer.cpp		patch \| blob \| history