From 0a650b573b29b150385d15240e418c45559d92a0 Mon Sep 17 00:00:00 2001
From: Zihao Mu
Date: Thu, 8 Dec 2022 20:57:13 +0800
Subject: [PATCH] Merge pull request #22840 from zihaomu:optimze_conv_memory_usage

DNN: reduce the memory used in convolution layer

* reduce the memory used in winograd and disable the tests whose memory usage is larger than 2gb.

* remove VERY_LOG tag
---
 modules/dnn/perf/perf_net.cpp                     |   2 +
 modules/dnn/src/layers/convolution_layer.cpp      |   5 +-
 .../layers/fast_convolution/fast_convolution.cpp  | 143 +++++++++++----------
 modules/dnn/test/test_backends.cpp                |   6 +-
 modules/dnn/test/test_darknet_importer.cpp        |  37 +++---
 5 files changed, 103 insertions(+), 90 deletions(-)

diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index 46db47b..2fb150b 100644
--- a/modules/dnn/perf/perf_net.cpp
+++ b/modules/dnn/perf/perf_net.cpp
@@ -198,6 +198,7 @@ PERF_TEST_P_(DNNTestNetwork, Inception_v2_SSD_TensorFlow)
 
 PERF_TEST_P_(DNNTestNetwork, YOLOv3)
 {
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2020040000)  // nGraph compilation failure
@@ -220,6 +221,7 @@ PERF_TEST_P_(DNNTestNetwork, YOLOv3)
 
 PERF_TEST_P_(DNNTestNetwork, YOLOv4)
 {
+    applyTestTag(CV_TEST_TAG_MEMORY_2GB);
     if (backend == DNN_BACKEND_HALIDE)
         throw SkipTestException("");
     if (target == DNN_TARGET_MYRIAD)  // not enough resources
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index bc1acd0..cc39593 100644
--- a/modules/dnn/src/layers/convolution_layer.cpp
+++ b/modules/dnn/src/layers/convolution_layer.cpp
@@ -2112,8 +2112,11 @@ public:
             int dilation_h = dilations[dilations.size() - 2];
             int dilation_w = dilations.back();
 
+            // Winograd only works well when the input h and w are >= 12.
+            bool canUseWinograd = useWinograd && inputs[0].size[2] >= 12 && inputs[0].size[3] >= 12;
+
             fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h, dilation_w,
-                                            dilation_h, pads_begin, pads_end, weightsMat, &biasvec[0], useWinograd);
+                                            dilation_h, pads_begin, pads_end, weightsMat, &biasvec[0], canUseWinograd);
         }
 
         if (fastConv2dImpl)
diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
index 8002f39..fba57e7 100644
--- a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
+++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp
@@ -83,86 +83,85 @@ Ptr<FastConv2d> initFastConv2d(
                 weightsBufPtr[c*padded_ksize + k] = srcWeights[c*wstep + k];
         }});
     }
-    else
+    else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
     {
-        if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
+        static const float ktm[8][3] = {
+            {1.0f, 0.0f, 0.0f},
+            {-2.0f / 9, -2.0f / 9, -2.0f / 9},
+            {-2.0f / 9, 2.0f / 9, -2.0f / 9},
+            {1.0f / 90, 1.0f / 45, 2.0f / 45},
+            {1.0f / 90, -1.0f / 45, 2.0f / 45},
+            {32.f/45, 16.f/45, 8.f/45},
+            {32.f/45, -16.f/45, 8.f/45},
+            {0.0f, 0.0f, 1.0f}
+        };
+
+        // the weights are packed as a 6-dim tensor:
+        // ngroups * ceil((K/ngroups)/KBLOCK) * (W*W/ATOM_SIZE) * (C/ngroups) * KBLOCK * ATOM_SIZE,
+        // where W is the size of the Winograd-transformed kernel (8x8),
+        // ATOM_SIZE is the number of lanes in a SIMD register (4 for NEON and FP32),
+        // KBLOCK is a platform-dependent constant determined by the number of SIMD registers.
+        int ksize = _FX_WINO_KSIZE * _FX_WINO_KSIZE;
+        int Cg = C/ngroups;
+        int Kg = K/ngroups;
+        int Kg_nblocks = (Kg + _FX_WINO_KBLOCK - 1)/_FX_WINO_KBLOCK;
+        size_t nweights = ngroups*Kg_nblocks*Cg*_FX_WINO_KBLOCK*_FX_WINO_AREA;
+        conv->weightsWinoBuf.reserve(nweights + VEC_ALIGN);
+        conv->weightsWinoBufPtr = alignPtr(conv->weightsWinoBuf.data(), VEC_ALIGN);
+        float* wptrWino = conv->weightsWinoBufPtr;
+        memset(wptrWino, 0, nweights * sizeof(wptrWino[0]));
+
+        parallel_for_(Range(0, K), [&](const Range& r0){
+        float kernelTm[_FX_WINO_AREA];
+        for (int k = r0.start; k < r0.end; k++)
+        {
+            int g = k / Kg;
+            int k_ = k - g*Kg;
+            int ki = k_ / _FX_WINO_KBLOCK;
+            int dk = k_ - ki*_FX_WINO_KBLOCK;
+
+            for (int c = 0; c < Cg; c++)
+            {
+                // wstep = Hk*Wk*Cg
+                const float *kernel0 = srcWeights + k * wstep + c * ksize;
+
+                // transform kernel, transposed
+                const float *k0 = kernel0;
+                const float *k1 = kernel0 + 3;
+                const float *k2 = kernel0 + 6;
+
+                // h
+                float tmp[8][3];
+                for (int i = 0; i < 8; i++)
+                {
+                    tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
+                    tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
+                    tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
+                }
+
+                // v
+                for (int j = 0; j < 8; j++)
+                {
+                    float *tmpp = &tmp[j][0];
+
+                    for (int i = 0; i < 8; i++)
+                        kernelTm[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
+                }
+
+                // repack the data.
+                float* wptr = wptrWino + (g*Kg_nblocks + ki) * Cg *_FX_WINO_KBLOCK*_FX_WINO_AREA +
+                              (c*_FX_WINO_KBLOCK + dk)*_FX_WINO_ATOM_F32;
+                for (int i = 0; i < _FX_WINO_NATOMS_F32; i++,
+                        wptr += Cg * _FX_WINO_KBLOCK * _FX_WINO_ATOM_F32)
+                {
+                    CV_Assert(conv->weightsWinoBufPtr <= wptr && wptr + _FX_WINO_ATOM_F32 <= conv->weightsWinoBufPtr + nweights);
+                    memcpy(wptr, kernelTm + i * _FX_WINO_ATOM_F32, _FX_WINO_ATOM_F32*sizeof(wptr[0]));
+                }
+            }
+        }});
+    }
+    else if (conv->conv_type == _FX_CONV_TYPE_GENERIC)
+    {
         // The weights are packed as
         // ngroups x (ceil((K/ngroups)/CONV_MR)*CONV_MR) x (Cg*Hk*Wk) x CONV_MR tensor
         int Kg = K/ngroups, Cg = max(C/ngroups, 1);
@@ -202,6 +201,8 @@ Ptr<FastConv2d> initFastConv2d(
         }
     }});
     }
+    else
+        CV_Error(CV_StsUnsupportedFormat, "Unknown convolution type.");
 
     // store bias; append some zero's to make sure that
     // we can always read MR elements starting from any valid index
@@ -271,7 +272,7 @@ void runFastConv2d(InputArray _input, OutputArray _output, const Ptr<FastConv2d>& conv
         CV_Assert(fusedAddMat.empty()); // Depthwise-Convolution layer should not be followed by Add layer.
         return runDepthwise(input, output, conv, minval, maxval, activ, ifMinMaxAct);
     }
-    else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3 && inputShape[2] >= 12 && inputShape[3] >= 12) // winograd
+    else if (conv->conv_type == _FX_CONV_TYPE_WINOGRAD3X3) // winograd
     {
         CV_Assert(conv->weightsWinoBufPtr);
         if (runWinograd63(input, fusedAddMat, output, conv, ntasks, minval, maxval, activ, ifMinMaxAct))
diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp
index bab49a9..9b8765b 100644
--- a/modules/dnn/test/test_backends.cpp
+++ b/modules/dnn/test/test_backends.cpp
@@ -29,7 +29,7 @@ public:
     void processNet(std::string weights, std::string proto, Mat inp,
                     const std::string& outputLayer = "",
                     std::string halideScheduler = "",
-                    double l1 = 0.0, double lInf = 0.0, double detectionConfThresh = 0.2)
+                    double l1 = 0.0, double lInf = 0.0, double detectionConfThresh = 0.2, bool useWinograd = true)
     {
         checkBackend();
         l1 = l1 ? l1 : default_l1;
@@ -49,6 +49,7 @@ public:
         net.setInput(inp);
         net.setPreferableBackend(backend);
         net.setPreferableTarget(target);
+        net.enableWinograd(useWinograd);
         if (backend == DNN_BACKEND_HALIDE && !halideScheduler.empty())
         {
             halideScheduler = findDataFile(halideScheduler);
@@ -347,7 +348,8 @@ TEST_P(DNNTestNetwork, SSD_VGG16)
     }
 
     processNet("dnn/VGG_ILSVRC2016_SSD_300x300_iter_440000.caffemodel",
-               "dnn/ssd_vgg16.prototxt", inp, "detection_out", "", scoreDiff, iouDiff);
+               "dnn/ssd_vgg16.prototxt", inp, "detection_out", "", scoreDiff,
+               iouDiff, 0.2, false);
     expectNoFallbacksFromIE(net);
 }
 
diff --git a/modules/dnn/test/test_darknet_importer.cpp b/modules/dnn/test/test_darknet_importer.cpp
index 27265f8..4d11193 100644
--- a/modules/dnn/test/test_darknet_importer.cpp
+++ b/modules/dnn/test/test_darknet_importer.cpp
@@ -81,6 +81,7 @@ TEST(Test_Darknet, read_yolo_voc_stream)
     Net net = readNetFromDarknet(cfgFile, weightsFile);
     net.setInput(inp);
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
+    net.enableWinograd(false);
     ref = net.forward();
 }
 // Import from bytes array.
@@ -92,6 +93,7 @@ TEST(Test_Darknet, read_yolo_voc_stream)
     Net net = readNetFromDarknet(cfg.data(), cfg.size(), weights.data(), weights.size());
     net.setInput(inp);
     net.setPreferableBackend(DNN_BACKEND_OPENCV);
+    net.enableWinograd(false);
     Mat out = net.forward();
     normAssert(ref, out);
 }
@@ -178,7 +180,8 @@ public:
                           const std::vector<std::vector<int> >& refClassIds,
                           const std::vector<std::vector<float> >& refConfidences,
                           const std::vector<std::vector<Rect2d> >& refBoxes,
-                          double scoreDiff, double iouDiff, float confThreshold = 0.24, float nmsThreshold = 0.4)
+                          double scoreDiff, double iouDiff, float confThreshold = 0.24,
+                          float nmsThreshold = 0.4, bool useWinograd = true)
     {
         checkBackend();
 
@@ -198,6 +201,7 @@ public:
                         findDataFile("dnn/" + weights, false));
         net.setPreferableBackend(backend);
         net.setPreferableTarget(target);
+        net.enableWinograd(useWinograd);
         net.setInput(inp);
         std::vector<Mat> outs;
         net.forward(outs, net.getUnconnectedOutLayersNames());
@@ -280,18 +284,19 @@ public:
                           const std::vector<int>& refClassIds,
                           const std::vector<float>& refConfidences,
                           const std::vector<Rect2d>& refBoxes,
-                          double scoreDiff, double iouDiff, float confThreshold = 0.24, float nmsThreshold = 0.4)
+                          double scoreDiff, double iouDiff, float confThreshold = 0.24,
+                          float nmsThreshold = 0.4, bool useWinograd = true)
     {
         testDarknetModel(cfg, weights,
                          std::vector<std::vector<int> >(1, refClassIds),
                          std::vector<std::vector<float> >(1, refConfidences),
                          std::vector<std::vector<Rect2d> >(1, refBoxes),
-                         scoreDiff, iouDiff, confThreshold, nmsThreshold);
+                         scoreDiff, iouDiff, confThreshold, nmsThreshold, useWinograd);
     }
 
     void testDarknetModel(const std::string& cfg, const std::string& weights,
                           const cv::Mat& ref, double scoreDiff, double iouDiff,
-                          float confThreshold = 0.24, float nmsThreshold = 0.4)
+                          float confThreshold = 0.24, float nmsThreshold = 0.4, bool useWinograd = true)
     {
         CV_Assert(ref.cols == 7);
         std::vector<std::vector<int> > refClassIds;
@@ -318,7 +323,7 @@ public:
             refBoxes[batchId].push_back(box);
         }
         testDarknetModel(cfg, weights, refClassIds, refScores, refBoxes,
-                         scoreDiff, iouDiff, confThreshold, nmsThreshold);
+                         scoreDiff, iouDiff, confThreshold, nmsThreshold, useWinograd);
     }
 };
 
@@ -396,7 +401,7 @@ TEST_P(Test_Darknet_nets, YoloVoc)
     {
         SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff, 0.24, 0.4, false);
     }
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -410,7 +415,7 @@ TEST_P(Test_Darknet_nets, YoloVoc)
 #endif
 
     {
         SCOPED_TRACE("batch size 2");
-        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, nmsThreshold);
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, nmsThreshold, false);
     }
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -599,7 +604,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
 {
     applyTestTag(
         CV_TEST_TAG_LONG,
-        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_MEMORY_2GB,
         CV_TEST_TAG_DEBUG_VERYLONG
     );
 
@@ -656,7 +661,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
 
     {
         SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, 0.24, 0.4, false);
    }
 
 #if defined(INF_ENGINE_RELEASE)
@@ -674,7 +679,7 @@ TEST_P(Test_Darknet_nets, YOLOv3)
 
     {
         SCOPED_TRACE("batch size 2");
-        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, 0.4, false);
     }
 }
 
@@ -682,7 +687,7 @@ TEST_P(Test_Darknet_nets, YOLOv4)
 {
     applyTestTag(
         CV_TEST_TAG_LONG,
-        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_MEMORY_2GB,
         CV_TEST_TAG_DEBUG_VERYLONG
     );
 
@@ -756,7 +761,7 @@ TEST_P(Test_Darknet_nets, YOLOv4)
 
     {
         SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, 0.24, 0.4, false);
     }
 
     {
@@ -792,7 +797,7 @@ TEST_P(Test_Darknet_nets, YOLOv4)
     }
 #endif
 
-    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, 0.4, false);
 }
 
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2022010000)
@@ -877,7 +882,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
 {
     applyTestTag(
         CV_TEST_TAG_LONG,
-        (target == DNN_TARGET_CPU ? CV_TEST_TAG_MEMORY_1GB : CV_TEST_TAG_MEMORY_2GB),
+        CV_TEST_TAG_MEMORY_2GB,
         CV_TEST_TAG_DEBUG_VERYLONG
     );
 
@@ -939,7 +944,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
 
     {
         SCOPED_TRACE("batch size 1");
-        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff);
+        testDarknetModel(config_file, weights_file, ref.rowRange(0, N0), scoreDiff, iouDiff, 0.24, 0.4, false);
     }
 
     {
@@ -958,7 +963,7 @@ TEST_P(Test_Darknet_nets, YOLOv4x_mish)
     }
 #endif
 
-    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff);
+    testDarknetModel(config_file, weights_file, ref, scoreDiff, iouDiff, 0.24, 0.4, false);
 }
-- 
2.7.4