From 59b870a87ad22d89027e28eee3b8be1f3db1c0bf Mon Sep 17 00:00:00 2001 From: Zihao Mu Date: Fri, 1 Jul 2022 18:03:15 +0800 Subject: [PATCH] Merge pull request #21910 from zihaomu:fast_conv_ARM DNN: Accelerating convolution * Fast Conv of ARM, X86 and universal intrinsics. * improve code style. * error fixed. * improve the License * optimize memory allocated and Adjust the threshold. * change FasterRCNN_vgg16 to 2GB memory. --- modules/dnn/src/layers/convolution_layer.cpp | 54 +- .../fast_convolution/depthwise_convolution.cpp | 385 ++++++ .../fast_convolution/fast_convolution.avx2.cpp | 361 ++++++ .../layers/fast_convolution/fast_convolution.cpp | 694 ++++++++++ .../layers/fast_convolution/fast_convolution.hpp | 89 ++ .../fast_convolution/fast_convolution.simd.hpp | 342 +++++ .../layers/fast_convolution/winograd_3x3s1_f63.cpp | 1351 ++++++++++++++++++++ modules/dnn/test/test_backends.cpp | 2 +- modules/dnn/test/test_caffe_importer.cpp | 4 +- modules/dnn/test/test_int8_layers.cpp | 14 +- modules/dnn/test/test_tf_importer.cpp | 2 +- 11 files changed, 3286 insertions(+), 12 deletions(-) create mode 100644 modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.cpp create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.hpp create mode 100644 modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp create mode 100644 modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 0bf39f9..1244433 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -71,6 +71,8 @@ using namespace cv::dnn::ocl4dnn; using namespace cv::dnn::cuda4dnn; #endif +#include "fast_convolution/fast_convolution.hpp" + namespace cv { namespace dnn @@ -253,11 +255,14 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl { public: enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F }; - Mat weightsMat; + Mat weightsMat; // Used to store weight params. It will be used for layer fusion and memory alignment. std::vector biasvec; std::vector reluslope; Ptr activ; + Mat fastWeights; // Used to store weight params. It will be used for layer fusion and without memory alignment. + Ptr fastConv2dImpl; + #ifdef HAVE_OPENCL Ptr > convolutionOp; std::vector umat_blobs; @@ -433,6 +438,7 @@ public: wm.copyTo(wm_aligned); wm = wm_aligned; } + fastWeights = blobs[0].reshape(1, numOutput); weightsMat = wm; } else @@ -628,14 +634,26 @@ public: if (weightsMat.data == blobs[0].data) weightsMat = weightsMat.clone(); + // If fastWeights is the same as weightsMat, we don't need to allocate more space for fastWeights. + bool sameFastWeights = false; + if (fastWeights.step1() == weightsMat.step1()) // If weightsMat is realigned, it is not the same as fastWeights. 
+ sameFastWeights = true; + + if (!sameFastWeights && fastWeights.data == blobs[0].data) + fastWeights = fastWeights.clone(); + Mat originWeights = blobs[0].reshape(1, outCn); for (int i = 0; i < outCn; ++i) { double wi = w.at(i); weightsMultipliers[i] *= wi; cv::multiply(originWeights.row(i), weightsMultipliers[i], weightsMat.row(i)); + if (!sameFastWeights) + cv::multiply(originWeights.row(i), weightsMultipliers[i], fastWeights.row(i)); biasvec[i] *= wi; } + if (sameFastWeights) + fastWeights = weightsMat; } if (!b.empty()) @@ -1948,8 +1966,13 @@ public: int outCn = blobs.empty() ? inputs[1].size[0] : blobs[0].size[0]; // Need to align non-const blobs + bool variableWeight = false; if (blobs.empty()) { + variableWeight = true; + if (fastWeights.data != inputs[1].data) + fastWeights = inputs[1].clone(); + Mat wm = inputs[1].reshape(1, outCn); if (wm.data != weightsMat.data) { @@ -2066,8 +2089,37 @@ public: { int nstripes = std::max(getNumThreads(), 1); + // Initialization of FastCovn2d + if ((!fastConv2dImpl || variableWeight) && inputs[0].dims == 4) + { + int K = outputs[0].size[1]; + int C = inputs[0].size[1]; + int Hk = kernel_size[kernel_size.size() - 2]; + int Wk = kernel_size.back(); + + CV_Assert(outputs[0].size[1] % ngroups == 0); + int stride_h = strides[strides.size() - 2]; + int stride_w = strides.back(); + + int dilation_h = dilations[dilations.size() - 2]; + int dilation_w = dilations.back(); + float* weightsPtr = fastWeights.ptr(); + CV_Assert(weightsPtr); + + fastConv2dImpl = initFastConv2d(ngroups, K, C, Hk, Wk, stride_w, stride_h, + dilation_w, dilation_h, pads_begin, pads_end, weightsPtr, &biasvec[0]); + } + + if (fastConv2dImpl) + { + runFastConv2d(inputs[0], outputs[0], fastConv2dImpl, nstripes, activ); + return; + } + + // Use only for Conv1D and Conv3D. ParallelConv::run(inputs[0], outputs[0], weightsMat, biasvec, reluslope, kernel_size, strides, pads_begin, pads_end, dilations, activ.get(), ngroups, nstripes); + } } diff --git a/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp new file mode 100644 index 0000000..c98c3d6 --- /dev/null +++ b/modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp @@ -0,0 +1,385 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx). +// Here is the original license: +/* + This file is a part of ficus language project. 
+ See ficus/LICENSE for the licensing terms +*/ + +#include "../../precomp.hpp" +#include "fast_convolution.hpp" + +namespace cv { namespace dnn { + +static void depthWiseBlock(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab, + float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left, + int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop, + int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3) +{ +#ifdef CV_SIMD128 + v_float32x4 vminval = v_setall_f32(minval), vmaxval = v_setall_f32(maxval); + + v_float32x4 w0 = v_setall_f32( + 0.f), w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0; + if (useSIMD) + { + vbias = v_setall_f32(biasval); + if (is3x3) + { + w0 = v_setall_f32(weights[0]); + w1 = v_setall_f32(weights[1]); + w2 = v_setall_f32(weights[2]); + w3 = v_setall_f32(weights[3]); + w4 = v_setall_f32(weights[4]); + w5 = v_setall_f32(weights[5]); + w6 = v_setall_f32(weights[6]); + w7 = v_setall_f32(weights[7]); + w8 = v_setall_f32(weights[8]); + } + } +#endif + int dy0 = 1; + for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0) + { +#ifdef CV_SIMD128 + dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1 + ? 3 : 1; +#endif + int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? inner_xleft : W0; + int yi_ = y0 * stride_y - pad_top; + + for (;;) + { + float s_0, s_1, s_2; + if (dy0 == 3) + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + s_0 = s_1 = s_2 = biasval; + for (int k = 0; k < ksize; k++) + { + int dy = yxtab[k * 2]; + int yi = yi_ + dy; + int xi = xi_ + yxtab[k * 2 + 1]; + float w = weights[k]; + + if ((unsigned) xi < (unsigned) Wi) + { + s_0 += inptr[yi * Wi + xi] * w; + s_1 += inptr[(yi + 1) * Wi + xi] * w; + s_2 += inptr[(yi + 2) * Wi + xi] * w; + } + } + s_0 = std::min(std::max(s_0, minval), maxval); + s_1 = std::min(std::max(s_1, minval), maxval); + s_2 = std::min(std::max(s_2, minval), maxval); + outptr[x0] = s_0; + outptr[x0 + W0] = s_1; + outptr[x0 + W0 * 2] = s_2; + } + } + else + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + s_0 = biasval; + for (int k = 0; k < ksize; k++) { + int dy = yxtab[k * 2]; + int yi = yi_ + dy; + int xi = xi_ + yxtab[k * 2 + 1]; + float w = weights[k]; + if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi)) + s_0 += inptr[yi * Wi + xi] * w; + } + s_0 = std::min(std::max(s_0, minval), maxval); + outptr[x0] = s_0; + } + } + if (x0 == W0) + break; + x1 = inner_xright; +#ifdef CV_SIMD128 + if (useSIMD) + { + if (is3x3) + { + if (dy0 == 3) + { + for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + Wi * yi_ + xi_; + + v_float32x4 s0, s1, s2; + v_float32x4 x00 = v_load(inptr_xi); + v_float32x4 x01 = v_load(inptr_xi + 1); + v_float32x4 x02 = v_load(inptr_xi + 2); + + v_float32x4 x10 = v_load(inptr_xi + Wi); + v_float32x4 x11 = v_load(inptr_xi + Wi + 1); + v_float32x4 x12 = v_load(inptr_xi + Wi + 2); + + v_float32x4 x20 = v_load(inptr_xi + Wi * 2); + v_float32x4 x21 = v_load(inptr_xi + Wi * 2 + 1); + v_float32x4 x22 = v_load(inptr_xi + Wi * 2 + 2); + + v_float32x4 x30 = v_load(inptr_xi + Wi * 3); + v_float32x4 x31 = v_load(inptr_xi + Wi * 3 + 1); + v_float32x4 x32 = v_load(inptr_xi + Wi * 3 + 2); + + v_float32x4 x40 = v_load(inptr_xi + Wi * 4); + v_float32x4 x41 = v_load(inptr_xi + Wi * 4 + 1); + 
v_float32x4 x42 = v_load(inptr_xi + Wi * 4 + 2); + + s0 = v_fma(x00, w0, vbias); + s1 = v_fma(x10, w0, vbias); + s2 = v_fma(x20, w0, vbias); + + s0 = v_fma(x01, w1, s0); + s1 = v_fma(x11, w1, s1); + s2 = v_fma(x21, w1, s2); + + s0 = v_fma(x02, w2, s0); + s1 = v_fma(x12, w2, s1); + s2 = v_fma(x22, w2, s2); + + s0 = v_fma(x10, w3, s0); + s1 = v_fma(x20, w3, s1); + s2 = v_fma(x30, w3, s2); + + s0 = v_fma(x11, w4, s0); + s1 = v_fma(x21, w4, s1); + s2 = v_fma(x31, w4, s2); + + s0 = v_fma(x12, w5, s0); + s1 = v_fma(x22, w5, s1); + s2 = v_fma(x32, w5, s2); + + s0 = v_fma(x20, w6, s0); + s1 = v_fma(x30, w6, s1); + s2 = v_fma(x40, w6, s2); + + s0 = v_fma(x21, w7, s0); + s1 = v_fma(x31, w7, s1); + s2 = v_fma(x41, w7, s2); + + s0 = v_fma(x22, w8, s0); + s1 = v_fma(x32, w8, s1); + s2 = v_fma(x42, w8, s2); + + if (ifMinMaxAct) + { + s0 = v_min(v_max(s0, vminval), vmaxval); + s1 = v_min(v_max(s1, vminval), vmaxval); + s2 = v_min(v_max(s2, vminval), vmaxval); + } + + v_store(outptr + x0, s0); + v_store(outptr + W0 + x0, s1); + v_store(outptr + W0 * 2 + x0, s2); + } + } + else + { + for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + Wi * yi_ + xi_; + v_float32x4 s0 = v_fma(v_load(inptr_xi + ofstab[0]), w0, vbias); + v_float32x4 s1 = v_load(inptr_xi + ofstab[1]) * w1; + v_float32x4 s2 = v_load(inptr_xi + ofstab[2]) * w2; + + s0 = v_fma(v_load(inptr_xi + ofstab[3]), w3, s0); + s1 = v_fma(v_load(inptr_xi + ofstab[4]), w4, s1); + s2 = v_fma(v_load(inptr_xi + ofstab[5]), w5, s2); + + s0 = v_fma(v_load(inptr_xi + ofstab[6]), w6, s0); + s1 = v_fma(v_load(inptr_xi + ofstab[7]), w7, s1); + s2 = v_fma(v_load(inptr_xi + ofstab[8]), w8, s2); + + s0 = s0 + s1 + s2; + if (ifMinMaxAct) + s0 = v_min(v_max(s0, vminval), vmaxval); + v_store(outptr + x0, s0); + } + } + } + else + { + for (; x0 <= x1 - FAST_VEC_NLANES; x0 += FAST_VEC_NLANES) + { + int xi_ = x0 * stride_x - pad_left, k = 0; + const float *inptr_xi = inptr + Wi * yi_ + xi_; + v_float32x4 s0 = vbias; + for (; k <= ksize - 4; k += 4) + { + v_float32x4 v0 = v_load(inptr_xi + ofstab[k]); + v_float32x4 v1 = v_load(inptr_xi + ofstab[k + 1]); + v_float32x4 v2 = v_load(inptr_xi + ofstab[k + 2]); + v_float32x4 v3 = v_load(inptr_xi + ofstab[k + 3]); + + v_float32x4 ww0 = v_setall_f32(weights[k]); + v_float32x4 ww1 = v_setall_f32(weights[k+1]); + v_float32x4 ww2 = v_setall_f32(weights[k+2]); + v_float32x4 ww3 = v_setall_f32(weights[k+3]); + + s0 = v_fma(v0, ww0, s0); + s0 = v_fma(v1, ww1, s0); + s0 = v_fma(v2, ww2, s0); + s0 = v_fma(v3, ww3, s0); + } + for (; k < ksize; k++) + s0 = v_fma(v_load(inptr_xi + ofstab[k]), + v_setall_f32(weights[k]), s0); + if (ifMinMaxAct) + s0 = v_min(v_max(s0, vminval), vmaxval); + v_store(outptr + x0, s0); + } + } + } +#endif + if (dy0 == 3) + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + W0 * yi_ + xi_; + s_0 = s_1 = s_2 = biasval; + for (int k = 0; k < ksize; k++) + { + int inp_ofs = ofstab[k]; + float w = weights[k]; + s_0 += inptr_xi[inp_ofs] * w; + s_1 += inptr_xi[inp_ofs + Wi] * w; + s_2 += inptr_xi[inp_ofs + Wi * 2] * w; + } + if (ifMinMaxAct) + { + s_0 = std::min(std::max(s_0, minval), maxval); + s_1 = std::min(std::max(s_1, minval), maxval); + s_2 = std::min(std::max(s_2, minval), maxval); + } + + outptr[x0] = s_0; + outptr[x0 + W0] = s_1; + outptr[x0 + W0 * 2] = s_2; + } + } + else + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + Wi * yi_ 
+ xi_; + s_0 = biasval; + for (int k = 0; k < ksize; k++) + { + s_0 += inptr_xi[ofstab[k]] * weights[k]; + } + + if (ifMinMaxAct) + s_0 = std::min(std::max(s_0, minval), maxval); + outptr[x0] = s_0; + } + } + x1 = W0; + } + } +} + +void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct) { + Mat input = _input.getMat(); + Mat output = _output.getMat(); + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); + + int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] + int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; + int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups; + + const size_t inp_planesize = (size_t) Hi * Wi; + const size_t out_planesize = (size_t) H0 * W0; + + CV_Assert(ngroups > 1 && ngroups == K && ngroups == C); + + int stride_y = conv->stride_y, stride_x = conv->stride_x; + int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x; + + int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; + int pad_left = conv->pad_left, pad_right = conv->pad_right; + + int ksize = Hk * Wk, padded_ksize = ((ksize + FAST_VEC_NLANES - 1) / FAST_VEC_NLANES) * FAST_VEC_NLANES; + + const float *inp = input.ptr(); + float *out = output.ptr(); + + std::vector ofstab_(3 * padded_ksize, 0); + int *ofstab = ofstab_.data(); + int *yxtab = ofstab + padded_ksize; + + for (int k = 0; k < padded_ksize; k++) + { + int y = k < ksize ? k / Wk : 0; + int x = k < ksize ? k % Wk : 0; + int dy = y * dilation_y, dx = x * dilation_x; + yxtab[k * 2] = dy; + yxtab[k * 2 + 1] = dx; + ofstab[k] = dy * Wi + dx; + } + + const float *weights0 = conv->weightsBuf.data(), *bias = conv->biasBuf.data(); + int inner_ytop = (pad_bottom + stride_y - 1) / stride_y, inner_ybottom = 3; + int inner_xleft = (pad_left + stride_x - 1) / stride_x, inner_xright = 4; + + CV_Assert(ksize > 1 || (pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0)); + + inner_xright = (Wi - (Wk - 1) * dilation_x + pad_left) / stride_x; + inner_xright += inner_xright * stride_x - pad_left + (Wk - 1) * dilation_x < Wi; + inner_ybottom = (Hi - (Hk - 1) * dilation_y + pad_top) / stride_y; + inner_ybottom += inner_ybottom * stride_y - pad_top + (Hk - 1) * dilation_y < Hi; + + if (inner_xleft >= inner_xright || inner_ytop >= inner_ybottom) + { + inner_xleft = W0; + inner_ytop = H0; + } + + inner_ybottom = inner_ybottom < H0 ? 
inner_ybottom : H0; + + bool useSIMD = stride_x == 1 && inner_xleft < W0; + bool is3x3 = Hk == 3 && Wk == 3; + + parallel_for_(Range(0, N * C), [&](const Range &r0) { + for (int nc = r0.start; nc < r0.end; nc++) + { + int c = nc % C; + const float *inptr = inp + inp_planesize * nc; + float *outptr0 = out + out_planesize * nc; + + float biasval = bias[c]; + const float *weights = weights0 + c * padded_ksize; + +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX2::depthWiseBlock_AVX2(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize, + pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop, + inner_ybottom, ifMinMaxAct, useSIMD, is3x3); + else +#endif + depthWiseBlock(inptr, outptr0, weights, biasval, ofstab, yxtab, minval, maxval, Hi, Wi, H0, W0, ksize, + pad_top, pad_left, dilation_y, stride_x, stride_y, inner_xleft, inner_xright, inner_ytop, + inner_ybottom, ifMinMaxAct, useSIMD, is3x3); + + if (activ) + activ->forwardSlice(outptr0, outptr0, (int) out_planesize, out_planesize, c, c+1); + } + }); +} + +}} // namespace cv::dnn \ No newline at end of file diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp new file mode 100644 index 0000000..22580c5 --- /dev/null +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.avx2.cpp @@ -0,0 +1,361 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#include "../../precomp.hpp" +#include "fast_convolution.hpp" + +namespace cv { +namespace opt_AVX2 +{ +#if CV_TRY_AVX2 +void convBlock_AVX2(int k, const float *a, const float *b, + float *c, int ldc, const float *bias, + float minval, float maxval, bool ifActiv) +{ +#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24 + __m256 vminval = _mm256_set1_ps(minval), vmaxval = _mm256_set1_ps(maxval); + __m256 c0 = _mm256_set1_ps(bias[0]), c1 = c0, c2 = c0; + __m256 c3 = _mm256_set1_ps(bias[1]), c4 = c3, c5 = c3; + __m256 c6 = _mm256_set1_ps(bias[2]), c7 = c6, c8 = c6; + __m256 c9 = _mm256_set1_ps(bias[3]), c10 = c9, c11 = c9; + + __m256 a0 = _mm256_setzero_ps(), a1 = _mm256_setzero_ps(); + __m256 b0 = _mm256_setzero_ps(), b1 = _mm256_setzero_ps(), b2 = _mm256_setzero_ps(); + + for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR) + { + a0 = _mm256_set1_ps(a[0]), a1 = _mm256_set1_ps(a[1]); + b0 = _mm256_load_ps(b), b1 = _mm256_load_ps(b + 8), b2 = _mm256_load_ps(b + 16); + + c0 = _mm256_fmadd_ps(b0, a0, c0); + c1 = _mm256_fmadd_ps(b1, a0, c1); + c2 = _mm256_fmadd_ps(b2, a0, c2); + + c3 = _mm256_fmadd_ps(b0, a1, c3); + a0 = _mm256_set1_ps(a[2]); + c4 = _mm256_fmadd_ps(b1, a1, c4); + c5 = _mm256_fmadd_ps(b2, a1, c5); + + c6 = _mm256_fmadd_ps(b0, a0, c6); + a1 = _mm256_set1_ps(a[3]); + c7 = _mm256_fmadd_ps(b1, a0, c7); + c8 = _mm256_fmadd_ps(b2, a0, c8); + + c9 = _mm256_fmadd_ps(b0, a1, c9); + c10 = _mm256_fmadd_ps(b1, a1, c10); + c11 = _mm256_fmadd_ps(b2, a1, c11); + } + + if (ifActiv) + { + c0 = _mm256_min_ps(_mm256_max_ps(c0, vminval), vmaxval); + c1 = _mm256_min_ps(_mm256_max_ps(c1, vminval), vmaxval); + c2 = _mm256_min_ps(_mm256_max_ps(c2, vminval), vmaxval); + c3 = _mm256_min_ps(_mm256_max_ps(c3, vminval), vmaxval); + c4 = _mm256_min_ps(_mm256_max_ps(c4, vminval), vmaxval); + c5 = _mm256_min_ps(_mm256_max_ps(c5, vminval), vmaxval); + c6 = _mm256_min_ps(_mm256_max_ps(c6, vminval), vmaxval); 
+ c7 = _mm256_min_ps(_mm256_max_ps(c7, vminval), vmaxval); + c8 = _mm256_min_ps(_mm256_max_ps(c8, vminval), vmaxval); + c9 = _mm256_min_ps(_mm256_max_ps(c9, vminval), vmaxval); + c10 = _mm256_min_ps(_mm256_max_ps(c10, vminval), vmaxval); + c11 = _mm256_min_ps(_mm256_max_ps(c11, vminval), vmaxval); + } + + _mm256_storeu_ps(c, c0); _mm256_storeu_ps(c+8, c1); _mm256_storeu_ps(c+16, c2); + _mm256_storeu_ps(c + ldc, c3); _mm256_storeu_ps(c + ldc + 8, c4); _mm256_storeu_ps(c + ldc + 16, c5); + _mm256_storeu_ps(c + ldc*2, c6); _mm256_storeu_ps(c + ldc*2 + 8, c7); _mm256_storeu_ps(c + ldc*2 + 16, c8); + _mm256_storeu_ps(c + ldc*3, c9); _mm256_storeu_ps(c + ldc*3 + 8, c10); _mm256_storeu_ps(c + ldc*3 + 16, c11); + _mm256_zeroupper(); +#else +#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_AVX2." +#endif +} + +void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab, + float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left, + int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop, + int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3) +{ + const int VECSZ = 8; + __m256 vminval = _mm256_set1_ps(minval); + __m256 vmaxval = _mm256_set1_ps(maxval); + + __m256 w0 = _mm256_setzero_ps(), + w1 = w0, w2 = w0, w3 = w0, w4 = w0, w5 = w0, w6 = w0, w7 = w0, w8 = w0, vbias = w0; + + if (useSIMD) + { + vbias = _mm256_set1_ps(biasval); + if (is3x3) + { + w0 = _mm256_set1_ps(weights[0]); + w1 = _mm256_set1_ps(weights[1]); + w2 = _mm256_set1_ps(weights[2]); + w3 = _mm256_set1_ps(weights[3]); + w4 = _mm256_set1_ps(weights[4]); + w5 = _mm256_set1_ps(weights[5]); + w6 = _mm256_set1_ps(weights[6]); + w7 = _mm256_set1_ps(weights[7]); + w8 = _mm256_set1_ps(weights[8]); + } + } + + int dy0 = 1; + for (int y0 = 0; y0 < H0; y0 += dy0, outptr += W0 * dy0) + { + dy0 = inner_ytop <= y0 && y0 + 3 < inner_ybottom && is3x3 && stride_y == 1 && dilation_y == 1 + ? 3 : 1; + + int x0 = 0, x1 = y0 >= inner_ytop && y0 < inner_ybottom ? 
inner_xleft : W0; + int yi_ = y0 * stride_y - pad_top; + + for (;;) + { + float s_0, s_1, s_2; + if (dy0 == 3) + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + s_0 = s_1 = s_2 = biasval; + for (int k = 0; k < ksize; k++) + { + int dy = yxtab[k * 2]; + int yi = yi_ + dy; + int xi = xi_ + yxtab[k * 2 + 1]; + float w = weights[k]; + + if ((unsigned) xi < (unsigned) Wi) + { + s_0 += inptr[yi * Wi + xi] * w; + s_1 += inptr[(yi + 1) * Wi + xi] * w; + s_2 += inptr[(yi + 2) * Wi + xi] * w; + } + } + if (ifMinMaxAct) + { + s_0 = std::min(std::max(s_0, minval), maxval); + s_1 = std::min(std::max(s_1, minval), maxval); + s_2 = std::min(std::max(s_2, minval), maxval); + } + + outptr[x0] = s_0; + outptr[x0 + W0] = s_1; + outptr[x0 + W0 * 2] = s_2; + } + } + else + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + s_0 = biasval; + for (int k = 0; k < ksize; k++) { + int dy = yxtab[k * 2]; + int yi = yi_ + dy; + int xi = xi_ + yxtab[k * 2 + 1]; + float w = weights[k]; + if (((unsigned) yi < (unsigned) Hi) & ((unsigned) xi < (unsigned) Wi)) + s_0 += inptr[yi * Wi + xi] * w; + } + if (ifMinMaxAct) + s_0 = std::min(std::max(s_0, minval), maxval); + outptr[x0] = s_0; + } + } + if (x0 == W0) + break; + x1 = inner_xright; + + if (useSIMD) + { + if (is3x3) + { + if (dy0 == 3) + { + for (; x0 <= x1 - VECSZ; x0 += VECSZ) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + Wi * yi_ + xi_; + + __m256 s0, s1, s2; + __m256 x00 = _mm256_loadu_ps(inptr_xi); + __m256 x01 = _mm256_loadu_ps(inptr_xi + 1); + __m256 x02 = _mm256_loadu_ps(inptr_xi + 2); + + __m256 x10 = _mm256_loadu_ps(inptr_xi + Wi); + __m256 x11 = _mm256_loadu_ps(inptr_xi + Wi + 1); + __m256 x12 = _mm256_loadu_ps(inptr_xi + Wi + 2); + + __m256 x20 = _mm256_loadu_ps(inptr_xi + Wi * 2); + __m256 x21 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 1); + __m256 x22 = _mm256_loadu_ps(inptr_xi + Wi * 2 + 2); + + __m256 x30 = _mm256_loadu_ps(inptr_xi + Wi * 3); + __m256 x31 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 1); + __m256 x32 = _mm256_loadu_ps(inptr_xi + Wi * 3 + 2); + + __m256 x40 = _mm256_loadu_ps(inptr_xi + Wi * 4); + __m256 x41 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 1); + __m256 x42 = _mm256_loadu_ps(inptr_xi + Wi * 4 + 2); + + s0 = _mm256_fmadd_ps(x00, w0, vbias); + s1 = _mm256_fmadd_ps(x10, w0, vbias); + s2 = _mm256_fmadd_ps(x20, w0, vbias); + + s0 = _mm256_fmadd_ps(x01, w1, s0); + s1 = _mm256_fmadd_ps(x11, w1, s1); + s2 = _mm256_fmadd_ps(x21, w1, s2); + + s0 = _mm256_fmadd_ps(x02, w2, s0); + s1 = _mm256_fmadd_ps(x12, w2, s1); + s2 = _mm256_fmadd_ps(x22, w2, s2); + + s0 = _mm256_fmadd_ps(x10, w3, s0); + s1 = _mm256_fmadd_ps(x20, w3, s1); + s2 = _mm256_fmadd_ps(x30, w3, s2); + + s0 = _mm256_fmadd_ps(x11, w4, s0); + s1 = _mm256_fmadd_ps(x21, w4, s1); + s2 = _mm256_fmadd_ps(x31, w4, s2); + + s0 = _mm256_fmadd_ps(x12, w5, s0); + s1 = _mm256_fmadd_ps(x22, w5, s1); + s2 = _mm256_fmadd_ps(x32, w5, s2); + + s0 = _mm256_fmadd_ps(x20, w6, s0); + s1 = _mm256_fmadd_ps(x30, w6, s1); + s2 = _mm256_fmadd_ps(x40, w6, s2); + + s0 = _mm256_fmadd_ps(x21, w7, s0); + s1 = _mm256_fmadd_ps(x31, w7, s1); + s2 = _mm256_fmadd_ps(x41, w7, s2); + + s0 = _mm256_fmadd_ps(x22, w8, s0); + s1 = _mm256_fmadd_ps(x32, w8, s1); + s2 = _mm256_fmadd_ps(x42, w8, s2); + + if (ifMinMaxAct) + { + s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval); + s1 = _mm256_min_ps(_mm256_max_ps(s1, vminval), vmaxval); + s2 = _mm256_min_ps(_mm256_max_ps(s2, vminval), vmaxval); + } + + _mm256_storeu_ps(outptr + x0, s0); + _mm256_storeu_ps(outptr + W0 + 
x0, s1); + _mm256_storeu_ps(outptr + W0 * 2 + x0, s2); + } + } + else + { + for (; x0 <= x1 - VECSZ; x0 += VECSZ) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + Wi * yi_ + xi_; + __m256 s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[0]), w0, vbias); + __m256 s1 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[1]), w1); + __m256 s2 = _mm256_mul_ps(_mm256_loadu_ps(inptr_xi + ofstab[2]), w2); + + s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[3]), w3, s0); + s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[4]), w4, s1); + s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[5]), w5, s2); + + s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[6]), w6, s0); + s1 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[7]), w7, s1); + s2 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[8]), w8, s2); + + s0 = _mm256_add_ps(_mm256_add_ps(s0, s1), s2); + + if (ifMinMaxAct) + s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval); + _mm256_storeu_ps(outptr + x0, s0); + } + } + } + else + { + for (; x0 <= x1 - VECSZ; x0 += VECSZ) + { + int xi_ = x0 * stride_x - pad_left, k = 0; + const float *inptr_xi = inptr + Wi * yi_ + xi_; + __m256 s0 = vbias; + for (; k <= ksize - 4; k += 4) + { + __m256 v0 = _mm256_loadu_ps(inptr_xi + ofstab[k]); + __m256 v1 = _mm256_loadu_ps(inptr_xi + ofstab[k + 1]); + __m256 v2 = _mm256_loadu_ps(inptr_xi + ofstab[k + 2]); + __m256 v3 = _mm256_loadu_ps(inptr_xi + ofstab[k + 3]); + + __m256 ww0 = _mm256_set1_ps(weights[k]); + __m256 ww1 = _mm256_set1_ps(weights[k+1]); + __m256 ww2 = _mm256_set1_ps(weights[k+2]); + __m256 ww3 = _mm256_set1_ps(weights[k+3]); + + s0 = _mm256_fmadd_ps(v0, ww0, s0); + s0 = _mm256_fmadd_ps(v1, ww1, s0); + s0 = _mm256_fmadd_ps(v2, ww2, s0); + s0 = _mm256_fmadd_ps(v3, ww3, s0); + } + for (; k < ksize; k++) + s0 = _mm256_fmadd_ps(_mm256_loadu_ps(inptr_xi + ofstab[k]), + _mm256_set1_ps(weights[k]), s0); + + if (ifMinMaxAct) + s0 = _mm256_min_ps(_mm256_max_ps(s0, vminval), vmaxval); + _mm256_storeu_ps(outptr + x0, s0); + } + } + } + + if (dy0 == 3) + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + W0 * yi_ + xi_; + s_0 = s_1 = s_2 = biasval; + for (int k = 0; k < ksize; k++) { + int inp_ofs = ofstab[k]; + float w = weights[k]; + s_0 += inptr_xi[inp_ofs] * w; + s_1 += inptr_xi[inp_ofs + Wi] * w; + s_2 += inptr_xi[inp_ofs + Wi * 2] * w; + } + if (ifMinMaxAct) + { + s_0 = std::min(std::max(s_0, minval), maxval); + s_1 = std::min(std::max(s_1, minval), maxval); + s_2 = std::min(std::max(s_2, minval), maxval); + } + + outptr[x0] = s_0; + outptr[x0 + W0] = s_1; + outptr[x0 + W0 * 2] = s_2; + } + } + else + { + for (; x0 < x1; x0++) + { + int xi_ = x0 * stride_x - pad_left; + const float *inptr_xi = inptr + Wi * yi_ + xi_; + s_0 = biasval; + for (int k = 0; k < ksize; k++) + { + s_0 += inptr_xi[ofstab[k]] * weights[k]; + } + if (ifMinMaxAct) + s_0 = std::min(std::max(s_0, minval), maxval); + outptr[x0] = s_0; + } + } + x1 = W0; + } + } +} +#endif +} // namespace opt_AVX2 +} // namespace cv \ No newline at end of file diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp new file mode 100644 index 0000000..139ea7f --- /dev/null +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.cpp @@ -0,0 +1,694 @@ +// This file is part of OpenCV project. 
+// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +// This file is modified from the ficus (https://github.com/vpisarev/ficus/blob/master/lib/NN/OpConv.fx). +// Here is the original license: +/* + This file is a part of ficus language project. + See ficus/LICENSE for the licensing terms +*/ + +#include "../../precomp.hpp" +#include "fast_convolution.hpp" +#include "fast_convolution.simd.hpp" + +namespace cv { namespace dnn { + +Ptr initFastConv2d( + int ngroups, + int K, int C, int Hk, int Wk, + int stride_x, int stride_y, + int dilation_x, int dilation_y, + const std::vector& pads_begin, + const std::vector& pads_end, + float* srcWeights, + float* srcBias) +{ + Ptr conv = makePtr(); + + CV_Assert(ngroups > 0 && K > 0 && C > 0 && K % ngroups == 0); + CV_Assert(Hk > 0 && Wk > 0); + CV_Assert(stride_y > 0 && stride_x > 0); + CV_Assert(dilation_y > 0 && dilation_x > 0); + + conv->K = K; conv->C = C; conv->Hk = Hk; conv->Wk = Wk; // [K, iC, kH, kW] + conv->stride_y = stride_y; + conv->stride_x = stride_x; + conv->dilation_y = dilation_y; + conv->dilation_x = dilation_x; + + conv->ngroups = ngroups; + conv->pad_top = pads_begin[0]; + conv->pad_bottom = pads_end[0]; + conv->pad_left = pads_begin[1]; + conv->pad_right = pads_end[1]; + + // store bias; append some zero's to make sure that + // we can always read FAST_CONV_MR elements starting from any valid index + { + int k = 0, nbias = K + FAST_CONV_MR-1; + conv->biasBuf.reserve(nbias); + float* biasBufPtr = conv->biasBuf.data(); + for(; k < K; k++) + biasBufPtr[k] = srcBias ? srcBias[k] : 0.f; + for(; k < nbias; k++) + biasBufPtr[k] = 0.f; + } + +#if CV_NEON // For now, winograd is ARM platform only. + if (ngroups == 1 && Hk ==3 && Wk == 3 && stride_x == 1 && stride_y == 1 && dilation_x == 1 && dilation_y ==1 + && K >= 16 && C >= 16 ) + conv->ifWinograd63 = true; +#else + conv->ifWinograd63 = false; +#endif + + if (ngroups > 1 && ngroups == K && ngroups == C) + { + // for depth-wise convolutions on NCHW data we just preserve the weights in KCHW layout, + // but add some padding to make the weights array layout more SIMD-friendly + int ksize = Hk*Wk; + int padded_ksize = ((ksize + FAST_VEC_NLANES-1)/FAST_VEC_NLANES)*FAST_VEC_NLANES; // this code aims to let memory fit with vector size. + int nweights = C*padded_ksize; + conv->weightsBuf.reserve(nweights); + float* weightsBufPtr = conv->weightsBuf.data(); + memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0])); + for(int c = 0; c < C; c++) + { + for (int k = 0; k < ksize; k++) + weightsBufPtr[c*padded_ksize + k] = srcWeights[c*ksize + k]; + } + } + else + { + // The weights are packed as + // ngroups x (ceil((K/ngroups)/FAST_CONV_MR)*FAST_CONV_MR) x (Cg*Hk*Wk) x FAST_CONV_MR tensor + int Kg = K/ngroups, Cg = max(C/ngroups, 1); + int Kg_aligned = ((Kg + FAST_CONV_MR - 1)/FAST_CONV_MR)*FAST_CONV_MR; + size_t nweights = ngroups*Kg_aligned*Cg*Hk*Wk; + conv->weightsBuf.reserve(nweights); + float* weightsBufPtr = conv->weightsBuf.data(); + memset(weightsBufPtr, 0, nweights*sizeof(weightsBufPtr[0])); + float* packed_wptr = weightsBufPtr; + + // pack the weight. + for(int g = 0; g < ngroups; g++) + { + for(int k0 = 0; k0 < Kg_aligned; k0 += FAST_CONV_MR) + { + int dk = Kg - k0 < FAST_CONV_MR ? 
Kg - k0 : FAST_CONV_MR; + for(int c = 0; c < Cg; c++) + { + for(int yx = 0; yx < Hk*Wk; yx++, packed_wptr += FAST_CONV_MR) + { + const float* wptr = srcWeights + ((g*Kg + k0)*Cg + c)*Hk*Wk + yx; + int k = 0; + for(; k < dk; k++, wptr += Cg*Hk*Wk) + packed_wptr[k] = *wptr; + for(; k < FAST_CONV_MR; k++) + packed_wptr[k] = 0.f; + } + } + } + } + + // Prepare Weight for Winograd F(6x6, 3x3) + if (conv->ifWinograd63) + { + initWinograd63(conv, srcWeights, K, C); + } + } + return conv; +} + +static void packInput(float* inpbuf, const float* inptr, int* yxtab, int ksize, int Cg, int Hi, int Wi, int W0, + int pad_top, int pad_left, int stride_x, int stride_y, int yx0, int slice_len, + bool fast_1x1, bool partial0, bool s1d1p0, bool s1d1) +{ + const size_t inp_planesize = (size_t)Hi*Wi; + + if (fast_1x1) + { + /* + super-fast branch for 1x1 convolutions with sy=sx=1. + in this case each feature plane can be safely treated + as 1D array and we just extract next portion + of FAST_CONV_NR elements from each feature plane and + put it together. + */ + inptr += yx0; + if (!partial0) + { + // Make special branch where memcpy() is called with a constant buffer size. + // Compilers will likely unroll this loop properly. + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += FAST_CONV_NR) + memcpy(inpbuf, inptr, FAST_CONV_NR * sizeof(inpbuf[0])); + } + else + { + for (int c = 0; c < Cg; c++, inptr += inp_planesize, inpbuf += FAST_CONV_NR) + { + memcpy(inpbuf, inptr, slice_len * sizeof(inpbuf[0])); + memset(inpbuf + slice_len, 0, (FAST_CONV_NR - slice_len) * sizeof(inpbuf[0])); + } + } + } + else if (s1d1p0) + { + /* + slower, but still fast branch for sy=sx=1, dy=dx=1 and without padding, + in this case we copy data from input tensors by chunks. + */ + for (int c = 0; c < Cg; c++) + { + float *inpbuf_c = inpbuf + c * (FAST_CONV_NR * ksize); + const float *inptr_c = inptr + c * inp_planesize; + + for (int k = 0; k < ksize; k++) + { + int y0 = yx0 / W0, x0 = yx0 % W0; + int yi = y0 + yxtab[k * 2], xi = x0 + yxtab[k * 2 + 1]; + float *inpbuf_k = inpbuf_c + k * FAST_CONV_NR; + int xi_0 = yxtab[k * 2 + 1]; + + int i = 0; + for (; i < slice_len;) + { + const float *inptr_k = inptr_c + yi * Wi + xi; + int copy_len = std::min(slice_len - i, W0 - x0); + int di_z = (slice_len == i + copy_len) ? FAST_CONV_NR - slice_len : 0; + + memcpy(inpbuf_k + i, + inptr_k, + copy_len * sizeof(inpbuf_k[0])); + + memset(inpbuf_k + i + copy_len, + 0, di_z * sizeof(inpbuf_k[0])); + + i += copy_len; + x0 = 0; + xi = xi_0; + yi++; + } + } + } + } + else if (s1d1) + { + /* + slower, but still fast branch for sy=sx=1, dy=dx=1. + in this case we copy data from input tensors by chunks and + interleave the data in inpbuf with 0's + (that correspond to the padding elements) when necessary + */ + int y0 = yx0 / W0, x0 = yx0 % W0; + for (int c = 0; c < Cg; c++) + { + float *inpbuf_c = inpbuf + c * (FAST_CONV_NR * ksize); + const float *inptr_c = inptr + c * inp_planesize; + + for (int k = 0; k < ksize; k++) + { + int x0_tmp = x0; + + int xi_0 = yxtab[k * 2 + 1] - pad_left; + + int yi = y0 + yxtab[k * 2] - pad_top, xi = x0_tmp + xi_0; + float *inpbuf_k = inpbuf_c + k * FAST_CONV_NR; + + int i = 0; + for (; i < slice_len;) { + int copyLen = std::min(slice_len - i, W0 - x0_tmp); + + int di_z = (i + copyLen == slice_len) ? FAST_CONV_NR - slice_len + : 0; // The final padding. 
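+                    // s1d1 branch with padding: each kernel tap k fills one FAST_CONV_NR-wide
+                    // row of inpbuf by copying the input in contiguous chunks of at most
+                    // (W0 - x0) pixels and interleaving zeros for the padded border: rows with
+                    // yi outside [0, Hi) are zero-filled entirely, out-of-range columns to the
+                    // left/right of the memcpy are zero-filled around it, and di_z zero-pads the
+                    // unused tail of the buffer once the last output position of the slice is done.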
+ // pad_top or pad bottom + if (yi < 0 || yi > Hi - 1) + { + memset(inpbuf_k + i, + 0, (copyLen + di_z) * sizeof(inpbuf_k[0])); + i += copyLen + di_z; + } + else + { + int x_pad_left = 0, x_pad_right = 0; + + // pad_left + if (xi < 0) + { + x_pad_left = std::min(-xi, copyLen); + xi = 0; + copyLen -= x_pad_left; + } + + memset(inpbuf_k + i, + 0, x_pad_left * sizeof(inpbuf_k[0])); + i += x_pad_left; + + // pad right + if (xi + copyLen > Wi) + { + if (xi > Wi) + { + x_pad_right = copyLen; + copyLen = 0; + } + else + { + x_pad_right = std::min(xi + copyLen - Wi, copyLen); + copyLen -= x_pad_right; + } + } + + CV_Assert(copyLen >= 0); + + const float *inptr_k = inptr_c + yi * Wi + xi; + memcpy(inpbuf_k + i, + inptr_k, + copyLen * sizeof(inpbuf_k[0])); + + i += copyLen; + + // pad_right and the final padding. + memset(inpbuf_k + i, + 0, (di_z + x_pad_right) * sizeof(inpbuf_k[0])); + i += x_pad_right + di_z; + } + + x0_tmp = 0; + xi = xi_0; + yi++; + } + } + } + } + else + { + int y0_ = yx0 / W0, x0_ = yx0 - y0_ * W0; + for (int k = 0; k < ksize; k++) + { + int dy = yxtab[k * 2], dx = yxtab[k * 2 + 1]; + int i = 0, y0 = y0_, x0 = x0_; + for (; i < FAST_CONV_NR;) + { + float *inpbuf_ki = inpbuf + k * FAST_CONV_NR + i; + int yi = y0 * stride_y + dy - pad_top; + int xi = x0 * stride_x + dx - pad_left; + + if ((unsigned) yi < (unsigned) Hi && + (unsigned) xi < (unsigned) Wi) + { + const float *inptr_ki = inptr + yi * Wi + xi; + if (i + 4 <= FAST_CONV_NR && x0 + 4 <= W0 && xi + stride_x * 4 <= Wi) + { + if (stride_x == 2) { + for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * + ksize, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[2]; + float t2 = inptr_ki[4], t3 = inptr_ki[6]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + } + } + else + { + for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * + ksize, inptr_ki += inp_planesize) + { + float t0 = inptr_ki[0], t1 = inptr_ki[stride_x]; + float t2 = inptr_ki[stride_x * 2], t3 = inptr_ki[stride_x * 3]; + inpbuf_ki[0] = t0; + inpbuf_ki[1] = t1; + inpbuf_ki[2] = t2; + inpbuf_ki[3] = t3; + } + } + i += 4; + x0 += 4; + } + else + { + for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * + ksize, inptr_ki += inp_planesize) + *inpbuf_ki = *inptr_ki; + i++; + x0++; + } + } + else + { + for (int c = 0; c < Cg; c++, inpbuf_ki += FAST_CONV_NR * ksize) + inpbuf_ki[0] = 0.f; + i++; + x0++; + } + int mask = x0 >= W0; + y0 += mask; + x0 &= mask - 1; + } + } + } +} + +static void matMulCompute(float* outptr0, float* inpbuf_task, float* cbuf, const Ptr& conv, int HkWkCg, + int k0, int k1, int yx0, int yx1, size_t out_planesize, int g, int Kg, int Kg_aligned, + bool partial0, ActivationLayer*& activ, float minval, float maxval, bool ifMinMaxAct) +{ + int outstep0 = out_planesize; + + for (int k = k0; k < k1; k += FAST_CONV_MR, outptr0 += outstep0 * FAST_CONV_MR) + { + int dk = Kg - k < FAST_CONV_MR ? 
Kg - k : FAST_CONV_MR; + bool partial = partial0 || dk < FAST_CONV_MR; + float *outptr = outptr0; + + int outstep = outstep0; + if (partial) + { + outptr = cbuf; + outstep = FAST_CONV_NR; + } + + +#if CV_TRY_AVX2 + if (conv->useAVX2) + opt_AVX2::convBlock_AVX2( HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg, + inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k, + minval, maxval, ifMinMaxAct); + else +#endif +#if CV_TRY_NEON + if (conv->useNEON) + opt_NEON::convBlock_NEON(HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg, + inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k, + minval, maxval, ifMinMaxAct); + else +#endif + convBlock(HkWkCg, conv->weightsBuf.data() + (g * Kg_aligned + k) * HkWkCg, + inpbuf_task, outptr, outstep, conv->biasBuf.data() + Kg * g + k, + minval, maxval, ifMinMaxAct); + + // activation + if (activ) + activ->forwardSlice(outptr, outptr, yx1 - yx0, outstep, Kg * g + k, + Kg * g + k + dk); + + if (partial) + { + for (int i = 0; i < dk; i++) + memcpy(outptr0 + i * outstep0, cbuf + i * FAST_CONV_NR, + (yx1 - yx0) * sizeof(cbuf[0])); + } + } +} + +void runFastConv2d(InputArray _input, OutputArray _output, + const Ptr& conv, int ntasks, const Ptr& actLayer) +{ + Mat input = _input.getMat(); + Mat output = _output.getMat(); + MatShape inputShape = shape(input); + MatShape outputShape = shape(output); + CV_Assert(inputShape.size() == 4 && outputShape.size() == 4); + + ActivationLayer* activ = 0; + float minval = -FLT_MAX, maxval = FLT_MAX; + bool ifMinMaxAct = false; + if (actLayer) + { + Ptr activ_relu = actLayer.dynamicCast(); + Ptr activ_relu6 = actLayer.dynamicCast(); + + if (!activ_relu.empty()) + { + if (activ_relu->negativeSlope == 0.0f) + { + minval = 0.0f; + ifMinMaxAct = true; + activ = nullptr; + } + else // Leaky ReLU + { + activ = actLayer.get(); + } + } + else if (!activ_relu6.empty()) + { + minval = activ_relu6->minValue; + maxval = activ_relu6->maxValue; + + ifMinMaxAct = true; + activ = nullptr; + } + else + activ = actLayer.get(); + } + else + activ = nullptr; + + if (conv->ngroups > 1 && conv->ngroups == conv->K && conv->ngroups == conv->C) + { + return runDepthwise(input, output, conv, minval, maxval, activ, ifMinMaxAct); + } + +#if CV_NEON + if ( conv->ifWinograd63 + && inputShape[2] > 12 && inputShape[3] > 12 + && inputShape[2] < 120 && inputShape[3] < 120 ) + { + // In general, for winograd branch, more cores will give better performance. 
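+        // Winograd F(6x6, 3x3) is attempted only for mid-sized feature maps (both spatial
+        // dims in 13..119) and only when initFastConv2d enabled it (ARM/NEON, ngroups == 1,
+        // 3x3 kernel, stride 1, dilation 1, K >= 16 and C >= 16); presumably the tile
+        // transforms and extra workspace are not worth it outside that range. If
+        // runWinograd63() declines and returns 0, we fall through to the generic
+        // packInput + convBlock path below.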
+ int maxNumThread = std::max(getNumThreads(), 1); + if (runWinograd63(input, output, conv, maxNumThread, minval, maxval, activ, ifMinMaxAct)) + return; + } +#endif + + float* inp = input.ptr(); + float* out = output.ptr(); + + int N = inputShape[0], C = inputShape[1], Hi = inputShape[2], Wi = inputShape[3]; // [N, C, H, W] + int K = conv->K, Hk = conv->Hk, Wk = conv->Wk; + int H0 = outputShape[2], W0 = outputShape[3], ngroups = conv->ngroups; // ngroups + int Cg = C/ngroups, Kg = K/ngroups; + int Kg_nblocks = (Kg + FAST_CONV_MR-1)/FAST_CONV_MR, Kg_aligned = Kg_nblocks*FAST_CONV_MR; // align to MR + + const size_t inp_planesize = (size_t)Hi*Wi; + const size_t out_planesize = (size_t)H0*W0; + + int pad_top = conv->pad_top, pad_bottom = conv->pad_bottom; + int pad_left = conv->pad_left; + int pad_right = conv->pad_right; + + int stride_y = conv->stride_y, stride_x = conv->stride_x; + int dilation_y = conv->dilation_y, dilation_x = conv->dilation_x; + + int ksize = Hk * Wk; + bool s1d1 = stride_x == 1 && stride_y == 1 && dilation_x == 1 && dilation_y == 1; + bool s1d1p0 = s1d1 && pad_top == 0 && pad_left ==0 && pad_bottom == 0 && pad_right == 0; + bool fast_1x1 = stride_x == 1 && stride_y == 1 && ksize == 1; + int HkWkCg = Hk*Wk*Cg; + + enum { VEC_ALIGN = 8, DFT_TYPE = CV_32F }; + size_t taskbufsize = FAST_CONV_NR*HkWkCg; // input buffer + size_t taskbufsizeOutput = FAST_CONV_NR * FAST_CONV_MR; + size_t inputbufsize = 0; + size_t outbufsize = ntasks * taskbufsizeOutput; + + int stripes_per_sample = (out_planesize + FAST_CONV_NR - 1)/FAST_CONV_NR; // align to NR + size_t hw_task = stripes_per_sample; + size_t hw_aligned = stripes_per_sample * FAST_CONV_NR; + + bool separatedLoop = false; + + if (stripes_per_sample < 4 * ntasks) + { + // If stripes_per_sample is small, we parallelize on K (output channel). + stripes_per_sample = 1; + + // Separated Parallelloop could save much time in packing input data. But it may cost more memory, we use it when batch size is 1. + if (N == 1) + { + separatedLoop = true; + inputbufsize = ngroups * hw_aligned * HkWkCg; + } + + if (!separatedLoop) + { + inputbufsize = taskbufsize * ntasks; + } + } + else + { + // If stripes_per_sample is big, we parallelize on H0*W0. + Kg_nblocks = 1; + inputbufsize = taskbufsize * ntasks; + } + + int Kstripes = Kg_nblocks*stripes_per_sample; + int nsubtasks = N*ngroups*Kstripes; + + AutoBuffer inpbuf_all_, outputbuf_; + inputbufsize = alignSize(inputbufsize, VEC_ALIGN); + inpbuf_all_.allocate(inputbufsize + VEC_ALIGN); + float* inpbuf_all = alignPtr(inpbuf_all_.data(), (int)(VEC_ALIGN*sizeof(float))); + + outbufsize = alignSize(outbufsize, VEC_ALIGN); + outputbuf_.allocate(outbufsize + VEC_ALIGN); + float* output_buf = alignPtr(outputbuf_.data(), (int)(VEC_ALIGN*sizeof(float))); + + std::vector ofstab_(Hk*Wk*3, 0); + int* ofstab = ofstab_.data(); + int* yxtab = ofstab + Hk*Wk; + + for (int y = 0; y < Hk; y++) + for( int x = 0; x < Wk; x++) + { + int k = y*Wk + x; + int dy = y*dilation_y, dx = x*dilation_x; + yxtab[k*2] = dy; + yxtab[k*2+1] = dx; + ofstab[k] = dy*Wi + dx; + } + + if (ksize == 1) + { + CV_Assert(pad_left == 0 && pad_right == 0 && pad_top == 0 && pad_bottom == 0); + CV_Assert(stride_x != 1 || stride_y != 1 || (H0 == Hi && W0 == Wi)); + } + + if (separatedLoop) + { + // For now this branch only handles batch size = 1. Maybe we could support batch size < 10 in the future. 
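+        // Two-phase scheme for N == 1: first pack the whole input into inpbuf_all in
+        // parallel over (group, spatial stripe), then run the convBlock micro-kernels in a
+        // second parallel loop over output-channel stripes. Each packed FAST_CONV_NR-wide
+        // stripe is thus packed once and reused by every output channel of its group,
+        // instead of being re-packed inside each task as in the fused loop of the else branch.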
+ // Pack Input data + parallel_for_(Range(0, ngroups * hw_task), [&](const Range& r0) + { + for (int nhwi = r0.start; nhwi < r0.end; nhwi++) + { + int g = nhwi/hw_task; + int hw_i = nhwi % hw_task; + int hw0 = hw_i * FAST_CONV_NR; + float* inpbuf = inpbuf_all + g * hw_aligned * HkWkCg + hw0 * HkWkCg; + const float* inptr = inp + g * Cg * inp_planesize; + bool partial0 = hw0 + FAST_CONV_NR > out_planesize? true: false; + int slice_len = FAST_CONV_NR; + + if (partial0) + slice_len = out_planesize - hw0; + + packInput(inpbuf, inptr, yxtab, ksize, Cg, Hi, Wi, W0, pad_top, pad_left, stride_x, stride_y, + hw0, slice_len, fast_1x1, partial0, s1d1p0, s1d1); + } + }); + + // Compute + parallel_for_(Range(0, ntasks), [&](const Range& r0) + { + for (int task_id = r0.start; task_id < r0.end; task_id++) + { + float *cbuf = output_buf + task_id * taskbufsizeOutput; + int ngs0 = (int) ((size_t) nsubtasks * task_id / ntasks); + int ngs1 = (int) ((size_t) nsubtasks * (task_id + 1) / ntasks); + for (int subtask = ngs0; subtask < ngs1;) + { + int ng = subtask / Kstripes; + int kyx0 = subtask - ng * Kstripes; + int kyx1 = kyx0 + (ngs1 - subtask); + int n = ng / ngroups, g = ng - n * ngroups; + + CV_Assert(n <= 1); + + kyx1 = kyx1 <= Kstripes ? kyx1 : Kstripes; // Guarantee that maximum kyx1 is Kstripes. + subtask += kyx1 - kyx0; + + int k0 = kyx0 * FAST_CONV_MR; + int k1 = kyx1 * FAST_CONV_MR; + k1 = k1 <= Kg ? k1 : Kg; + + + for (int yx0 = 0; yx0 < out_planesize; yx0 += FAST_CONV_NR) + { + float* inpbuf_task = inpbuf_all + g * hw_aligned * HkWkCg + yx0 * HkWkCg; + int yx1 = yx0 + FAST_CONV_NR; + yx1 = yx1 <= out_planesize ? yx1 : out_planesize; + int slice_len = yx1 - yx0; + bool partial0 = slice_len < FAST_CONV_NR; + + int outstep0 = out_planesize; + size_t outofs = ((n * ngroups + g) * Kg + k0) * outstep0 + yx0; + float *outptr0 = out + outofs; + + matMulCompute(outptr0, inpbuf_task, cbuf, conv, HkWkCg, k0, k1, yx0, yx1, out_planesize, g, + Kg, Kg_aligned, partial0, activ, minval, maxval, ifMinMaxAct); + } + } + } + }); + } + else + { + parallel_for_(Range(0, ntasks), [&](const Range &r0) { + for (int task_id = r0.start; task_id < r0.end; task_id++) { + float *inpbuf_task = &inpbuf_all[taskbufsize * task_id]; + float *cbuf = output_buf + task_id * taskbufsizeOutput; + int ngs0 = (int) ((size_t) nsubtasks * task_id / ntasks); + int ngs1 = (int) ((size_t) nsubtasks * (task_id + 1) / ntasks); + + for (int subtask = ngs0; subtask < ngs1;) + { + int ng = subtask / Kstripes; + int kyx0 = subtask - ng * Kstripes; + int kyx1 = kyx0 + (ngs1 - subtask); + int n = ng / ngroups, g = ng - n * ngroups; + size_t inp_plane_ofs = (size_t) (n * ngroups + g) * Cg * inp_planesize; + kyx1 = kyx1 <= Kstripes ? kyx1 : Kstripes; // Guarantee that maximum kyx1 is Kstripes. + subtask += kyx1 - kyx0; + int k0, k1; + int yx0, yx_limit; + + if (stripes_per_sample == 1) + { + k0 = kyx0 * FAST_CONV_MR; + k1 = kyx1 * FAST_CONV_MR; + k1 = k1 <= Kg ? k1 : Kg; + yx0 = 0; + yx_limit = out_planesize; + } + else + { + k0 = 0; + k1 = Kg; + yx0 = kyx0 * FAST_CONV_NR; + yx_limit = kyx1 * FAST_CONV_NR; + yx_limit = yx_limit < out_planesize ? yx_limit : out_planesize; + } + + for (; yx0 < yx_limit; yx0 += FAST_CONV_NR) + { + float *inpbuf = inpbuf_task; + const float *inptr = inp + inp_plane_ofs; + int yx1 = yx0 + FAST_CONV_NR; + yx1 = yx1 <= yx_limit ? 
yx1 : yx_limit; + int slice_len = yx1 - yx0; + bool partial0 = slice_len < FAST_CONV_NR; + packInput(inpbuf, inptr, yxtab, ksize, Cg, Hi, Wi, W0, pad_top, pad_left, stride_x, stride_y, + yx0, slice_len, fast_1x1, partial0, s1d1p0, s1d1); + + // 2. do convolution, compute Kg x (yx1 - yx0) part of the output tensor + int outstep0 = out_planesize; + size_t outofs = ((n * ngroups + g) * Kg + k0) * outstep0 + yx0; + float *outptr0 = out + outofs; + + matMulCompute(outptr0, inpbuf_task, cbuf, conv, HkWkCg, k0, k1, yx0, yx1, out_planesize, g, + Kg, Kg_aligned, partial0, activ, minval, maxval, ifMinMaxAct); + } + } + } + }); + } +} + +}} // namespace cv::dnn \ No newline at end of file diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp new file mode 100644 index 0000000..30c5ea2 --- /dev/null +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.hpp @@ -0,0 +1,89 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_FAST_CONVOLUTION_HPP +#define OPENCV_FAST_CONVOLUTION_HPP + +#include "opencv2/core/hal/intrin.hpp" + +#ifndef FAST_CONV_PRAM +#define FAST_CONV_PRAM +#if CV_NEON && __aarch64__ // 32 registers. +#define FAST_CONV_MR 4 +#define FAST_CONV_NR 28 +enum { FAST_VEC_NLANES=4 }; +#elif CV_NEON // 16 registers. +#define FAST_CONV_MR 4 +#define FAST_CONV_NR 12 +enum { FAST_VEC_NLANES=4 }; +#else // SIMD 128, AVX or AVX2 +#define FAST_CONV_MR 4 +#define FAST_CONV_NR 24 +enum { FAST_VEC_NLANES=4 }; +#endif +#endif + +namespace cv { +namespace dnn { + +struct FastConv2d +{ + int ngroups; + int K, C, Hk, Wk; + int stride_y, stride_x; + int dilation_y, dilation_x; + int pad_top, pad_bottom, pad_left, pad_right; + + std::vector weightsBuf; // For generic Conv 2D + std::vector weightsWino63Buf; // For Winograd F(6x6, 3x3). + + std::vector biasBuf; + bool ifWinograd63 = false; + bool useAVX2 = checkHardwareSupport(CPU_AVX2); + bool useNEON = checkHardwareSupport(CPU_NEON); +}; + +// return a FastConv2d instance. +Ptr initFastConv2d( + int ngroups, + int K, int C, int Hk, int Wk, + int stride_x, int stride_y, + int dilation_x, int dilation_y, + const std::vector& pads_begin, + const std::vector& pads_end, + float* srcWeights, + float* srcBias); + +// It contains different computing branches, like winograd, 1x1 conv. 
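+// A minimal usage sketch (mirroring ConvolutionLayerImpl::forward in this patch; the
+// weight/bias pointers refer to the original [K, C/ngroups, Hk, Wk] weights and the
+// per-output-channel biases, packing is done inside initFastConv2d):
+//
+//     Ptr<FastConv2d> conv = initFastConv2d(ngroups, K, C, Hk, Wk, stride_x, stride_y,
+//                                           dilation_x, dilation_y, pads_begin, pads_end,
+//                                           weightsPtr, biasPtr);
+//     runFastConv2d(input, output, conv, ntasks, actLayer); // input/output: 4D NCHW Mats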
+void runFastConv2d(InputArray _input, OutputArray _output, + const Ptr& conv, int ntasks, const Ptr& actLayer); + +void runDepthwise(InputArray _input, OutputArray _output, const Ptr& conv, float minval, float maxval, + ActivationLayer* activ, bool ifMinMaxAct); + +// winograd init +void initWinograd63(Ptr& conv, float* src_weight, int K, int C); + +int runWinograd63(InputArray _input, OutputArray _output, const Ptr& conv, int ntasks, + float minval, float maxval, ActivationLayer* activ, bool ifMinMaxAct); + +} // namespace dnn + +namespace opt_AVX2 +{ +#if CV_TRY_AVX2 +void convBlock_AVX2(int k, const float *a, const float *b, + float *c, int ldc, const float *bias, + float minval, float maxval, bool ifActiv); + +void depthWiseBlock_AVX2(const float *inptr, float *outptr, const float *weights, float biasval, int *ofstab, int *yxtab, + float minval, float maxval, int Hi, int Wi, int H0, int W0, int ksize, int pad_top, int pad_left, + int dilation_y, int stride_x, int stride_y, int inner_xleft, int inner_xright, int inner_ytop, + int inner_ybottom, bool ifMinMaxAct, bool useSIMD, bool is3x3); +#endif +} // namespace opt_AVX2 + +} // namespace cv + +#endif //OPENCV_FAST_CONVOLUTION_HPP diff --git a/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp new file mode 100644 index 0000000..f154290 --- /dev/null +++ b/modules/dnn/src/layers/fast_convolution/fast_convolution.simd.hpp @@ -0,0 +1,342 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +#ifndef OPENCV_FAST_CONVOLUTION_SIMD_HPP +#define OPENCV_FAST_CONVOLUTION_SIMD_HPP + +#include "opencv2/core/hal/intrin.hpp" +#include + +namespace cv { +namespace dnn { + +void convBlock(int k, const float *a, const float *b, + float *c, int ldc, const float *bias, + float minval, float maxval, bool ifActiv) +{ +#if CV_SIMD128 +#if FAST_CONV_MR == 4 && FAST_CONV_NR == 24 + { + v_float32x4 c0 = v_setall_f32(bias[0]), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0; + v_float32x4 c6 = v_setall_f32(bias[1]), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6; + v_float32x4 c12 = v_setall_f32(bias[2]), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12; + v_float32x4 c18 = v_setall_f32(bias[3]), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18; + + for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR) + { + v_float32x4 a0 = v_setall_f32(a[0]); + v_float32x4 b0 = v_load(b), b1 = v_load(b + 4), b2 = v_load(b + 8); + v_float32x4 b3 = v_load(b + 12), b4 = v_load(b + 16), b5 = v_load(b + 20); + + c0 = v_fma(b0, a0, c0); + c1 = v_fma(b1, a0, c1); + c2 = v_fma(b2, a0, c2); + c3 = v_fma(b3, a0, c3); + c4 = v_fma(b4, a0, c4); + c5 = v_fma(b5, a0, c5); + + a0 = v_setall_f32(a[1]); + c6 = v_fma(b0, a0, c6); + c7 = v_fma(b1, a0, c7); + c8 = v_fma(b2, a0, c8); + c9 = v_fma(b3, a0, c9); + c10 = v_fma(b4, a0, c10); + c11 = v_fma(b5, a0, c11); + + a0 = v_setall_f32(a[2]); + c12 = v_fma(b0, a0, c12); + c13 = v_fma(b1, a0, c13); + c14 = v_fma(b2, a0, c14); + c15 = v_fma(b3, a0, c15); + c16 = v_fma(b4, a0, c16); + c17 = v_fma(b5, a0, c17); + + a0 = v_setall_f32(a[3]); + c18 = v_fma(b0, a0, c18); + c19 = v_fma(b1, a0, c19); + c20 = v_fma(b2, a0, c20); + c21 = v_fma(b3, a0, c21); + c22 = v_fma(b4, a0, c22); + c23 = v_fma(b5, a0, c23); + } + + if (ifActiv) { + v_float32x4 vmin = v_setall_f32(minval), vmax = v_setall_f32(maxval); + c0 = 
v_min(v_max(c0, vmin), vmax); + c1 = v_min(v_max(c1, vmin), vmax); + c2 = v_min(v_max(c2, vmin), vmax); + c3 = v_min(v_max(c3, vmin), vmax); + c4 = v_min(v_max(c4, vmin), vmax); + c5 = v_min(v_max(c5, vmin), vmax); + c6 = v_min(v_max(c6, vmin), vmax); + c7 = v_min(v_max(c7, vmin), vmax); + c8 = v_min(v_max(c8, vmin), vmax); + c9 = v_min(v_max(c9, vmin), vmax); + c10 = v_min(v_max(c10, vmin), vmax); + c11 = v_min(v_max(c11, vmin), vmax); + c12 = v_min(v_max(c12, vmin), vmax); + c13 = v_min(v_max(c13, vmin), vmax); + c14 = v_min(v_max(c14, vmin), vmax); + c15 = v_min(v_max(c15, vmin), vmax); + c16 = v_min(v_max(c16, vmin), vmax); + c17 = v_min(v_max(c17, vmin), vmax); + c18 = v_min(v_max(c18, vmin), vmax); + c19 = v_min(v_max(c19, vmin), vmax); + c20 = v_min(v_max(c20, vmin), vmax); + c21 = v_min(v_max(c21, vmin), vmax); + c22 = v_min(v_max(c22, vmin), vmax); + c23 = v_min(v_max(c23, vmin), vmax); + } + v_store(c, c0); + v_store(c + 4, c1); + v_store(c + 8, c2); + v_store(c + 12, c3); + v_store(c + 16, c4); + v_store(c + 20, c5); + + v_store(c + ldc, c6); + v_store(c + ldc + 4, c7); + v_store(c + ldc + 8, c8); + v_store(c + ldc + 12, c9); + v_store(c + ldc + 16, c10); + v_store(c + ldc + 20, c11); + + v_store(c + ldc * 2, c12); + v_store(c + ldc * 2 + 4, c13); + v_store(c + ldc * 2 + 8, c14); + v_store(c + ldc * 2 + 12, c15); + v_store(c + ldc * 2 + 16, c16); + v_store(c + ldc * 2 + 20, c17); + + v_store(c + ldc * 3, c18); + v_store(c + ldc * 3 + 4, c19); + v_store(c + ldc * 3 + 8, c20); + v_store(c + ldc * 3 + 12, c21); + v_store(c + ldc * 3 + 16, c22); + v_store(c + ldc * 3 + 20, c23); + } +#endif +#else + for (int i = 0; i < FAST_CONV_MR; i++) + { + float beta = bias[i]; + for (int j = 0; j < FAST_CONV_NR; j++) + c[i*ldc + j] = beta; + } + for (int p = 0; p < k; p++) + { + for (int i = 0; i < FAST_CONV_MR; i++) + { + float alpha = a[FAST_CONV_MR*p + i]; + for (int j = 0; j < FAST_CONV_NR; j++) + { + c[i*ldc+j] += b[FAST_CONV_NR*p + j]*alpha; + } + } + } + if (ifActiv) + { + for (int i = 0; i < FAST_CONV_MR; i++) + { + for (int j = 0; j < FAST_CONV_NR; j++) + { + float v = c[i*ldc + j]; + v = std::min(std::max(v, minval), maxval); + c[i*ldc + j] = v; + } + } + } +#endif +} +} // namespace dnn + +namespace opt_NEON +{ +#if CV_TRY_NEON +void convBlock_NEON(int k, const float *a, const float *b, + float *c, int ldc, const float *bias, + float minval, float maxval, bool ifActiv) +{ +#if FAST_CONV_MR == 4 && FAST_CONV_NR == 12 + { + float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0; + float32x4_t c3 = vdupq_n_f32(bias[1]), c4 = c3, c5 = c3; + float32x4_t c6 = vdupq_n_f32(bias[2]), c7 = c6, c8 = c6; + float32x4_t c9 = vdupq_n_f32(bias[3]), c10 = c9, c11 = c9; + + float32x4_t a0 = vdupq_n_f32(0.0f); + float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); + + for (int p = 0; p < k; p++, a += FAST_CONV_MR, b += FAST_CONV_NR) + { + a0 = vld1q_f32(a); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + + c0 = vfmaq_laneq_f32(c0, b0, a0, 0); + c1 = vfmaq_laneq_f32(c1, b1, a0, 0); + c2 = vfmaq_laneq_f32(c2, b2, a0, 0); + c3 = vfmaq_laneq_f32(c3, b0, a0, 1); + c4 = vfmaq_laneq_f32(c4, b1, a0, 1); + c5 = vfmaq_laneq_f32(c5, b2, a0, 1); + + c6 = vfmaq_laneq_f32(c6, b0, a0, 2); + c7 = vfmaq_laneq_f32(c7, b1, a0, 2); + c8 = vfmaq_laneq_f32(c8, b2, a0, 2); + + c9 = vfmaq_laneq_f32(c9, b0, a0, 3); + c10 = vfmaq_laneq_f32(c10, b1, a0, 3); + c11 = vfmaq_laneq_f32(c11, b2, a0, 3); + } + + if (ifActiv) + { + b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval); + 
c0 = vminq_f32(vmaxq_f32(c0, b0), b1); + c1 = vminq_f32(vmaxq_f32(c1, b0), b1); + c2 = vminq_f32(vmaxq_f32(c2, b0), b1); + c3 = vminq_f32(vmaxq_f32(c3, b0), b1); + c4 = vminq_f32(vmaxq_f32(c4, b0), b1); + c5 = vminq_f32(vmaxq_f32(c5, b0), b1); + c6 = vminq_f32(vmaxq_f32(c6, b0), b1); + c7 = vminq_f32(vmaxq_f32(c7, b0), b1); + c8 = vminq_f32(vmaxq_f32(c8, b0), b1); + c9 = vminq_f32(vmaxq_f32(c9, b0), b1); + c10 = vminq_f32(vmaxq_f32(c10, b0), b1); + c11 = vminq_f32(vmaxq_f32(c11, b0), b1); + } + vst1q_f32(c, c0); vst1q_f32(c+4, c1); vst1q_f32(c+8, c2); + vst1q_f32(c + ldc, c3); vst1q_f32(c + ldc + 4, c4); vst1q_f32(c + ldc + 8, c5); + vst1q_f32(c + ldc*2, c6); vst1q_f32(c + ldc*2 + 4, c7); vst1q_f32(c + ldc*2 + 8, c8); + vst1q_f32(c + ldc*3, c9); vst1q_f32(c + ldc*3 + 4, c10); vst1q_f32(c + ldc*3 + 8, c11); + } +#elif FAST_CONV_MR == 4 && FAST_CONV_NR == 28 + { + float32x4_t c0 = vdupq_n_f32(bias[0]), c1 = c0, c2 = c0, c3 = c0, c4 = c0, c5 = c0, c24 = c0; + float32x4_t c6 = vdupq_n_f32(bias[1]), c7 = c6, c8 = c6, c9 = c6, c10 = c6, c11 = c6, c25 = c6; + float32x4_t c12 = vdupq_n_f32(bias[2]), c13 = c12, c14 = c12, c15 = c12, c16 = c12, c17 = c12, c26 = c12; + float32x4_t c18 = vdupq_n_f32(bias[3]), c19 = c18, c20 = c18, c21 = c18, c22 = c18, c23 = c18, c27 = c18; + + float32x4_t a0 = vdupq_n_f32(0.0f); + float32x4_t b0 = vdupq_n_f32(0.0f), b1 = vdupq_n_f32(0.0f), b2 = vdupq_n_f32(0.0f); + + for (int p = 0; p < k; p++, a += FAST_CONV_MR) { + a0 = vld1q_f32(a); + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + b += 12; + + c0 = vfmaq_laneq_f32(c0, b0, a0, 0); + c1 = vfmaq_laneq_f32(c1, b1, a0, 0); + c2 = vfmaq_laneq_f32(c2, b2, a0, 0); + c6 = vfmaq_laneq_f32(c6, b0, a0, 1); + c7 = vfmaq_laneq_f32(c7, b1, a0, 1); + c8 = vfmaq_laneq_f32(c8, b2, a0, 1); + c12 = vfmaq_laneq_f32(c12, b0, a0, 2); + c13 = vfmaq_laneq_f32(c13, b1, a0, 2); + c14 = vfmaq_laneq_f32(c14, b2, a0, 2); + c18 = vfmaq_laneq_f32(c18, b0, a0, 3); + c19 = vfmaq_laneq_f32(c19, b1, a0, 3); + c20 = vfmaq_laneq_f32(c20, b2, a0, 3); + + b0 = vld1q_f32(b), b1 = vld1q_f32(b + 4), b2 = vld1q_f32(b + 8); + b += 12; + + c3 = vfmaq_laneq_f32(c3, b0, a0, 0); + c4 = vfmaq_laneq_f32(c4, b1, a0, 0); + c5 = vfmaq_laneq_f32(c5, b2, a0, 0); + + c9 = vfmaq_laneq_f32(c9, b0, a0, 1); + c10 = vfmaq_laneq_f32(c10, b1, a0, 1); + c11 = vfmaq_laneq_f32(c11, b2, a0, 1); + + c15 = vfmaq_laneq_f32(c15, b0, a0, 2); + c16 = vfmaq_laneq_f32(c16, b1, a0, 2); + c17 = vfmaq_laneq_f32(c17, b2, a0, 2); + + c21 = vfmaq_laneq_f32(c21, b0, a0, 3); + + b0 = vld1q_f32(b); + b += 4; + + c22 = vfmaq_laneq_f32(c22, b1, a0, 3); + c23 = vfmaq_laneq_f32(c23, b2, a0, 3); + + c24 = vfmaq_laneq_f32(c24, b0, a0, 0); + c25 = vfmaq_laneq_f32(c25, b0, a0, 1); + c26 = vfmaq_laneq_f32(c26, b0, a0, 2); + c27 = vfmaq_laneq_f32(c27, b0, a0, 3); + } + + if (ifActiv) { + b0 = vdupq_n_f32(minval), b1 = vdupq_n_f32(maxval); + c0 = vminq_f32(vmaxq_f32(c0, b0), b1); + c1 = vminq_f32(vmaxq_f32(c1, b0), b1); + c2 = vminq_f32(vmaxq_f32(c2, b0), b1); + c3 = vminq_f32(vmaxq_f32(c3, b0), b1); + c4 = vminq_f32(vmaxq_f32(c4, b0), b1); + c5 = vminq_f32(vmaxq_f32(c5, b0), b1); + c6 = vminq_f32(vmaxq_f32(c6, b0), b1); + c7 = vminq_f32(vmaxq_f32(c7, b0), b1); + c8 = vminq_f32(vmaxq_f32(c8, b0), b1); + c9 = vminq_f32(vmaxq_f32(c9, b0), b1); + c10 = vminq_f32(vmaxq_f32(c10, b0), b1); + c11 = vminq_f32(vmaxq_f32(c11, b0), b1); + c12 = vminq_f32(vmaxq_f32(c12, b0), b1); + c13 = vminq_f32(vmaxq_f32(c13, b0), b1); + c14 = vminq_f32(vmaxq_f32(c14, b0), b1); + c15 = vminq_f32(vmaxq_f32(c15, 
b0), b1); + c16 = vminq_f32(vmaxq_f32(c16, b0), b1); + c17 = vminq_f32(vmaxq_f32(c17, b0), b1); + c18 = vminq_f32(vmaxq_f32(c18, b0), b1); + c19 = vminq_f32(vmaxq_f32(c19, b0), b1); + c20 = vminq_f32(vmaxq_f32(c20, b0), b1); + c21 = vminq_f32(vmaxq_f32(c21, b0), b1); + c22 = vminq_f32(vmaxq_f32(c22, b0), b1); + c23 = vminq_f32(vmaxq_f32(c23, b0), b1); + c24 = vminq_f32(vmaxq_f32(c24, b0), b1); + c25 = vminq_f32(vmaxq_f32(c25, b0), b1); + c26 = vminq_f32(vmaxq_f32(c26, b0), b1); + c27 = vminq_f32(vmaxq_f32(c27, b0), b1); + } + vst1q_f32(c, c0); + vst1q_f32(c + 4, c1); + vst1q_f32(c + 8, c2); + vst1q_f32(c + 12, c3); + vst1q_f32(c + 16, c4); + vst1q_f32(c + 20, c5); + vst1q_f32(c + 24, c24); + + vst1q_f32(c + ldc, c6); + vst1q_f32(c + ldc + 4, c7); + vst1q_f32(c + ldc + 8, c8); + vst1q_f32(c + ldc + 12, c9); + vst1q_f32(c + ldc + 16, c10); + vst1q_f32(c + ldc + 20, c11); + vst1q_f32(c + ldc + 24, c25); + + vst1q_f32(c + ldc * 2, c12); + vst1q_f32(c + ldc * 2 + 4, c13); + vst1q_f32(c + ldc * 2 + 8, c14); + vst1q_f32(c + ldc * 2 + 12, c15); + vst1q_f32(c + ldc * 2 + 16, c16); + vst1q_f32(c + ldc * 2 + 20, c17); + vst1q_f32(c + ldc * 2 + 24, c26); + + vst1q_f32(c + ldc * 3, c18); + vst1q_f32(c + ldc * 3 + 4, c19); + vst1q_f32(c + ldc * 3 + 8, c20); + vst1q_f32(c + ldc * 3 + 12, c21); + vst1q_f32(c + ldc * 3 + 16, c22); + vst1q_f32(c + ldc * 3 + 20, c23); + vst1q_f32(c + ldc * 3 + 24, c27); + } +#else +#error "unsupported FAST_CONV_MR and/or FAST_CONV_NR in convBlock_NEON." +#endif +} + +#endif +} // namespace opt_NEON + +} // namespace cv +#endif //OPENCV_FAST_CONVOLUTION_SIMD_HPP diff --git a/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp new file mode 100644 index 0000000..7a0720f --- /dev/null +++ b/modules/dnn/src/layers/fast_convolution/winograd_3x3s1_f63.cpp @@ -0,0 +1,1351 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + +/* +Winograd-based convolution F(6x6, 3x3). +The code has been borrowed from ncnn inference engine (https://github.com/Tencent/ncnn) +and adapted for OpenCV by Zihao Mu. + +Below is the original copyright +*/ + +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
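+
+// Winograd convolution F(6x6, 3x3): a stride-1 3x3 convolution is evaluated on
+// 8x8 input tiles (WINO_SIZE = WINO_STEP + WINO_KSIZE - 1), each of which yields
+// a 6x6 block of outputs, with the per-tile products computed in the transform
+// domain. The NEON code below packs FAST_VEC_NLANES = 4 channels per 128-bit
+// register.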
+
+#include "../../precomp.hpp"
+#include "fast_convolution.hpp"
+
+namespace cv { namespace dnn {
+enum
+{
+    WINO_STEP = 6,
+    WINO_KSIZE = 3,
+    WINO_SIZE = WINO_STEP + WINO_KSIZE - 1,
+    WINO_AREA = WINO_SIZE * WINO_SIZE
+};
+
+#if CV_NEON
+static void winograd_trans_input_F63(float* src, float* dst, int Channle_div4, const int tiles, const int big_step, const int line_step, const int* ofstab0)
+{
+    // const float itm[8][8] = {
+    //     {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
+    //
+    //     {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
+    //     {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
+    //
+    //     {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
+    //     {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
+    //
+    //     {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
+    //     {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
+    //
+    //     {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
+    // };
+
+    // 0 = r00 - r06 + (r04 - r02) * 5.25
+    // 7 = r07 - r01 + (r03 - r05) * 5.25
+
+    // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
+    // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
+
+    // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
+    // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
+
+    // reuse r04 * 1.25
+    // reuse r03 * 2.5
+    // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
+    // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
+
+    float tmp[8][8][FAST_VEC_NLANES];
+    AutoBuffer<float> input_buf0_;
+    input_buf0_.allocate(64 * tiles * FAST_VEC_NLANES);
+
+    float* input_buf0 = input_buf0_.data();
+    memset(input_buf0, 0, 64 * tiles * FAST_VEC_NLANES * sizeof(float));
+
+    for (int ti = 0; ti < tiles; ti++)
+    {
+        float* input0 = src + ti * 64 * 4;
+        float* input = input0;
+        for (int m = 0; m < 8; m++)
+        {
+            float32x4_t _r00 = vld1q_f32(input);
+            float32x4_t _r01 = vld1q_f32(input + 4);
+            float32x4_t _r02 = vld1q_f32(input + 8);
+            float32x4_t _r03 = vld1q_f32(input + 12);
+            float32x4_t _r04 = vld1q_f32(input + 16);
+            float32x4_t _r05 = vld1q_f32(input + 20);
+            float32x4_t _r06 = vld1q_f32(input + 24);
+            float32x4_t _r07 = vld1q_f32(input + 28);
+
+            float32x4_t _tmp0m = vmlaq_n_f32(vsubq_f32(_r00, _r06), vsubq_f32(_r04, _r02), 5.25f);
+            float32x4_t _tmp7m = vmlaq_n_f32(vsubq_f32(_r07, _r01), vsubq_f32(_r03, _r05), 5.25f);
+            vst1q_f32(tmp[0][m], _tmp0m);
+            vst1q_f32(tmp[7][m], _tmp7m);
+
+            float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_r02, _r06), _r04, 4.25f);
+            float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_r01, _r05), _r03, 4.25f);
+
+            float32x4_t _tmp1m = vaddq_f32(_tmp12a, _tmp12b);
+            float32x4_t _tmp2m = vsubq_f32(_tmp12a, _tmp12b);
+            vst1q_f32(tmp[1][m], _tmp1m);
+            vst1q_f32(tmp[2][m], _tmp2m);
+
+            float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_r06, _r02, 0.25f), _r04, 1.25f);
+            float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 0.5f), _r03, 2.5f), _r05, 2.f);
+
+            float32x4_t _tmp3m = vaddq_f32(_tmp34a, _tmp34b);
+            float32x4_t _tmp4m = vsubq_f32(_tmp34a, _tmp34b);
+            vst1q_f32(tmp[3][m], _tmp3m);
+            vst1q_f32(tmp[4][m], _tmp4m);
+
+            float32x4_t _tmp56a = vmlaq_n_f32(_r06, vmlsq_n_f32(_r02, _r04, 1.25f), 4.f);
+            float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 2.f), _r03, 2.5f), _r05, 0.5f);
+
+            float32x4_t _tmp5m = vaddq_f32(_tmp56a, _tmp56b);
+            float32x4_t _tmp6m = vsubq_f32(_tmp56a, _tmp56b);
+            vst1q_f32(tmp[5][m], _tmp5m);
+            vst1q_f32(tmp[6][m], _tmp6m);
+
+            input += 8 * FAST_VEC_NLANES;
+        }
+
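+        // Apply the same 8-point transform along the second dimension of tmp and
+        // store the 8x8 transformed tile into input_buf0 element-major: the four
+        // channel values of transform element r (r = 0..63) of tile ti land at
+        // input_buf0 + (r * tiles + ti) * FAST_VEC_NLANES, which is the layout
+        // the repack loop below (input_buf0 + 4 * tiles * r) expects.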
+        float* input_buf00 = input_buf0 + ti * 4;
+        float* input_buf01 = input_buf00 + tiles * 4;
+        float* input_buf02 = input_buf00 + tiles * 8;
+        float* input_buf03 = input_buf00 + tiles * 12;
+        float* input_buf04 = input_buf00 + tiles * 16;
+        float* input_buf05 = input_buf00 + tiles * 20;
+        float* input_buf06 = input_buf00 + tiles * 24;
+        float* input_buf07 = input_buf00 + tiles * 28;
+
+        for (int m = 0; m < 8; m++)
+        {
+            float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
+            float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
+            float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
+            float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
+            float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
+            float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
+            float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
+            float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
+
+            float32x4_t _r0tm0 = vmlaq_n_f32(vsubq_f32(_tmp00, _tmp06), vsubq_f32(_tmp04, _tmp02), 5.25f);
+            float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f);
+
+            float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f);
+            float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f);
+
+            float32x4_t _r0tm1 = vaddq_f32(_tmp12a, _tmp12b);
+            float32x4_t _r0tm2 = vsubq_f32(_tmp12a, _tmp12b);
+
+            float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f);
+            float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f);
+
+            float32x4_t _r0tm3 = vaddq_f32(_tmp34a, _tmp34b);
+            float32x4_t _r0tm4 = vsubq_f32(_tmp34a, _tmp34b);
+
+            float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f);
+            float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f);
+
+            float32x4_t _r0tm5 = vaddq_f32(_tmp56a, _tmp56b);
+            float32x4_t _r0tm6 = vsubq_f32(_tmp56a, _tmp56b);
+
+            vst1q_f32(input_buf00, _r0tm0);
+            vst1q_f32(input_buf01, _r0tm1);
+            vst1q_f32(input_buf02, _r0tm2);
+            vst1q_f32(input_buf03, _r0tm3);
+            vst1q_f32(input_buf04, _r0tm4);
+            vst1q_f32(input_buf05, _r0tm5);
+            vst1q_f32(input_buf06, _r0tm6);
+            vst1q_f32(input_buf07, _r0tm7);
+
+            input_buf00 += tiles * 32;
+            input_buf01 += tiles * 32;
+            input_buf02 += tiles * 32;
+            input_buf03 += tiles * 32;
+            input_buf04 += tiles * 32;
+            input_buf05 += tiles * 32;
+            input_buf06 += tiles * 32;
+            input_buf07 += tiles * 32;
+        }
+    }
+
+    // [line Number, input pack]
+    // if InpPack == 8;
+    for (int r = 0; r < 64; r++)
+    {
+        int ti = 0;
+        float* out0 = dst + r * big_step;
+        float* input0 = input_buf0 + 4 * tiles * r;
+
+        // TODO! support tiles > 12
+//#if (ARMV8)
+//        for (; ti + 11 < tiles; ti += 12)
+//        {
+//            float* out1 = out0 + line_step * ofstab0[ti * 2] + Channle_div4 * ofstab0[ti * 2 + 1] * 4;
+////            std::cout<<"ofstab0[ti * 2] = "<
(2, 7) << 0, 7, 0.991359, 491.822, 81.1668, 702.573, 178.234, 0, 12, 0.94786, 132.093, 223.903, 338.077, 566.16);
-    float confThreshold = 0.8, scoreDiff = 0.017, iouDiff = 0.11;
+    float confThreshold = 0.8, scoreDiff = 0.15, iouDiff = 0.11;
     testFaster(net, ref, confThreshold, scoreDiff, iouDiff);
 }
@@ -1114,7 +1114,7 @@ TEST_P(Test_Int8_nets, YoloVoc)
     std::string config_file = "yolo-voc.cfg";
     std::string weights_file = "yolo-voc.weights";
-    double scoreDiff = 0.1, iouDiff = 0.3;
+    double scoreDiff = 0.12, iouDiff = 0.3;
     {
         SCOPED_TRACE("batch size 1");
         testDarknetModel(config_file, weights_file, ref.rowRange(0, 3), scoreDiff, iouDiff);
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 582d8b0..72a8989 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -1336,7 +1336,7 @@ TEST_P(Test_TensorFlow_nets, EAST_text_detection)
     }
     else
     {
-        l1_geometry = 1e-4, lInf_geometry = 3e-3;
+        l1_geometry = 1e-4, lInf_geometry = 4.3e-3;
     }
     normAssert(scores, blobFromNPY(refScoresPath), "scores", l1_scores, lInf_scores);
     normAssert(geometry, blobFromNPY(refGeometryPath), "geometry", l1_geometry, lInf_geometry);
-- 
2.7.4
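The commented formulas in winograd_trans_input_F63 above are the rows of the 8x8 input-transform matrix B^T of F(6x6, 3x3). For reference, the same transform written as scalar code is sketched below; the helper names and plain-array layout are illustrative only and are not part of the patch, which evaluates these formulas with NEON on four channels at a time, first along one dimension of each 8x8 tile and then along the other.

// Illustrative scalar sketch (not part of the patch): apply the 8-point
// F(6x6, 3x3) input transform B^T, as written in the comments of
// winograd_trans_input_F63, to one 8-element vector r[0..7].
static void winoF63InputTransform1D(const float r[8], float d[8])
{
    d[0] = r[0] - r[6] + (r[4] - r[2]) * 5.25f;
    d[7] = r[7] - r[1] + (r[3] - r[5]) * 5.25f;

    float t12a = r[2] + r[6] - r[4] * 4.25f;
    float t12b = r[1] + r[5] - r[3] * 4.25f;
    d[1] = t12a + t12b;
    d[2] = t12a - t12b;

    float t34a = r[6] + r[2] * 0.25f - r[4] * 1.25f;
    float t34b = r[1] * 0.5f - r[3] * 2.5f + r[5] * 2.0f;
    d[3] = t34a + t34b;
    d[4] = t34a - t34b;

    float t56a = r[6] + (r[2] - r[4] * 1.25f) * 4.0f;
    float t56b = r[1] * 2.0f - r[3] * 2.5f + r[5] * 0.5f;
    d[5] = t56a + t56b;
    d[6] = t56a - t56b;
}

// The 2-D tile transform applies the 1-D transform to the rows and then to the
// columns of an 8x8 tile, mirroring the two passes of winograd_trans_input_F63.
static void winoF63InputTransform2D(const float in[8][8], float out[8][8])
{
    float tmp[8][8], col[8], dcol[8];
    for (int i = 0; i < 8; i++)
        winoF63InputTransform1D(in[i], tmp[i]);
    for (int j = 0; j < 8; j++)
    {
        for (int i = 0; i < 8; i++)
            col[i] = tmp[i][j];
        winoF63InputTransform1D(col, dcol);
        for (int i = 0; i < 8; i++)
            out[i][j] = dcol[i];
    }
}

This is also where the speed-up comes from: in the transform domain each 8x8 tile needs 64 multiplications per input/output channel pair to produce a 6x6 output block, versus 6*6*3*3 = 324 multiplications for direct convolution, at the cost of the input/output transforms and extra temporary memory.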