From: Sergei Slashchinin <62052793+sl-sergei@users.noreply.github.com>
Date: Fri, 13 Nov 2020 22:22:10 +0000 (+0300)
Subject: Merge pull request #18783 from sl-sergei:fix_conv1d
X-Git-Tag: accepted/tizen/unified/20220125.121719~1^2~1^2~309
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=61144f935efaae03d506ab2b54ee02b3bc1a4452;p=platform%2Fupstream%2Fopencv.git

Merge pull request #18783 from sl-sergei:fix_conv1d

Add support for Conv1D on OpenCV backend

* Add support for Conv1D on OpenCV backend

* disable tests on other targets/backends

* Fix formatting

* Restore comment

* Remove unnecessary flag and fix test logic

* Fix perf test

* fix braces

* Fix indentation, assert check and remove unnecessary condition

* Remove unnecessary changes

* Add test cases for variable weights and bias

* dnn(conv): fallback on OpenCV+CPU instead of failures

* coding style
---
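The feature is easiest to see from user code. A minimal sketch (not part of this patch; the layer name and sizes are illustrative) that builds a single 1-D convolution through the DNN API and runs it on the OpenCV CPU backend, mirroring the new perf test below:

    #include <opencv2/dnn.hpp>
    using namespace cv;
    using namespace cv::dnn;

    int main()
    {
        // 3-D weights: outCN x (inCN/groups) x K
        int wsz[] = {6, 6, 3};
        Mat weights(3, wsz, CV_32F);
        randu(weights, -1.0f, 1.0f);

        int kernel = 3, stride = 1, dilation = 1, pad[2] = {0, 0};
        LayerParams lp;
        lp.type = "Convolution";
        lp.name = "conv1d";  // illustrative layer name
        lp.set("kernel_size", DictValue::arrayInt(&kernel, 1));
        lp.set("stride", DictValue::arrayInt(&stride, 1));
        lp.set("dilation", DictValue::arrayInt(&dilation, 1));
        lp.set("pad", DictValue::arrayInt(&pad[0], 2));
        lp.set("num_output", 6);
        lp.set("bias_term", false);
        lp.blobs.push_back(weights);

        int isz[] = {1, 6, 10};  // 1-D conv consumes N x C x W blobs
        Mat input(3, isz, CV_32F);
        randu(input, -1.0f, 1.0f);

        Net net;
        net.addLayerToPrev(lp.name, lp.type, lp);
        net.setInput(input);
        net.setPreferableBackend(DNN_BACKEND_OPENCV);
        net.setPreferableTarget(DNN_TARGET_CPU);
        Mat out = net.forward();  // shape 1 x 6 x 8 for these sizes
        return 0;
    }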
+ +#include "perf_precomp.hpp" +#include + +namespace opencv_test { + +struct Conv1DParam_t { + int kernel; + struct BlobShape { int dims[3]; } shapeIn; + int outCN; + int groups; + int stride; + int dilation; + int pad[2]; + const char* padMode; + bool hasBias; + double declared_flops; +}; +// Details: #12142 +static const Conv1DParam_t testConvolution1DConfigs[] = { + {3, {{1, 6, 10}}, 6, 1, 1, 1, {0, 0}, "VALID", true, 1776.}, + {3, {{1, 2, 19}}, 2, 2, 2, 1, {1, 1}, "", true, 260.}, + {3, {{1, 2, 25}}, 2, 2, 1, 1, {2, 2}, "SAME", false, 650.}, +}; + +struct Conv1DParamID +{ + enum { + CONV_0 = 0, + CONV_LAST = sizeof(testConvolution1DConfigs) / sizeof(testConvolution1DConfigs[0]) + }; + int val_; + Conv1DParamID(int val = 0) : val_(val) {} + operator int() const { return val_; } + static ::testing::internal::ParamGenerator all() + { + enum { NUM = (int)CONV_LAST }; + Conv1DParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = Conv1DParamID(i); } // reduce generated code size + return ::testing::ValuesIn(v_, v_ + NUM); + } +}; +static inline void PrintTo(const Conv1DParamID& v, std::ostream* os) +{ + CV_Assert((int)v >= 0); CV_Assert((int)v < Conv1DParamID::CONV_LAST); + const Conv1DParam_t& p = testConvolution1DConfigs[(int)v]; + + *os << "GFLOPS=" << cv::format("%.3f", p.declared_flops * 1e-9) + << ", K=[" << p.kernel << "]" + << ", IN={" << p.shapeIn.dims[0] << ", " << p.shapeIn.dims[1] << ", " << p.shapeIn.dims[2] << "}" + << ", OCN=" << p.outCN; + if (p.groups > 1) + *os << ", G=" << p.groups; + if (p.stride != 1) + *os << ", S=" << p.stride; + if (p.dilation != 1) + *os << ", D=" << p.dilation; + if (p.pad[0] != 0 && p.pad[1] != 0 ) + *os << ", P=(" << p.pad[0] << ", " << p.pad[1] << ")"; + if (!((std::string)p.padMode).empty()) + *os << ", PM=" << ((std::string)p.padMode); + if (p.hasBias) + *os << ", BIAS"; +} + + +typedef tuple > Conv1DTestParam_t; +typedef TestBaseWithParam Conv1D; + +PERF_TEST_P_(Conv1D, conv1d) +{ + int test_id = (int)get<0>(GetParam()); + ASSERT_GE(test_id, 0); ASSERT_LT(test_id, Conv1DParamID::CONV_LAST); + const Conv1DParam_t& params = testConvolution1DConfigs[test_id]; + double declared_flops = params.declared_flops; + + DictValue kernel = DictValue::arrayInt(¶ms.kernel, 1); + DictValue stride = DictValue::arrayInt(¶ms.stride, 1); + DictValue pad = DictValue::arrayInt(¶ms.pad[0], 2); + DictValue dilation = DictValue::arrayInt(¶ms.dilation, 1); + + MatShape inputShape = MatShape(params.shapeIn.dims, params.shapeIn.dims + 3); + int outChannels = params.outCN; + int groups = params.groups; + std::string padMode(params.padMode); + + bool hasBias = params.hasBias; + Backend backendId = get<0>(get<1>(GetParam())); + Target targetId = get<1>(get<1>(GetParam())); + + if (targetId != DNN_TARGET_CPU) + throw SkipTestException("Only CPU is supported"); + + int inChannels = inputShape[1]; + + int sz[] = {outChannels, inChannels / groups, params.kernel}; + Mat weights(3, &sz[0], CV_32F); + randu(weights, -1.0f, 1.0f); + + LayerParams lp; + lp.set("kernel_size", kernel); + lp.set("pad", pad); + if (!padMode.empty()) + lp.set("pad_mode", padMode); + + lp.set("stride", stride); + lp.set("dilation", dilation); + lp.set("num_output", outChannels); + lp.set("group", groups); + lp.set("bias_term", hasBias); + lp.type = "Convolution"; + lp.name = "testLayer"; + lp.blobs.push_back(weights); + + if (hasBias) + { + Mat bias(1, outChannels, CV_32F); + randu(bias, -1.0f, 1.0f); + lp.blobs.push_back(bias); + } + + int inpSz[] = {1, inChannels, inputShape[2]}; + Mat input(3, 
&inpSz[0], CV_32F); + randu(input, -1.0f, 1.0f); + + Net net; + net.addLayerToPrev(lp.name, lp.type, lp); + + net.setInput(input); + net.setPreferableBackend(backendId); + net.setPreferableTarget(targetId); + + // warmup + Mat output = net.forward(); + + MatShape netInputShape = shape(input); + size_t weightsMemory = 0, blobsMemory = 0; + net.getMemoryConsumption(netInputShape, weightsMemory, blobsMemory); + int64 flops = net.getFLOPS(netInputShape); + CV_Assert(flops > 0); + + std::cout + << "IN=" << divUp(input.total() * input.elemSize(), 1u<<10) << " Kb " << netInputShape + << " OUT=" << divUp(output.total() * output.elemSize(), 1u<<10) << " Kb " << shape(output) + << " Weights(parameters): " << divUp(weightsMemory, 1u<<10) << " Kb" + << " MFLOPS=" << flops * 1e-6 << std::endl; + + TEST_CYCLE() + { + Mat res = net.forward(); + } + EXPECT_NEAR(flops, declared_flops, declared_flops * 1e-6); + SANITY_CHECK_NOTHING(); +} + +INSTANTIATE_TEST_CASE_P(/**/, Conv1D, Combine( + Conv1DParamID::all(), + dnnBackendsAndTargets(false, false) // defined in ../test/test_common.hpp +)); + +} // namespace diff --git a/modules/dnn/perf/perf_convolution3d.cpp b/modules/dnn/perf/perf_convolution3d.cpp index 1f512b2..0cf4ce2 100644 --- a/modules/dnn/perf/perf_convolution3d.cpp +++ b/modules/dnn/perf/perf_convolution3d.cpp @@ -46,7 +46,7 @@ struct Conv3DParamID CONV_100 = 16, CONV_LAST = sizeof(testConvolution3DConfigs) / sizeof(testConvolution3DConfigs[0]) }; - int val_; \ + int val_; Conv3DParamID(int val = 0) : val_(val) {} operator int() const { return val_; } static ::testing::internal::ParamGenerator all() @@ -59,7 +59,7 @@ struct Conv3DParamID Conv3DParamID v_[NUM]; for (int i = 0; i < NUM; ++i) { v_[i] = Conv3DParamID(i); } // reduce generated code size return ::testing::ValuesIn(v_, v_ + NUM); } -}; \ +}; static inline void PrintTo(const Conv3DParamID& v, std::ostream* os) { CV_Assert((int)v >= 0); CV_Assert((int)v < Conv3DParamID::CONV_LAST); diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp index 473c07b..c8245c4 100644 --- a/modules/dnn/src/layers/convolution_layer.cpp +++ b/modules/dnn/src/layers/convolution_layer.cpp @@ -113,17 +113,22 @@ public: MatSize weightShape = blobs.empty() ? 
@@ -325,18 +337,27 @@ public:
         inputs_arr.getMatVector(inputs);
         // prepare weightsMat where each row is aligned and has enough zero padding on the right to
         // use vectorized (i.e. with intrinsics) loops without tail processing
-        Mat wm = blobs.empty() ? inputs[1].reshape(1, numOutput) : blobs[0].reshape(1, numOutput);
-        if( wm.step1() % VEC_ALIGN != 0 )
-        {
-            int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
-            Mat wm_buffer = Mat(numOutput, newcols, wm.type());
-            Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
-            wm_padding.setTo(Scalar::all(0.));
-            Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
-            wm.copyTo(wm_aligned);
-            wm = wm_aligned;
-        }
-        weightsMat = wm;
+        if (!blobs.empty())
+        {
+            Mat wm = blobs[0].reshape(1, numOutput);
+            if( wm.step1() % VEC_ALIGN != 0 )
+            {
+                int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
+                Mat wm_buffer = Mat(numOutput, newcols, wm.type());
+                Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
+                wm_padding.setTo(Scalar::all(0.));
+                Mat wm_aligned = wm_buffer.colRange(0, wm.cols);
+                wm.copyTo(wm_aligned);
+                wm = wm_aligned;
+            }
+            weightsMat = wm;
+        }
+        else
+        {
+            // initialized in .forward()
+            weightsMat.release();
+        }
+
         weightsMultipliers.assign(numOutput, 1.0);
 
         Mat biasMat = hasBias() ? blobs[1].reshape(1, numOutput) : Mat();
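The retained alignment logic pads every row of the weight matrix with zeros up to a multiple of VEC_ALIGN floats, so the vectorized inner loops can load whole registers without tail processing. A standalone sketch of the same idea (the function name and the choice to return a view are illustrative, not part of the patch):

    #include <opencv2/core.hpp>
    using namespace cv;

    // Pad each row of a row-major float matrix to a multiple of `valign`
    // elements; the tail of every row stays zero so SIMD loads never pick
    // up garbage. Mirrors the weightsMat preparation above.
    static Mat alignRows(const Mat& wm, int valign)
    {
        if (wm.step1() % valign == 0)
            return wm;                                    // already aligned
        int newcols = (int)alignSize(wm.step1(), valign); // next multiple of valign
        Mat buf(wm.rows, newcols, wm.type(), Scalar::all(0.));
        wm.copyTo(buf.colRange(0, wm.cols));
        return buf.colRange(0, wm.cols);                  // view with padded stride
    }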
@@ -678,8 +699,11 @@ public:
     {
         size_t karea = std::accumulate(kernel_size.begin(), kernel_size.end(), 1, std::multiplies<size_t>());
-        CV_Assert_N(
-                   (input.dims == 4 || input.dims == 5) && (input.dims == output.dims),
+        bool isConv1D = input.dims == 3;
+        bool isConv2D = input.dims == 4;
+        bool isConv3D = input.dims == 5;
+        CV_CheckEQ(static_cast<int>(kernel_size.size()), input.dims - 2, "");
+        CV_Assert_N(input.dims == output.dims,
                    input.size[0] == output.size[0],
                    weights.rows == output.size[1],
                    weights.cols == (input.size[1]/ngroups)*karea,
@@ -689,12 +713,15 @@ public:
                    input.isContinuous(),
                    output.isContinuous(),
                    biasvec.size() == (size_t)output.size[1]+2);
+        CV_Check(weights.step1(), weights.step1() % VEC_ALIGN == 0, "");
+        CV_CheckType(weights.type(), CV_32FC1, "");
         ParallelConv p;
 
         p.input_ = &input;
         p.weights_ = &weights;
         p.output_ = &output;
-        for( int i = 0; i < 4; i++ ) p.outShape[i] = output.size[i];
+        int max_ind = isConv1D? 3: 4;
+        for( int i = 0; i < max_ind; i++ ) p.outShape[i] = output.size[i];
         p.outShape[1] /= ngroups;
         p.kernel_size = kernel_size; p.strides = strides; p.dilations = dilations;
@@ -706,20 +733,19 @@ public:
         int inpCnAll = input.size[1];
         int depth = (input.dims == 5) ? input.size[2] : 1;
         int width = input.size[input.dims - 1];
-        int height = input.size[input.dims - 2];
+        int height = isConv1D? 1 : input.size[input.dims - 2];
         int inpCn = inpCnAll / ngroups;
 
-        bool isConv2D = kernel_size.size() == 2;
-
-        p.is1x1_ = isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 &&
-                   pads_begin[0] == 0 && pads_begin[1] == 0;
+        p.is1x1_ = (isConv2D && kernel_size[0] == 1 && kernel_size[1] == 1 &&
+                   pads_begin[0] == 0 && pads_begin[1] == 0) ||
+                   (isConv1D && pads_begin[0] == 0 && kernel_size[0] == 1);
 
         p.useAVX    = checkHardwareSupport(CPU_AVX)  && isConv2D;
         p.useAVX2   = checkHardwareSupport(CPU_AVX2) && isConv2D;
         p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX  && isConv2D;
 
-        int kernel_d = !isConv2D? kernel_size[0] : 1;
-        int kernel_h = kernel_size[kernel_size.size() - 2];
+        int kernel_d = isConv3D? kernel_size[0] : 1;
+        int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
         int kernel_w = kernel_size.back();
 
         int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
@@ -729,14 +755,20 @@ public:
         ncn = std::min(ncn, inpCn);
         p.blk_size_cn = ncn;
 
-        int dil_d = !isConv2D? dilations[0] : 1;
-        int dil_h = dilations[dilations.size() - 2];
+        int dil_d = isConv3D? dilations[0] : 1;
+        int dil_h = isConv1D? 1 : dilations[dilations.size() - 2];
         int dil_w = dilations.back();
 
         p.ofstab_.resize(karea * ncn);
         int* ofstab = &p.ofstab_[0];
 
-        if (isConv2D)
+        if (isConv1D)
+        {
+            for( int k = 0; k < ncn; k++ )
+                for( int k_c = 0; k_c < kernel_w; k_c++ )
+                    ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w;
+        }
+        else if (isConv2D)
         {
             for( int k = 0; k < ncn; k++ )
                 for( int k_r = 0; k_r < kernel_h; k_r++ )
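The new 1-D offset table is easy to verify by hand. With illustrative sizes width = 10, kernel_w = 3, dil_w = 1 and ncn = 2 channels, the formula ofstab[k*kernel_w + k_c] = k*width + k_c*dil_w yields

    ofstab = {  0,  1,  2,    // channel 0 (k = 0)
               10, 11, 12 }   // channel 1 (k = 1)

so each entry is the offset of one kernel tap relative to imgptr, and a whole input window can be gathered with a single indexed loop.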
@@ -765,34 +797,36 @@ public:
         {
             const int valign = ConvolutionLayerImpl::VEC_ALIGN;
             int ngroups = ngroups_, batchSize = input_->size[0]*ngroups;
+            bool isConv1D = input_->dims == 3;
             bool isConv2D = input_->dims == 4;
+            bool isConv3D = input_->dims == 5;
             int outW = output_->size[output_->dims - 1];
-            int outH = output_->size[output_->dims - 2];
+            int outH = isConv1D? 1 : output_->size[output_->dims - 2];
             int outCn = output_->size[1]/ngroups;
 
-            int depth = !isConv2D? input_->size[2] : 1;
-            int height = input_->size[input_->dims - 2];
+            int depth = isConv3D? input_->size[2] : 1;
+            int height = isConv1D? 1 : input_->size[input_->dims - 2];
             int width = input_->size[input_->dims - 1];
             int inpCn = input_->size[1]/ngroups;
 
             const int nstripes = nstripes_;
 
-            int kernel_d = !isConv2D? kernel_size[0] : 1;
-            int kernel_h = kernel_size[kernel_size.size() - 2];
+            int kernel_d = isConv3D? kernel_size[0] : 1;
+            int kernel_h = isConv1D? 1 : kernel_size[kernel_size.size() - 2];
             int kernel_w = kernel_size.back();
             int karea = kernel_w*kernel_h*kernel_d;
 
-            int pad_d = !isConv2D? pads_begin[0] : 0;
-            int pad_t = pads_begin[pads_begin.size() - 2];
+            int pad_d = isConv3D? pads_begin[0] : 0;
+            int pad_t = isConv1D? 0 : pads_begin[pads_begin.size() - 2];
             int pad_l = pads_begin.back();
 
-            int stride_d = !isConv2D? strides[0] : 0;
-            int stride_h = strides[strides.size() - 2];
+            int stride_d = isConv3D? strides[0] : 0;
+            int stride_h = isConv1D? 0 : strides[strides.size() - 2];
             int stride_w = strides.back();
 
-            int dilation_d = !isConv2D? dilations[0] : 1;
-            int dilation_h = dilations[dilations.size() - 2];
+            int dilation_d = isConv3D? dilations[0] : 1;
+            int dilation_h = isConv1D? 1 : dilations[dilations.size() - 2];
             int dilation_w = dilations.back();
 
             int i, j, k, d;
@@ -1032,7 +1066,71 @@ public:
                     // do im2row for a part of input tensor
                     float* rowbuf = rowbuf0;
 
-                    if (isConv2D)
+                    if (isConv1D)
+                    {
+                        for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
+                        {
+                            int delta = std::min(ofs1 - ofs, outW - out_j);
+                            int out_j1 = out_j + delta;
+
+                            int in_j = out_j * stride_w - pad_l;
+                            const float* imgptr = data_inp0 + cn0*width + in_j;
+                            ofs += delta;
+
+                            // do im2row for a part of input tensor
+                            if( is1x1 )
+                            {
+                                for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w )
+                                {
+                                    for( k = 0; k < vsz; k++ )
+                                        rowbuf[k] = imgptr[k*inpPlaneSize];
+                                }
+                            }
+                            else
+                            {
+                                for( ; out_j < out_j1; out_j++, rowbuf += vsz_a, imgptr += stride_w, in_j += stride_w )
+                                {
+                                    // this condition should be true for most of the tensor elements, i.e.
+                                    // most of the time the kernel aperture is inside the tensor X-Y plane.
+                                    if( out_j + 2 <= out_j1 && 0 <= in_j && in_j + stride_w*2 <= width - (kernel_w-1)*dilation_w )
+                                    {
+                                        for( k = 0; k < vsz; k++ )
+                                        {
+                                            int k1 = ofstab[k];
+                                            float v0 = imgptr[k1];
+                                            float v1 = imgptr[k1 + stride_w];
+                                            rowbuf[k] = v0;
+                                            rowbuf[k+vsz_a] = v1;
+                                        }
+                                        out_j++;
+                                        rowbuf += vsz_a;
+                                        imgptr += stride_w;
+                                        in_j += stride_w;
+                                    }
+                                    else
+                                    {
+                                        int i0 = std::max(0, (-in_j + dilation_w-1)/dilation_w);
+                                        int i1 = std::min(kernel_w, (width - in_j + dilation_w-1)/dilation_w);
+
+                                        // here some non-continuous sub-row of the row will not be
+                                        // filled from the tensor; we need to make sure that the uncovered
+                                        // elements are explicitly set to 0's. the easiest way is to
+                                        // set all the elements to 0's before the loop.
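Stripped of the striping, channel blocking, and the unrolled two-outputs-per-iteration fast path, the 1-D im2row just added reduces to the following reference loop (a simplified sketch for exposition, not the shipped code; groups and vectorization padding are ignored):

    #include <algorithm>
    #include <cstring>

    // Reference im2row for 1-D convolution: for each output position out_j,
    // gather the ncn x kernel_w input window (stride/dilation/padding applied)
    // into one row of rowbuf. Out-of-range taps stay zero.
    static void im2row1d(const float* img, int ncn, int width,
                         int kernel_w, int stride_w, int dilation_w, int pad_l,
                         int outW, float* rowbuf /* outW x (ncn*kernel_w) */)
    {
        for (int out_j = 0; out_j < outW; out_j++)
        {
            float* row = rowbuf + out_j * ncn * kernel_w;
            std::memset(row, 0, sizeof(float) * ncn * kernel_w);
            int in_j = out_j * stride_w - pad_l;
            for (int k = 0; k < ncn; k++)
                for (int i = 0; i < kernel_w; i++)
                {
                    int x = in_j + i * dilation_w;
                    if (x >= 0 && x < width)
                        row[k * kernel_w + i] = img[k * width + x];
                }
        }
    }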
+                                        memset(rowbuf, 0, vsz*sizeof(rowbuf[0]));
+                                        for( k = 0; k < ncn; k++ )
+                                        {
+                                            for( i = i0; i < i1; i++ )
+                                            {
+                                                int imgofs = k*width + i*dilation_w;
+                                                rowbuf[k*kernel_w + i] = imgptr[imgofs];
+                                            }
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    else if (isConv2D)
                     {
                         if( is1x1 && stride_w == 1 && stride_h == 1 )
                         {
@@ -1265,9 +1363,12 @@ public:
                             vs12 = v_setzero_f32(), vs13 = v_setzero_f32();
                         for( k = 0; k < vsz; k += 4, rptr += 4 )
                         {
-                            v_float32x4 w0 = v_load_aligned(wptr0 + k), w1 = v_load_aligned(wptr1 + k);
-                            v_float32x4 r0 = v_load_aligned(rptr), r1 = v_load_aligned(rptr + vsz_a),
-                                        r2 = v_load_aligned(rptr + vsz_a*2), r3 = v_load_aligned(rptr + vsz_a*3);
+                            v_float32x4 w0 = v_load_aligned(wptr0 + k);
+                            v_float32x4 w1 = v_load_aligned(wptr1 + k);
+                            v_float32x4 r0 = v_load_aligned(rptr);
+                            v_float32x4 r1 = v_load_aligned(rptr + vsz_a);
+                            v_float32x4 r2 = v_load_aligned(rptr + vsz_a*2);
+                            v_float32x4 r3 = v_load_aligned(rptr + vsz_a*3);
 
                             vs00 += w0*r0;
                             vs01 += w0*r1;
@@ -1337,6 +1438,12 @@ public:
 #ifdef HAVE_OPENCL
     bool forward_ocl(InputArrayOfArrays inps, OutputArrayOfArrays outs, OutputArrayOfArrays internals)
     {
+        if (kernel_size.size() != 2)
+        {
+            // no OpenCL optimizations, see .supportBackend()
+            return false;
+        }
+
         std::vector<UMat> inputs;
         std::vector<UMat> outputs;
@@ -1520,26 +1627,35 @@ public:
         if (blobs.empty())
         {
             Mat wm = inputs[1].reshape(1, outCn);
-            if( wm.step1() % VEC_ALIGN != 0 )
+            if (wm.data != weightsMat.data)
             {
-                wm.copyTo(weightsMat);
+                int newcols = (int)alignSize(wm.step1(), VEC_ALIGN);
+                Mat wm_buffer = Mat(numOutput, newcols, wm.type());
+                Mat wm_padding = wm_buffer.colRange(wm.cols, newcols);
+                wm_padding.setTo(Scalar::all(0.));
+                weightsMat = wm_buffer.colRange(0, wm.cols);
+
+                wm.copyTo((const Mat&)weightsMat);
                 if (inputs.size() > 2)
                 {
                     Mat biasMat = inputs[2].reshape(1, outCn);
                     biasMat.col(0).copyTo(biasvec);
-                    biasvec.resize(outCn + 2);
-                }
-                else
-                {
-                    biasvec.resize(outCn + 2, 0);
                 }
+                biasvec.resize(outCn + 2, 0);
             }
         }
-
-        /*printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
-               name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3],
-               kernel.width, kernel.height, pad.width, pad.height,
-               stride.width, stride.height, dilation.width, dilation.height);*/
+        /*if (inputs[0].dims > 3) {
+            printf("conv %s: input (%d x %d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
+                   name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2], inputs[0].size[3],
+                   kernel.width, kernel.height, pad.width, pad.height,
+                   stride.width, stride.height, dilation.width, dilation.height);
+        }
+        else {
+            printf("conv %s: input (%d x %d x %d), kernel (%d x %d), pad (%d x %d), stride (%d x %d), dilation (%d x %d)\n",
+                   name.c_str(), inputs[0].size[0], inputs[0].size[1], inputs[0].size[2],
+                   kernel.width, kernel.height, pad.width, pad.height,
+                   stride.width, stride.height, dilation.width, dilation.height);
+        }*/
 
         int inpGroupCn = blobs.empty() ? inputs[1].size[1] : blobs[0].size[1];
         CV_Assert_N(inputs.size() >= (size_t)1, inputs[0].size[1] % inpGroupCn == 0,
                     outputs.size() == 1, inputs[0].data != outputs[0].data);
diff --git a/modules/dnn/src/onnx/onnx_importer.cpp b/modules/dnn/src/onnx/onnx_importer.cpp
index 56683f4..9443336 100644
--- a/modules/dnn/src/onnx/onnx_importer.cpp
+++ b/modules/dnn/src/onnx/onnx_importer.cpp
@@ -200,12 +200,12 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_prot
 
         if(attribute_name == "kernel_shape")
         {
-            CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
+            CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
             lp.set("kernel_size", parse(attribute_proto.ints()));
         }
         else if(attribute_name == "strides")
         {
-            CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
+            CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
             lp.set("stride", parse(attribute_proto.ints()));
         }
         else if(attribute_name == "pads")
@@ -229,7 +229,7 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_prot
             else
             {
                 // Convolution or pooling.
-                CV_Assert(attribute_proto.ints_size() == 4 || attribute_proto.ints_size() == 6);
+                CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 4 || attribute_proto.ints_size() == 6);
                 lp.set("pad", parse(attribute_proto.ints()));
             }
         }
@@ -244,7 +244,7 @@ LayerParams ONNXImporter::getLayerParams(const opencv_onnx::NodeProto& node_prot
         }
         else if(attribute_name == "dilations")
         {
-            CV_Assert(attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
+            CV_Assert(attribute_proto.ints_size() == 1 || attribute_proto.ints_size() == 2 || attribute_proto.ints_size() == 3);
             lp.set("dilation", parse(attribute_proto.ints()));
         }
         else if (attribute_proto.has_i())
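For reference, a 1-D ONNX Conv node carries single-element attribute lists (and a begin/end pair for pads), which is exactly what the relaxed assertions above now admit. Typical values for a kernel-3 convolution (illustrative, following the ONNX operator spec):

    kernel_shape = [3]     // ints_size() == 1
    strides      = [1]     // ints_size() == 1
    pads         = [1, 1]  // ints_size() == 2 (begin, end)
    dilations    = [1]     // ints_size() == 1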
diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp
index 993ba56..5c6de55 100644
--- a/modules/dnn/test/test_onnx_importer.cpp
+++ b/modules/dnn/test/test_onnx_importer.cpp
@@ -183,9 +183,14 @@ TEST_P(Test_ONNX_layers, Convolution3D)
 #if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
     applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_VERSION);
 #endif
-    if (target != DNN_TARGET_CPU)
-        throw SkipTestException("Only CPU is supported");
     testONNXModels("conv3d");
+}
+
+TEST_P(Test_ONNX_layers, Convolution3D_bias)
+{
+#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LT(2019010000)
+    applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_VERSION);
+#endif
     testONNXModels("conv3d_bias");
 }
 
@@ -648,6 +653,58 @@ TEST_P(Test_ONNX_layers, ResizeOpset11_Torch1_6)
     testONNXModels("resize_opset11_torch1.6");
 }
 
+TEST_P(Test_ONNX_layers, Conv1d)
+{
+    testONNXModels("conv1d");
+}
+
+TEST_P(Test_ONNX_layers, Conv1d_bias)
+{
+    testONNXModels("conv1d_bias");
+}
+
+TEST_P(Test_ONNX_layers, Conv1d_variable_weight)
+{
+    String basename = "conv1d_variable_w";
+    Net net = readNetFromONNX(_tf("models/" + basename + ".onnx"));
+    ASSERT_FALSE(net.empty());
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    Mat input = blobFromNPY(_tf("data/input_" + basename + "_0.npy"));
+    Mat weights = blobFromNPY(_tf("data/input_" + basename + "_1.npy"));
+    Mat ref = blobFromNPY(_tf("data/output_" + basename + ".npy"));
+
+    net.setInput(input, "0");
+    net.setInput(weights, "1");
+
+    Mat out = net.forward();
+    normAssert(ref, out, "", default_l1, default_lInf);
+}
+
+TEST_P(Test_ONNX_layers, Conv1d_variable_weight_bias)
+{
+    String basename = "conv1d_variable_wb";
+    Net net = readNetFromONNX(_tf("models/" + basename + ".onnx"));
+    ASSERT_FALSE(net.empty());
+
+    net.setPreferableBackend(backend);
+    net.setPreferableTarget(target);
+
+    Mat input = blobFromNPY(_tf("data/input_" + basename + "_0.npy"));
+    Mat weights = blobFromNPY(_tf("data/input_" + basename + "_1.npy"));
+    Mat bias = blobFromNPY(_tf("data/input_" + basename + "_2.npy"));
+    Mat ref = blobFromNPY(_tf("data/output_" + basename + ".npy"));
+
+    net.setInput(input, "0");
+    net.setInput(weights, "1");
+    net.setInput(bias, "bias");
+
+    Mat out = net.forward();
+    normAssert(ref, out, "", default_l1, default_lInf);
+}
+
 INSTANTIATE_TEST_CASE_P(/*nothing*/, Test_ONNX_layers, dnnBackendsAndTargets());
 
 class Test_ONNX_nets : public Test_ONNX_layers
diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp
index 68b720a..e9c1562 100644
--- a/modules/dnn/test/test_tf_importer.cpp
+++ b/modules/dnn/test/test_tf_importer.cpp
@@ -173,8 +173,6 @@ TEST_P(Test_TensorFlow_layers, Convolution3D)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NN_BUILDER);  // Only CPU on DLIE backend is supported
     if (backend == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH && target != DNN_TARGET_CPU)
         applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_NGRAPH);  // Only CPU on DLIE backend is supported
-    if (target != DNN_TARGET_CPU)
-        throw SkipTestException("Only CPU is supported");
     runTensorFlowNet("conv3d");
 }
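The two variable-weight tests above encode the general pattern for convolutions whose filters are produced at runtime: the weights (and bias) arrive as extra network inputs rather than baked-in blobs. In user code the pattern looks like this (a sketch; the input names "0" and "1" match the test models above and depend on the exported graph):

    #include <opencv2/dnn.hpp>
    using namespace cv;
    using namespace cv::dnn;

    // Weights computed at runtime are fed as a regular network input,
    // not stored in the layer; the model file here is the one used by the
    // Conv1d_variable_weight test (from the opencv_extra test data).
    void runVariableWeightConv(const Mat& data, const Mat& filters)
    {
        Net net = readNetFromONNX("conv1d_variable_w.onnx");
        net.setPreferableBackend(DNN_BACKEND_OPENCV);
        net.setPreferableTarget(DNN_TARGET_CPU);
        net.setInput(data, "0");       // N x C x W activations
        net.setInput(filters, "1");    // outCN x (inCN/groups) x K filters
        Mat out = net.forward();
        CV_Assert(!out.empty());
    }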