Merge branch '3.4' into merge-3.4
author     Alexander Alekhin <alexander.a.alekhin@gmail.com>
           Sat, 22 Dec 2018 05:40:15 +0000 (05:40 +0000)
committer  Alexander Alekhin <alexander.a.alekhin@gmail.com>
           Sat, 22 Dec 2018 05:40:15 +0000 (05:40 +0000)
12 files changed:
modules/dnn/include/opencv2/dnn/dnn.hpp
modules/dnn/include/opencv2/dnn/version.hpp
modules/dnn/src/onnx/onnx_importer.cpp
modules/dnn/src/op_inf_engine.hpp
modules/dnn/src/torch/torch_importer.cpp
modules/dnn/test/test_backends.cpp
modules/imgproc/src/filter.cpp
modules/imgproc/src/median_blur.cpp
modules/imgproc/test/test_filter.cpp
modules/objdetect/src/qrcode.cpp
modules/videoio/src/cap_v4l.cpp
modules/videoio/test/test_camera.cpp

  #include <vector>
  #include <opencv2/core.hpp>
  
 -#if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_EXPERIMENTAL_NS
 -#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v11 {
 -#define CV__DNN_EXPERIMENTAL_NS_END }
 -namespace cv { namespace dnn { namespace experimental_dnn_34_v11 { } using namespace experimental_dnn_34_v11; }}
 -#else
 -#define CV__DNN_EXPERIMENTAL_NS_BEGIN
 -#define CV__DNN_EXPERIMENTAL_NS_END
 -#endif
 +#include "../dnn/version.hpp"
  
  #include <opencv2/dnn/dict.hpp>
  
  namespace cv {
  namespace dnn {
 -CV__DNN_EXPERIMENTAL_NS_BEGIN
 +CV__DNN_INLINE_NS_BEGIN
  //! @addtogroup dnn
  //! @{
  
@@@ -69,8 -76,7 +69,8 @@@
          DNN_BACKEND_DEFAULT,
          DNN_BACKEND_HALIDE,
          DNN_BACKEND_INFERENCE_ENGINE,
 -        DNN_BACKEND_OPENCV
 +        DNN_BACKEND_OPENCV,
 +        DNN_BACKEND_VKCOM
      };
  
      /**
@@@ -83,7 -89,6 +83,7 @@@
          DNN_TARGET_OPENCL,
          DNN_TARGET_OPENCL_FP16,
          DNN_TARGET_MYRIAD,
 +        DNN_TARGET_VULKAN,
          //! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
          DNN_TARGET_FPGA
      };
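
For orientation, a minimal sketch of how the new Vulkan backend and target could be requested from user code; the helper name preferVulkan and the already-loaded Net are assumptions, while Net::setPreferableBackend/setPreferableTarget are the existing dnn API:

    #include <opencv2/dnn.hpp>

    // Sketch: ask dnn to run the network on the Vulkan compute backend.
    // Assumes an OpenCV build with Vulkan support.
    void preferVulkan(cv::dnn::Net& net)
    {
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_VKCOM);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_VULKAN);
    }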
  
          virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs);
  
 +        virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs);
         /**
          * @brief Automatic Halide scheduling based on layer hyper-parameters.
          * @param[in] node Backend node with Halide functions.
       *  @brief Reads a network model stored in <a href="http://torch.ch">Torch7</a> framework's format.
      *  @param model    path to the file, dumped from Torch by using the torch.save() function.
      *  @param isBinary specifies whether the network was serialized in ASCII mode or binary.
+      *  @param evaluate specifies the testing phase of the network. If true, it is similar to the evaluate() method in Torch.
       *  @returns Net object.
       *
      *  @note ASCII mode of the Torch serializer is preferable, because binary mode extensively uses the `long` type of C language,
       *
       * Also some equivalents of these classes from cunn, cudnn, and fbcunn may be successfully imported.
       */
-      CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true);
+      CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true, bool evaluate = true);
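
A short usage sketch of the new evaluate parameter ("model.t7" is a placeholder path, not from this diff):

    #include <opencv2/dnn.hpp>

    // Default: import in test mode, as if evaluate() had been called in Torch.
    cv::dnn::Net netTest  = cv::dnn::readNetFromTorch("model.t7", /*isBinary=*/true, /*evaluate=*/true);
    // Pass false to keep modules that were serialized with train=true in their training configuration.
    cv::dnn::Net netTrain = cv::dnn::readNetFromTorch("model.t7", /*isBinary=*/true, /*evaluate=*/false);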
  
       /**
        * @brief Read deep learning network represented in one of the supported formats.
      CV_EXPORTS_W void resetMyriadDevice();
  
  //! @}
 -CV__DNN_EXPERIMENTAL_NS_END
 +CV__DNN_INLINE_NS_END
  }
  }
  
index 7d0f125,0000000..b41efda
mode 100644,000000..100644
--- /dev/null
+++ b/modules/dnn/include/opencv2/dnn/version.hpp
@@@ -1,21 -1,0 +1,21 @@@
- #define OPENCV_DNN_API_VERSION 20181205
 +// This file is part of OpenCV project.
 +// It is subject to the license terms in the LICENSE file found in the top-level directory
 +// of this distribution and at http://opencv.org/license.html.
 +
 +#ifndef OPENCV_DNN_VERSION_HPP
 +#define OPENCV_DNN_VERSION_HPP
 +
 +/// Use with major OpenCV version only.
++#define OPENCV_DNN_API_VERSION 20181221
 +
 +#if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS
 +#define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
 +#define CV__DNN_INLINE_NS_BEGIN namespace CV__DNN_INLINE_NS {
 +#define CV__DNN_INLINE_NS_END }
 +namespace cv { namespace dnn { namespace CV__DNN_INLINE_NS { } using namespace CV__DNN_INLINE_NS; }}
 +#else
 +#define CV__DNN_INLINE_NS_BEGIN
 +#define CV__DNN_INLINE_NS_END
 +#endif
 +
 +#endif  // OPENCV_DNN_VERSION_HPP
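
As a hedged illustration, downstream code could gate on the new API version macro at compile time (the threshold simply repeats the value defined above):

    #include <opencv2/dnn/version.hpp>

    #if OPENCV_DNN_API_VERSION >= 20181221
    // Code relying on the dnn4_v20181221 inline-namespace layout.
    #endif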
@@@ -28,7 -28,7 +28,7 @@@
  
  namespace cv {
  namespace dnn {
 -CV__DNN_EXPERIMENTAL_NS_BEGIN
 +CV__DNN_INLINE_NS_BEGIN
  
  
  class ONNXImporter
@@@ -420,31 -420,30 +420,30 @@@ void ONNXImporter::populateNet(Net dstN
          }
          else if (layer_type == "Sub")
          {
-             Mat blob = (-1.0f) * getBlob(node_proto, constBlobs, 1);
-             blob = blob.reshape(1, 1);
+             Mat blob = getBlob(node_proto, constBlobs, 1);
              if (blob.total() == 1) {
                  layerParams.type = "Power";
-                 layerParams.set("shift", blob.at<float>(0));
+                 layerParams.set("shift", -blob.at<float>(0));
              }
              else {
                  layerParams.type = "Scale";
                  layerParams.set("has_bias", true);
-                 layerParams.blobs.push_back(blob);
+                 layerParams.blobs.push_back(-1.0f * blob.reshape(1, 1));
              }
          }
          else if (layer_type == "Div")
          {
              Mat blob = getBlob(node_proto, constBlobs, 1);
              CV_Assert_N(blob.type() == CV_32F, blob.total());
-             divide(1.0, blob, blob);
              if (blob.total() == 1)
              {
-                 layerParams.set("scale", blob.at<float>(0));
+                 layerParams.set("scale", 1.0f / blob.at<float>(0));
                  layerParams.type = "Power";
              }
              else
              {
                  layerParams.type = "Scale";
+                 divide(1.0, blob, blob);
                  layerParams.blobs.push_back(blob);
                  layerParams.set("bias_term", false);
              }
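
A sketch of the equivalences the importer relies on above, assuming OpenCV's Power layer computes y = (shift + scale*x)^power and writing c for a scalar constant:

    // Sub by a scalar c:  y = x - c  ==  Power(scale = 1,   shift = -c, power = 1)
    // Div by a scalar c:  y = x / c  ==  Power(scale = 1/c, shift = 0,  power = 1)
    // Tensor constants instead go through a Scale layer whose blob is
    // -1 * blob (Sub) or 1 / blob (Div), as in the branches above.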
@@@ -760,7 -759,7 +759,7 @@@ Mat readTensorFromONNX(const String& pa
      return mat;
  }
  
 -CV__DNN_EXPERIMENTAL_NS_END
 +CV__DNN_INLINE_NS_END
  }} // namespace
  
  #endif
  #define INF_ENGINE_RELEASE_2018R2 2018020000
  #define INF_ENGINE_RELEASE_2018R3 2018030000
  #define INF_ENGINE_RELEASE_2018R4 2018040000
+ #define INF_ENGINE_RELEASE_2018R5 2018050000
  
  #ifndef INF_ENGINE_RELEASE
- #warning("IE version have not been provided via command-line. Using 2018R4 by default")
- #define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R4
+ #warning("IE version have not been provided via command-line. Using 2018R5 by default")
+ #define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R5
  #endif
  
  #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
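
For reference, the release constants are consumed through this comparison helper, e.g. a guard that only compiles against 2018R5 or newer (the block contents here are hypothetical):

    #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R4)
    // ... code that needs Inference Engine 2018R5+ APIs ...
    #endif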
@@@ -48,69 -49,69 +49,69 @@@ public
  
      InfEngineBackendNet(InferenceEngine::CNNNetwork& net);
  
 -    virtual void Release() noexcept CV_OVERRIDE;
 +    virtual void Release() CV_NOEXCEPT CV_OVERRIDE;
  
 -    void setPrecision(InferenceEngine::Precision p) noexcept;
 +    void setPrecision(InferenceEngine::Precision p) CV_NOEXCEPT;
  
 -    virtual InferenceEngine::Precision getPrecision() noexcept;
 +    virtual InferenceEngine::Precision getPrecision() CV_NOEXCEPT;
  
 -    virtual InferenceEngine::Precision getPrecision() const noexcept;
 +    virtual InferenceEngine::Precision getPrecision() const CV_NOEXCEPT;
  
 -    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) noexcept /*CV_OVERRIDE*/;
 +    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) const noexcept /*CV_OVERRIDE*/;
 +    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) const CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) noexcept /*CV_OVERRIDE*/;
 +    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) const noexcept /*CV_OVERRIDE*/;
 +    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) const CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) noexcept;
 +    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) CV_NOEXCEPT;
  
 -    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) const noexcept;
 +    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) const CV_NOEXCEPT;
  
 -    virtual InferenceEngine::StatusCode serialize(const std::string &xmlPath, const std::string &binPath, InferenceEngine::ResponseDesc* resp) const noexcept;
 +    virtual InferenceEngine::StatusCode serialize(const std::string &xmlPath, const std::string &binPath, InferenceEngine::ResponseDesc* resp) const CV_NOEXCEPT;
  
 -    virtual void getName(char *pName, size_t len) noexcept;
 +    virtual void getName(char *pName, size_t len) CV_NOEXCEPT;
  
 -    virtual void getName(char *pName, size_t len) const noexcept;
 +    virtual void getName(char *pName, size_t len) const CV_NOEXCEPT;
  
 -    virtual const std::string& getName() const noexcept;
 +    virtual const std::string& getName() const CV_NOEXCEPT;
  
 -    virtual size_t layerCount() noexcept;
 +    virtual size_t layerCount() CV_NOEXCEPT;
  
 -    virtual size_t layerCount() const noexcept;
 +    virtual size_t layerCount() const CV_NOEXCEPT;
  
 -    virtual InferenceEngine::DataPtr& getData(const char *dname) noexcept CV_OVERRIDE;
 +    virtual InferenceEngine::DataPtr& getData(const char *dname) CV_NOEXCEPT CV_OVERRIDE;
  
 -    virtual void addLayer(const InferenceEngine::CNNLayerPtr &layer) noexcept CV_OVERRIDE;
 +    virtual void addLayer(const InferenceEngine::CNNLayerPtr &layer) CV_NOEXCEPT CV_OVERRIDE;
  
      virtual InferenceEngine::StatusCode addOutput(const std::string &layerName,
                                                    size_t outputIndex = 0,
 -                                                  InferenceEngine::ResponseDesc *resp = nullptr) noexcept;
 +                                                  InferenceEngine::ResponseDesc *resp = nullptr) CV_NOEXCEPT;
  
      virtual InferenceEngine::StatusCode getLayerByName(const char *layerName,
                                                         InferenceEngine::CNNLayerPtr &out,
 -                                                       InferenceEngine::ResponseDesc *resp) noexcept;
 +                                                       InferenceEngine::ResponseDesc *resp) CV_NOEXCEPT;
  
      virtual InferenceEngine::StatusCode getLayerByName(const char *layerName,
                                                         InferenceEngine::CNNLayerPtr &out,
 -                                                       InferenceEngine::ResponseDesc *resp) const noexcept;
 +                                                       InferenceEngine::ResponseDesc *resp) const CV_NOEXCEPT;
  
 -    virtual void setTargetDevice(InferenceEngine::TargetDevice device) noexcept CV_OVERRIDE;
 +    virtual void setTargetDevice(InferenceEngine::TargetDevice device) CV_NOEXCEPT CV_OVERRIDE;
  
 -    virtual InferenceEngine::TargetDevice getTargetDevice() noexcept;
 +    virtual InferenceEngine::TargetDevice getTargetDevice() CV_NOEXCEPT;
  
 -    virtual InferenceEngine::TargetDevice getTargetDevice() const noexcept;
 +    virtual InferenceEngine::TargetDevice getTargetDevice() const CV_NOEXCEPT;
  
 -    virtual InferenceEngine::StatusCode setBatchSize(const size_t size) noexcept CV_OVERRIDE;
 +    virtual InferenceEngine::StatusCode setBatchSize(const size_t size) CV_NOEXCEPT CV_OVERRIDE;
  
 -    virtual InferenceEngine::StatusCode setBatchSize(size_t size, InferenceEngine::ResponseDesc* responseDesc) noexcept;
 +    virtual InferenceEngine::StatusCode setBatchSize(size_t size, InferenceEngine::ResponseDesc* responseDesc) CV_NOEXCEPT;
  
 -    virtual size_t getBatchSize() const noexcept CV_OVERRIDE;
 +    virtual size_t getBatchSize() const CV_NOEXCEPT CV_OVERRIDE;
  
  #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
 -    virtual InferenceEngine::StatusCode AddExtension(const InferenceEngine::IShapeInferExtensionPtr& extension, InferenceEngine::ResponseDesc* resp) noexcept;
 -    virtual InferenceEngine::StatusCode reshape(const InputShapes& inputShapes, InferenceEngine::ResponseDesc* resp) noexcept;
 +    virtual InferenceEngine::StatusCode AddExtension(const InferenceEngine::IShapeInferExtensionPtr& extension, InferenceEngine::ResponseDesc* resp) CV_NOEXCEPT;
 +    virtual InferenceEngine::StatusCode reshape(const InputShapes& inputShapes, InferenceEngine::ResponseDesc* resp) CV_NOEXCEPT;
  #endif
  
      void init(int targetId);
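
The noexcept specifiers above are replaced with a CV_NOEXCEPT macro whose definition is not part of this diff; a minimal sketch of how such a compatibility macro is commonly spelled, given only as an assumption for illustration:

    // Hypothetical shim: expands to noexcept only when C++11 is available.
    #ifndef CV_NOEXCEPT
    #  if defined(__cplusplus) && __cplusplus >= 201103L
    #    define CV_NOEXCEPT noexcept
    #  else
    #    define CV_NOEXCEPT
    #  endif
    #endif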
@@@ -51,7 -51,7 +51,7 @@@
  
  namespace cv {
  namespace dnn {
 -CV__DNN_EXPERIMENTAL_NS_BEGIN
 +CV__DNN_INLINE_NS_BEGIN
  
  using namespace TH;
  
@@@ -129,13 -129,15 +129,15 @@@ struct TorchImporte
      Module *rootModule;
      Module *curModule;
      int moduleCounter;
+     bool testPhase;
  
-     TorchImporter(String filename, bool isBinary)
+     TorchImporter(String filename, bool isBinary, bool evaluate)
      {
          CV_TRACE_FUNCTION();
  
          rootModule = curModule = NULL;
          moduleCounter = 0;
+         testPhase = evaluate;
  
          file = cv::Ptr<THFile>(THDiskFile_new(filename, "r", 0), THFile_free);
          CV_Assert(file && THFile_isOpened(file));
                      layerParams.blobs.push_back(tensorParams["bias"].second);
                  }
  
-                 if (nnName == "InstanceNormalization")
+                 bool trainPhase = scalarParams.get<bool>("train", false);
+                 if (nnName == "InstanceNormalization" || (trainPhase && !testPhase))
                  {
                      cv::Ptr<Module> mvnModule(new Module(nnName));
                      mvnModule->apiType = "MVN";
  
  Mat readTorchBlob(const String &filename, bool isBinary)
  {
-     TorchImporter importer(filename, isBinary);
+     TorchImporter importer(filename, isBinary, true);
      importer.readObject();
      CV_Assert(importer.tensors.size() == 1);
  
      return importer.tensors.begin()->second;
  }
  
- Net readNetFromTorch(const String &model, bool isBinary)
+ Net readNetFromTorch(const String &model, bool isBinary, bool evaluate)
  {
      CV_TRACE_FUNCTION();
  
-     TorchImporter importer(model, isBinary);
+     TorchImporter importer(model, isBinary, evaluate);
      Net net;
      importer.populateNet(net);
      return net;
  }
  
 -CV__DNN_EXPERIMENTAL_NS_END
 +CV__DNN_INLINE_NS_END
  }} // namespace
@@@ -226,9 -226,9 +226,9 @@@ TEST_P(DNNTestNetwork, OpenPose_pose_mp
  TEST_P(DNNTestNetwork, OpenFace)
  {
  #if defined(INF_ENGINE_RELEASE)
- #if INF_ENGINE_RELEASE < 2018030000
+ #if (INF_ENGINE_RELEASE < 2018030000 || INF_ENGINE_RELEASE == 2018050000)
      if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-         throw SkipTestException("Test is enabled starts from OpenVINO 2018R3");
+         throw SkipTestException("");
  #elif INF_ENGINE_RELEASE < 2018040000
      if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
          throw SkipTestException("Test is enabled starts from OpenVINO 2018R4");
@@@ -292,6 -292,6 +292,6 @@@ TEST_P(DNNTestNetwork, FastNeuralStyle_
      processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf);
  }
  
 -INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false));
 +INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false, true));
  
  }} // namespace
@@@ -44,6 -44,7 +44,7 @@@
  #include "opencv2/core/opencl/ocl_defs.hpp"
  #include "opencl_kernels_imgproc.hpp"
  #include "hal_replacement.hpp"
+ #include "opencv2/core/hal/intrin.hpp"
  #include "filter.hpp"
  
  
@@@ -477,7 -478,7 +478,7 @@@ struct FilterNoVe
  };
  
  
- #if CV_SSE2
+ #if CV_SIMD
  
  ///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////
  
@@@ -502,9 -503,6 +503,6 @@@ struct RowVec_8u32
  
      int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
          int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
          int* dst = (int*)_dst;
          const int* _kx = kernel.ptr<int>();
  
          if( smallValues )
          {
-             __m128i z = _mm_setzero_si128();
-             for( ; i <= width - 8; i += 8 )
+             for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
              {
                  const uchar* src = _src + i;
-                 __m128i s0 = z, s1 = z;
-                 for( k = 0; k < _ksize; k++, src += cn )
+                 v_int32 s0 = vx_setzero_s32();
+                 v_int32 s1 = vx_setzero_s32();
+                 v_int32 s2 = vx_setzero_s32();
+                 v_int32 s3 = vx_setzero_s32();
+                 k = 0;
+                 for (; k <= _ksize - 2; k += 2, src += 2 * cn)
+                 {
+                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                     v_uint8 x0, x1;
+                     v_zip(vx_load(src), vx_load(src + cn), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+                     s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+                     s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+                 }
+                 if (k < _ksize)
+                 {
+                     v_int32 f = vx_setall_s32(_kx[k]);
+                     v_uint16 x0, x1;
+                     v_expand(vx_load(src), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+                     s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+                     s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+                 }
+                 v_store(dst + i, s0);
+                 v_store(dst + i + v_int32::nlanes, s1);
+                 v_store(dst + i + 2*v_int32::nlanes, s2);
+                 v_store(dst + i + 3*v_int32::nlanes, s3);
+             }
+             if( i <= width - v_uint16::nlanes )
+             {
+                 const uchar* src = _src + i;
+                 v_int32 s0 = vx_setzero_s32();
+                 v_int32 s1 = vx_setzero_s32();
+                 k = 0;
+                 for( ; k <= _ksize - 2; k += 2, src += 2*cn )
                  {
-                     __m128i f = _mm_cvtsi32_si128(_kx[k]);
-                     f = _mm_shuffle_epi32(f, 0);
-                     __m128i x0 = _mm_loadl_epi64((const __m128i*)src);
-                     x0 = _mm_unpacklo_epi8(x0, z);
-                     __m128i x1 = _mm_unpackhi_epi16(x0, z);
-                     x0 = _mm_unpacklo_epi16(x0, z);
-                     x0 = _mm_madd_epi16(x0, f);
-                     x1 = _mm_madd_epi16(x1, f);
-                     s0 = _mm_add_epi32(s0, x0);
-                     s1 = _mm_add_epi32(s1, x1);
+                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                     v_uint16 x0, x1;
+                     v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
                  }
-                 _mm_store_si128((__m128i*)(dst + i), s0);
-                 _mm_store_si128((__m128i*)(dst + i + 4), s1);
+                 if( k < _ksize )
+                 {
+                     v_int32 f = vx_setall_s32(_kx[k]);
+                     v_uint32 x0, x1;
+                     v_expand(vx_load_expand(src), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
+                 }
+                 v_store(dst + i, s0);
+                 v_store(dst + i + v_int32::nlanes, s1);
+                 i += v_uint16::nlanes;
              }
-             if( i <= width - 4 )
+             if( i <= width - v_uint32::nlanes )
              {
+                 v_int32 d = vx_setzero_s32();
+                 k = 0;
                  const uchar* src = _src + i;
-                 __m128i s0 = z;
-                 for( k = 0; k < _ksize; k++, src += cn )
+                 for (; k <= _ksize - 2; k += 2, src += 2*cn)
                  {
-                     __m128i f = _mm_cvtsi32_si128(_kx[k]);
-                     f = _mm_shuffle_epi32(f, 0);
-                     __m128i x0 = _mm_cvtsi32_si128(*(const int*)src);
-                     x0 = _mm_unpacklo_epi8(x0, z);
-                     x0 = _mm_unpacklo_epi16(x0, z);
-                     x0 = _mm_madd_epi16(x0, f);
-                     s0 = _mm_add_epi32(s0, x0);
+                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                     v_uint32 x0, x1;
+                     v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1);
+                     d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f));
                  }
-                 _mm_store_si128((__m128i*)(dst + i), s0);
-                 i += 4;
+                 if (k < _ksize)
+                     d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k])));
+                 v_store(dst + i, d);
+                 i += v_uint32::nlanes;
              }
          }
          return i;
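
The SSE2 intrinsics above are rewritten with OpenCV's universal intrinsics (opencv2/core/hal/intrin.hpp), so a single source path covers whatever SIMD width the build enables. Below is a minimal standalone sketch of the load/expand/multiply/store pattern used throughout this file; scaleRow and the single-coefficient kernel are illustrative assumptions, not the actual filter code:

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD
    // Multiply 8-bit pixels by one 16-bit coefficient and store 32-bit results,
    // processing v_uint8::nlanes pixels per iteration, with a scalar tail loop.
    static void scaleRow(const unsigned char* src, int* dst, int width, short coeff)
    {
        cv::v_int16 k = cv::vx_setall_s16(coeff);
        int i = 0;
        for( ; i <= width - cv::v_uint8::nlanes; i += cv::v_uint8::nlanes )
        {
            cv::v_uint16 lo, hi;
            cv::v_expand(cv::vx_load(src + i), lo, hi);                 // 8u -> 16u
            cv::v_int32 s0, s1, s2, s3;
            cv::v_mul_expand(cv::v_reinterpret_as_s16(lo), k, s0, s1);  // 16s x 16s -> 32s
            cv::v_mul_expand(cv::v_reinterpret_as_s16(hi), k, s2, s3);
            cv::v_store(dst + i, s0);
            cv::v_store(dst + i +   cv::v_int32::nlanes, s1);
            cv::v_store(dst + i + 2*cv::v_int32::nlanes, s2);
            cv::v_store(dst + i + 3*cv::v_int32::nlanes, s3);
        }
        for( ; i < width; i++ )
            dst[i] = src[i] * coeff;
    }
    #endif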
@@@ -590,9 -617,6 +617,6 @@@ struct SymmRowSmallVec_8u32
  
      int operator()(const uchar* src, uchar* _dst, int width, int cn) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
          int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
          int* dst = (int*)_dst;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          src += (_ksize/2)*cn;
          width *= cn;
  
-         __m128i z = _mm_setzero_si128();
          if( symmetrical )
          {
              if( _ksize == 1 )
              if( _ksize == 3 )
              {
                  if( kx[0] == 2 && kx[1] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l));
+                         x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h));
+                         v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l)));
+                         v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l)));
+                         v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h)));
+                         v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h)));
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_uint16 x = vx_load_expand(src);
+                         x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)));
+                         v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x)));
+                         v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x)));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if( i <= width - v_uint32::nlanes )
                      {
-                         __m128i x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_si128((__m128i*)(src - cn));
-                         x1 = _mm_loadu_si128((__m128i*)src);
-                         x2 = _mm_loadu_si128((__m128i*)(src + cn));
-                         y0 = _mm_unpackhi_epi8(x0, z);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         y1 = _mm_unpackhi_epi8(x1, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         y2 = _mm_unpackhi_epi8(x2, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
-                         y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
+                         v_uint32 x = vx_load_expand_q(src);
+                         x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn);
+                         v_store(dst + i, v_reinterpret_as_s32(x));
+                         i += v_uint32::nlanes;
                      }
+                 }
                  else if( kx[0] == -2 && kx[1] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+                         x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+                         v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_uint16 x = vx_load_expand(src);
+                         x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if( i <= width - v_uint32::nlanes )
                      {
-                         __m128i x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_si128((__m128i*)(src - cn));
-                         x1 = _mm_loadu_si128((__m128i*)src);
-                         x2 = _mm_loadu_si128((__m128i*)(src + cn));
-                         y0 = _mm_unpackhi_epi8(x0, z);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         y1 = _mm_unpackhi_epi8(x1, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         y2 = _mm_unpackhi_epi8(x2, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
-                         y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                         v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+                         x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x);
+                         v_store(dst + i, x);
+                         i += v_uint32::nlanes;
                      }
+                 }
                  else
                  {
-                     __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
-                             k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
-                     k1 = _mm_packs_epi32(k1, k1);
-                     for( ; i <= width - 8; i += 8, src += 8 )
+                     v_int16 k0 = vx_setall_s16((short)kx[0]);
+                     v_int16 k1 = vx_setall_s16((short)kx[1]);
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         v_int32 dl, dh;
+                         v_int16 x0, x1;
+                         v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh);
+                         v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1);
+                         dl += v_dotprod(x0, k1);
+                         dh += v_dotprod(x1, k1);
+                         v_store(dst + i, dl);
+                         v_store(dst + i + v_int32::nlanes, dh);
+                         v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh);
+                         v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1);
+                         dl += v_dotprod(x0, k1);
+                         dh += v_dotprod(x1, k1);
+                         v_store(dst + i + 2*v_int32::nlanes, dl);
+                         v_store(dst + i + 3*v_int32::nlanes, dh);
+                     }
+                     if ( i <= width - v_uint16::nlanes )
+                     {
+                         v_int32 dl, dh;
+                         v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh);
+                         v_int16 x0, x1;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1);
+                         dl += v_dotprod(x0, k1);
+                         dh += v_dotprod(x1, k1);
+                         v_store(dst + i, dl);
+                         v_store(dst + i + v_int32::nlanes, dh);
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if ( i <= width - v_uint32::nlanes )
                      {
-                         __m128i x0 = _mm_loadl_epi64((__m128i*)(src - cn));
-                         __m128i x1 = _mm_loadl_epi64((__m128i*)src);
-                         __m128i x2 = _mm_loadl_epi64((__m128i*)(src + cn));
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         __m128i x3 = _mm_unpacklo_epi16(x0, x2);
-                         __m128i x4 = _mm_unpackhi_epi16(x0, x2);
-                         __m128i x5 = _mm_unpacklo_epi16(x1, z);
-                         __m128i x6 = _mm_unpackhi_epi16(x1, z);
-                         x3 = _mm_madd_epi16(x3, k1);
-                         x4 = _mm_madd_epi16(x4, k1);
-                         x5 = _mm_madd_epi16(x5, k0);
-                         x6 = _mm_madd_epi16(x6, k0);
-                         x3 = _mm_add_epi32(x3, x5);
-                         x4 = _mm_add_epi32(x4, x6);
-                         _mm_store_si128((__m128i*)(dst + i), x3);
-                         _mm_store_si128((__m128i*)(dst + i + 4), x4);
+                         v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1])));
+                         i += v_uint32::nlanes;
                      }
                  }
              }
              else if( _ksize == 5 )
              {
                  if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - 2*cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + 2*cn), x2l, x2h);
+                         x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+                         x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+                         v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+                     }
+                     if( i <= width - v_uint16::nlanes )
                      {
-                         __m128i x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
-                         x1 = _mm_loadu_si128((__m128i*)src);
-                         x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
-                         y0 = _mm_unpackhi_epi8(x0, z);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         y1 = _mm_unpackhi_epi8(x1, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         y2 = _mm_unpackhi_epi8(x2, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
-                         y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                         v_uint16 x = vx_load_expand(src);
+                         x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
                      }
+                     if( i <= width - v_uint32::nlanes )
+                     {
+                         v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+                         x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x);
+                         v_store(dst + i, x);
+                         i += v_uint32::nlanes;
+                     }
+                 }
                  else
                  {
-                     __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
-                             k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
-                             k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
-                     k1 = _mm_packs_epi32(k1, k1);
-                     k2 = _mm_packs_epi32(k2, k2);
+                     v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                     v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_int32 x0, x1, x2, x3;
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+                         v_int16 xl, xh;
+                         v_expand(vx_load(src), x0l, x0h);
+                         v_mul_expand(v_reinterpret_as_s16(x0l), k0, x0, x1);
+                         v_mul_expand(v_reinterpret_as_s16(x0h), k0, x2, x3);
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src + cn), x1l, x1h);
+                         v_expand(vx_load(src - 2*cn), x2l, x2h);
+                         v_expand(vx_load(src + 2*cn), x3l, x3h);
+                         v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh);
+                         x0 += v_dotprod(xl, k12);
+                         x1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh);
+                         x2 += v_dotprod(xl, k12);
+                         x3 += v_dotprod(xh, k12);
+                         v_store(dst + i, x0);
+                         v_store(dst + i + v_int32::nlanes, x1);
+                         v_store(dst + i + 2*v_int32::nlanes, x2);
+                         v_store(dst + i + 3*v_int32::nlanes, x3);
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_int32 x1, x2;
+                         v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2);
  
-                     for( ; i <= width - 8; i += 8, src += 8 )
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh);
+                         x1 += v_dotprod(xl, k12);
+                         x2 += v_dotprod(xh, k12);
+                         v_store(dst + i, x1);
+                         v_store(dst + i + v_int32::nlanes, x2);
+                         i += v_uint16::nlanes, src += v_uint16::nlanes;
+                     }
+                     if( i <= width - v_uint32::nlanes )
+                     {
+                         v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]),
+                                          v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]),
+                                                   v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2]))));
+                         i += v_uint32::nlanes;
+                     }
+                 }
+             }
+             else
+             {
+                 v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                 for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                 {
+                     v_uint8 v_src = vx_load(src);
+                     v_int32 s0, s1, s2, s3;
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+                     for (k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn)
+                     {
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src - j - cn);
+                         v_uint8 v_src2 = vx_load(src + j);
+                         v_uint8 v_src3 = vx_load(src + j + cn);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
+                         s2 += v_dotprod(xl, k12);
+                         s3 += v_dotprod(xh, k12);
+                     }
+                     if( k < _ksize / 2 + 1 )
+                     {
+                         v_int16 k1 = vx_setall_s16((short)(kx[k]));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src + j);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
+                         s0 += v_dotprod(xl, k1);
+                         s1 += v_dotprod(xh, k1);
+                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
+                         s2 += v_dotprod(xl, k1);
+                         s3 += v_dotprod(xh, k1);
+                     }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     v_store(dst + i + 2*v_int32::nlanes, s2);
+                     v_store(dst + i + 3*v_int32::nlanes, s3);
+                 }
+                 if( i <= width - v_uint16::nlanes )
+                 {
+                     v_int32 s0, s1;
+                     v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+                     for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn)
+                     {
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh);
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16)));
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                     }
+                     if ( k < _ksize / 2 + 1 )
                      {
-                         __m128i x0 = _mm_loadl_epi64((__m128i*)src);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         __m128i x1 = _mm_unpacklo_epi16(x0, z);
-                         __m128i x2 = _mm_unpackhi_epi16(x0, z);
-                         x1 = _mm_madd_epi16(x1, k0);
-                         x2 = _mm_madd_epi16(x2, k0);
-                         __m128i x3 = _mm_loadl_epi64((__m128i*)(src - cn));
-                         __m128i x4 = _mm_loadl_epi64((__m128i*)(src + cn));
-                         x3 = _mm_unpacklo_epi8(x3, z);
-                         x4 = _mm_unpacklo_epi8(x4, z);
-                         __m128i x5 = _mm_unpacklo_epi16(x3, x4);
-                         __m128i x6 = _mm_unpackhi_epi16(x3, x4);
-                         x5 = _mm_madd_epi16(x5, k1);
-                         x6 = _mm_madd_epi16(x6, k1);
-                         x1 = _mm_add_epi32(x1, x5);
-                         x2 = _mm_add_epi32(x2, x6);
-                         x3 = _mm_loadl_epi64((__m128i*)(src - cn*2));
-                         x4 = _mm_loadl_epi64((__m128i*)(src + cn*2));
-                         x3 = _mm_unpacklo_epi8(x3, z);
-                         x4 = _mm_unpacklo_epi8(x4, z);
-                         x5 = _mm_unpacklo_epi16(x3, x4);
-                         x6 = _mm_unpackhi_epi16(x3, x4);
-                         x5 = _mm_madd_epi16(x5, k2);
-                         x6 = _mm_madd_epi16(x6, k2);
-                         x1 = _mm_add_epi32(x1, x5);
-                         x2 = _mm_add_epi32(x2, x6);
-                         _mm_store_si128((__m128i*)(dst + i), x1);
-                         _mm_store_si128((__m128i*)(dst + i + 4), x2);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh);
+                         v_int16 k1 = vx_setall_s16((short)(kx[k]));
+                         s0 += v_dotprod(xl, k1);
+                         s1 += v_dotprod(xh, k1);
                      }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     i += v_uint16::nlanes; src += v_uint16::nlanes;
+                 }
+                 if( i <= width - v_uint32::nlanes )
+                 {
+                     v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                     for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn )
+                         s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0);
+                     v_store(dst + i, s0);
+                     i += v_uint32::nlanes;
                  }
              }
          }
              if( _ksize == 3 )
              {
                  if( kx[0] == 0 && kx[1] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l));
+                         v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h));
+                         v_store(dst + i, v_expand_low(dl));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+                         v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh));
+                         v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh));
+                     }
+                     if( i <= width - v_uint16::nlanes )
                      {
-                         __m128i x0, x1, y0;
-                         x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                         x1 = _mm_loadu_si128((__m128i*)(src - cn));
-                         y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
-                         x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                         v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn)));
+                         v_store(dst + i, v_expand_low(dl));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
                      }
+                     if (i <= width - v_uint32::nlanes)
+                     {
+                         v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)));
+                         i += v_uint32::nlanes;
+                     }
+                 }
                  else
                  {
-                     __m128i k0 = _mm_set_epi32(-kx[1], kx[1], -kx[1], kx[1]);
-                     k0 = _mm_packs_epi32(k0, k0);
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                     v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16)));
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh);
+                         v_store(dst + i, v_dotprod(xl, k0));
+                         v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+                         v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh);
+                         v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0));
+                         v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0));
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh);
+                         v_store(dst + i, v_dotprod(xl, k0));
+                         v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if (i <= width - v_uint32::nlanes)
                      {
-                         __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                         __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-                         __m128i x2 = _mm_unpacklo_epi8(x0, z);
-                         __m128i x3 = _mm_unpacklo_epi8(x1, z);
-                         __m128i x4 = _mm_unpackhi_epi8(x0, z);
-                         __m128i x5 = _mm_unpackhi_epi8(x1, z);
-                         __m128i x6 = _mm_unpacklo_epi16(x2, x3);
-                         __m128i x7 = _mm_unpacklo_epi16(x4, x5);
-                         __m128i x8 = _mm_unpackhi_epi16(x2, x3);
-                         __m128i x9 = _mm_unpackhi_epi16(x4, x5);
-                         x6 = _mm_madd_epi16(x6, k0);
-                         x7 = _mm_madd_epi16(x7, k0);
-                         x8 = _mm_madd_epi16(x8, k0);
-                         x9 = _mm_madd_epi16(x9, k0);
-                         _mm_store_si128((__m128i*)(dst + i), x6);
-                         _mm_store_si128((__m128i*)(dst + i + 4), x8);
-                         _mm_store_si128((__m128i*)(dst + i + 8), x7);
-                         _mm_store_si128((__m128i*)(dst + i + 12), x9);
+                         v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1])));
+                         i += v_uint32::nlanes;
                      }
                  }
              }
              else if( _ksize == 5 )
              {
-                 __m128i k0 = _mm_loadl_epi64((__m128i*)(kx + 1));
-                 k0 = _mm_unpacklo_epi64(k0, k0);
-                 k0 = _mm_packs_epi32(k0, k0);
-                 for( ; i <= width - 16; i += 16, src += 16 )
+                 v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+                 for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                 {
+                     v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+                     v_expand(vx_load(src - cn), x0l, x0h);
+                     v_expand(vx_load(src - 2*cn), x1l, x1h);
+                     v_expand(vx_load(src + cn), x2l, x2h);
+                     v_expand(vx_load(src + 2*cn), x3l, x3h);
+                     v_int16 x0, x1;
+                     v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1);
+                     v_store(dst + i, v_dotprod(x0, k0));
+                     v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+                     v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1);
+                     v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0));
+                     v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0));
+                 }
+                 if( i <= width - v_uint16::nlanes )
+                 {
+                     v_int16 x0, x1;
+                     v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))),
+                           v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1);
+                     v_store(dst + i, v_dotprod(x0, k0));
+                     v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+                     i += v_uint16::nlanes; src += v_uint16::nlanes;
+                 }
+                 if( i <= width - v_uint32::nlanes )
                  {
-                     __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                     __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-                     __m128i x2 = _mm_unpackhi_epi8(x0, z);
-                     __m128i x3 = _mm_unpackhi_epi8(x1, z);
-                     x0 = _mm_unpacklo_epi8(x0, z);
-                     x1 = _mm_unpacklo_epi8(x1, z);
-                     __m128i x5 = _mm_sub_epi16(x2, x3);
-                     __m128i x4 = _mm_sub_epi16(x0, x1);
-                     __m128i x6 = _mm_loadu_si128((__m128i*)(src + cn * 2));
-                     __m128i x7 = _mm_loadu_si128((__m128i*)(src - cn * 2));
-                     __m128i x8 = _mm_unpackhi_epi8(x6, z);
-                     __m128i x9 = _mm_unpackhi_epi8(x7, z);
-                     x6 = _mm_unpacklo_epi8(x6, z);
-                     x7 = _mm_unpacklo_epi8(x7, z);
-                     __m128i x11 = _mm_sub_epi16(x8, x9);
-                     __m128i x10 = _mm_sub_epi16(x6, x7);
-                     __m128i x13 = _mm_unpackhi_epi16(x5, x11);
-                     __m128i x12 = _mm_unpackhi_epi16(x4, x10);
-                     x5 = _mm_unpacklo_epi16(x5, x11);
-                     x4 = _mm_unpacklo_epi16(x4, x10);
-                     x5 = _mm_madd_epi16(x5, k0);
-                     x4 = _mm_madd_epi16(x4, k0);
-                     x13 = _mm_madd_epi16(x13, k0);
-                     x12 = _mm_madd_epi16(x12, k0);
-                     _mm_store_si128((__m128i*)(dst + i), x4);
-                     _mm_store_si128((__m128i*)(dst + i + 4), x12);
-                     _mm_store_si128((__m128i*)(dst + i + 8), x5);
-                     _mm_store_si128((__m128i*)(dst + i + 12), x13);
+                     v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]),
+                                              (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2])));
+                     i += v_uint32::nlanes;
                  }
              }
-         }
-         src -= (_ksize/2)*cn;
-         kx -= _ksize/2;
-         for( ; i <= width - 4; i += 4, src += 4 )
-         {
-             __m128i s0 = z;
-             for( k = j = 0; k < _ksize; k++, j += cn )
+             else
              {
-                 __m128i f = _mm_cvtsi32_si128(kx[k]);
-                 f = _mm_shuffle_epi32(f, 0);
-                 __m128i x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 x0 = _mm_unpacklo_epi16(x0, z);
-                 x0 = _mm_madd_epi16(x0, f);
-                 s0 = _mm_add_epi32(s0, x0);
+                 v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                 for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                 {
+                     v_uint8 v_src = vx_load(src);
+                     v_int32 s0, s1, s2, s3;
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+                     for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+                     {
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src - j - cn);
+                         v_uint8 v_src2 = vx_load(src + j);
+                         v_uint8 v_src3 = vx_load(src + j + cn);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh);
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh);
+                         s2 += v_dotprod(xl, k12);
+                         s3 += v_dotprod(xh, k12);
+                     }
+                     if( k < _ksize / 2 + 1 )
+                     {
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src + j);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh);
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh);
+                         s2 += v_dotprod(xl, k12);
+                         s3 += v_dotprod(xh, k12);
+                     }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     v_store(dst + i + 2*v_int32::nlanes, s2);
+                     v_store(dst + i + 3*v_int32::nlanes, s3);
+                 }
+                 if( i <= width - v_uint16::nlanes )
+                 {
+                     v_int32 s0, s1;
+                     v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+                     for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+                     {
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh);
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                     }
+                     if( k < _ksize / 2 + 1 )
+                     {
+                         v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh);
+                         s0 += v_dotprod(xl, k1);
+                         s1 += v_dotprod(xh, k1);
+                     }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     i += v_uint16::nlanes; src += v_uint16::nlanes;
+                 }
+                 if( i <= width - v_uint32::nlanes )
+                 {
+                     v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                     for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn)
+                         s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0);
+                     v_store(dst + i, s0);
+                     i += v_uint32::nlanes;
+                 }
              }
-             _mm_store_si128((__m128i*)(dst + i), s0);
          }
  
          return i;
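
The row filter above relies on a paired-tap trick: two 16-bit kernel coefficients are broadcast into every 32-bit lane, the pixel differences for the two taps are interleaved with v_zip, and v_dotprod then yields d0*kx[k] + d1*kx[k+1] per 32-bit output lane. Below is a minimal, self-contained sketch of the same pattern; it is illustrative only and not part of the patch — the tap values, buffer names and sizes are invented for the example.

// Illustrative sketch only (not part of the patch): the kx[k]/kx[k+1]
// pairing used by the row filter above. Taps and data are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    const int k1 = 3, k2 = 5;                       // two arbitrary 16-bit taps
    short a[v_int16::nlanes], b[v_int16::nlanes];   // per-tap pixel differences
    for (int i = 0; i < v_int16::nlanes; i++) { a[i] = (short)i; b[i] = (short)(i + 1); }

    // Broadcast the pair (k1, k2) into every 32-bit lane, viewed as 16-bit lanes.
    v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((k1 & 0xFFFF) | (k2 << 16)));

    // Interleave a and b so every adjacent 16-bit pair is (a_i, b_i) ...
    v_int16 xl, xh;
    v_zip(vx_load(a), vx_load(b), xl, xh);

    // ... then each 32-bit lane of v_dotprod is a_i*k1 + b_i*k2.
    int out[v_int32::nlanes];
    v_store(out, v_dotprod(xl, k12));
    printf("%d (expected %d)\n", out[0], a[0] * k1 + b[0] * k2);
#endif
    return 0;
}
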
@@@ -885,129 -1105,117 +1105,117 @@@ struct SymmColumnVec_32s8u
  
      int operator()(const uchar** _src, uchar* dst, int width) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
-         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+         int _ksize = kernel.rows + kernel.cols - 1;
+         int ksize2 = _ksize/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0, k;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          const int** src = (const int**)_src;
-         const __m128i *S, *S2;
-         __m128 d4 = _mm_set1_ps(delta);
  
+         v_float32 d4 = vx_setall_f32(delta);
          if( symmetrical )
          {
-             for( ; i <= width - 16; i += 16 )
-             {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128 s0, s1, s2, s3;
-                 __m128i x0, x1;
-                 S = (const __m128i*)(src[0] + i);
-                 s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
-                 s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                 s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
-                 s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
-                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
+             if (_ksize == 1)
+                 return 0;
+             v_float32 f0 = vx_setall_f32(ky[0]);
+             for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+             {
+                 const int* S = src[0] + i;
+                 v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+                 v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
+                 v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4);
+                 v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4);
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
-                     x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
-                     x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
+                     s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+                     s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3);
                  }
-                 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-                 x0 = _mm_packus_epi16(x0, x1);
-                 _mm_storeu_si128((__m128i*)(dst + i), x0);
+                 v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_uint16::nlanes )
              {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128i x0;
-                 __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+                 const int* S = src[0] + i;
+                 v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+                 v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
                  }
-                 x0 = _mm_cvtps_epi32(s0);
-                 x0 = _mm_packs_epi32(x0, x0);
-                 x0 = _mm_packus_epi16(x0, x0);
-                 *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+                 i += v_uint16::nlanes;
+             }
+ #if CV_SIMD_WIDTH > 16
+             while( i <= width - v_int32x4::nlanes )
+ #else
+             if( i <= width - v_int32x4::nlanes )
+ #endif
+             {
+                 v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
+                 for( k = 1; k <= ksize2; k++ )
+                     s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+                 v_int32x4 s32 = v_round(s0);
+                 v_int16x8 s16 = v_pack(s32, s32);
+                 *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+                 i += v_int32x4::nlanes;
              }
          }
          else
          {
-             for( ; i <= width - 16; i += 16 )
+             for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
              {
-                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                 __m128i x0, x1;
-                 for( k = 1; k <= ksize2; k++ )
+                 v_float32 s0 = d4;
+                 v_float32 s1 = d4;
+                 v_float32 s2 = d4;
+                 v_float32 s3 = d4;
+                 for ( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
-                     x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
-                     x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
+                     s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+                     s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3);
                  }
-                 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-                 x0 = _mm_packus_epi16(x0, x1);
-                 _mm_storeu_si128((__m128i*)(dst + i), x0);
+                 v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_uint16::nlanes )
              {
-                 __m128 f, s0 = d4;
-                 __m128i x0;
-                 for( k = 1; k <= ksize2; k++ )
+                 v_float32 s0 = d4;
+                 v_float32 s1 = d4;
+                 for ( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
                  }
-                 x0 = _mm_cvtps_epi32(s0);
-                 x0 = _mm_packs_epi32(x0, x0);
-                 x0 = _mm_packus_epi16(x0, x0);
-                 *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+                 i += v_uint16::nlanes;
+             }
+ #if CV_SIMD_WIDTH > 16
+             while( i <= width - v_int32x4::nlanes )
+ #else
+             if( i <= width - v_int32x4::nlanes )
+ #endif
+             {
+                 v_float32x4 s0 = v_setall_f32(delta);
+                 for (k = 1; k <= ksize2; k++)
+                     s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+                 v_int32x4 s32 = v_round(s0);
+                 v_int16x8 s16 = v_pack(s32, s32);
+                 *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+                 i += v_int32x4::nlanes;
              }
          }
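
The column filter above accumulates in float and narrows back to 8-bit with a round/pack/saturate chain (v_round -> v_pack -> v_pack_u). The standalone sketch below shows only that chain on arbitrary data; the buffer name and values are invented for illustration.

// Illustrative sketch only: the float -> uchar saturation chain used by the
// column filter above (v_round -> v_pack -> v_pack_u). Data is invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    float buf[4 * v_float32::nlanes];
    for (int i = 0; i < 4 * v_float32::nlanes; i++)
        buf[i] = 100.f * i - 50.f;                  // some values outside [0, 255]

    v_float32 s0 = vx_load(buf);
    v_float32 s1 = vx_load(buf +     v_float32::nlanes);
    v_float32 s2 = vx_load(buf + 2 * v_float32::nlanes);
    v_float32 s3 = vx_load(buf + 3 * v_float32::nlanes);

    uchar dst[v_uint8::nlanes];
    // round to int32, narrow to int16 (signed saturation), then to uint8
    // (unsigned saturation) -- the same store pattern as in the hunk above
    v_store(dst, v_pack_u(v_pack(v_round(s0), v_round(s1)),
                          v_pack(v_round(s2), v_round(s3))));
    printf("%d %d\n", dst[0], dst[v_uint8::nlanes - 1]);   // 0 and 255
#endif
    return 0;
}
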
  
@@@ -1033,9 -1241,6 +1241,6 @@@ struct SymmColumnSmallVec_32s16s
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
          int ksize2 = (kernel.rows + kernel.cols - 1)/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0;
          const int** src = (const int**)_src;
          const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
          short* dst = (short*)_dst;
-         __m128 df4 = _mm_set1_ps(delta);
-         __m128i d4 = _mm_cvtps_epi32(df4);
  
+         v_float32 df4 = vx_setall_f32(delta);
+         v_int32 d4 = v_round(df4);
          if( symmetrical )
          {
              if( ky[0] == 2 && ky[1] == 1 )
              {
-                 for( ; i <= width - 8; i += 8 )
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                 {
+                     v_int32 sl = vx_load(S1 + i);
+                     v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+                     v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh)));
+                 }
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128i s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_si128((__m128i*)(S0 + i));
-                     s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                     s2 = _mm_load_si128((__m128i*)(S1 + i));
-                     s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
-                     s4 = _mm_load_si128((__m128i*)(S2 + i));
-                     s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                     s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
-                     s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
-                     s0 = _mm_add_epi32(s0, d4);
-                     s1 = _mm_add_epi32(s1, d4);
-                     _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                     v_int32 s = vx_load(S1 + i);
+                     v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s));
+                     i += v_int32::nlanes;
                  }
              }
              else if( ky[0] == -2 && ky[1] == 1 )
              {
-                 for( ; i <= width - 8; i += 8 )
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                 {
+                     v_int32 sl = vx_load(S1 + i);
+                     v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+                     v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh)));
+                 }
+                 if( i <= width - v_int32::nlanes )
+                 {
+                     v_int32 s = vx_load(S1 + i);
+                     v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s));
+                     i += v_int32::nlanes;
+                 }
+             }
+             else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) )
+             {
+                 v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
+                                             v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128i s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_si128((__m128i*)(S0 + i));
-                     s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                     s2 = _mm_load_si128((__m128i*)(S1 + i));
-                     s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
-                     s4 = _mm_load_si128((__m128i*)(S2 + i));
-                     s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                     s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
-                     s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
-                     s0 = _mm_add_epi32(s0, d4);
-                     s1 = _mm_add_epi32(s1, d4);
-                     _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                     v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
+                     i += v_int32::nlanes;
                  }
              }
              else
              {
-                 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
+                 v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
+                                             v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128 s0, s1;
-                     s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
-                     s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
-                     s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
-                     s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
-                     __m128i x0, x1;
-                     x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
-                                        _mm_load_si128((__m128i*)(S2 + i)));
-                     x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
-                                        _mm_load_si128((__m128i*)(S2 + i + 4)));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
-                     x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                     _mm_storeu_si128((__m128i*)(dst + i), x0);
+                     v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
+                     i += v_int32::nlanes;
                  }
              }
          }
              {
                  if( ky[1] < 0 )
                      std::swap(S0, S2);
-                 for( ; i <= width - 8; i += 8 )
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128i s0, s1, s2, s3;
-                     s0 = _mm_load_si128((__m128i*)(S2 + i));
-                     s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                     s2 = _mm_load_si128((__m128i*)(S0 + i));
-                     s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                     s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
-                     s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
-                     _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                     v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
+                     i += v_int32::nlanes;
                  }
              }
              else
              {
-                 __m128 k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
+                 v_float32 k1 = vx_setall_f32(ky[1]);
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
+                                             v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128 s0 = df4, s1 = df4;
-                     __m128i x0, x1;
-                     x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
-                                        _mm_load_si128((__m128i*)(S0 + i)));
-                     x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
-                                        _mm_load_si128((__m128i*)(S0 + i + 4)));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
-                     x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                     _mm_storeu_si128((__m128i*)(dst + i), x0);
+                     v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)));
+                     i += v_int32::nlanes;
                  }
              }
          }
  
  struct RowVec_16s32f
  {
-     RowVec_16s32f() { sse2_supported = false; }
+     RowVec_16s32f() {}
      RowVec_16s32f( const Mat& _kernel )
      {
          kernel = _kernel;
-         sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
      }
  
      int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
      {
-         if( !sse2_supported )
-             return 0;
          int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
          float* dst = (float*)_dst;
          const float* _kx = kernel.ptr<float>();
          width *= cn;
  
-         for( ; i <= width - 8; i += 8 )
+         for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
          {
              const short* src = (const short*)_src + i;
-             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+             v_float32 s0 = vx_setzero_f32();
+             v_float32 s1 = vx_setzero_f32();
              for( k = 0; k < _ksize; k++, src += cn )
              {
-                 f = _mm_load_ss(_kx+k);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128i x0i = _mm_loadu_si128((const __m128i*)src);
-                 __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
-                 x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
-                 x0 = _mm_cvtepi32_ps(x0i);
-                 x1 = _mm_cvtepi32_ps(x1i);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+                 v_int16 x = vx_load(src);
+                 s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0);
+                 s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1);
              }
-             _mm_store_ps(dst + i, s0);
-             _mm_store_ps(dst + i + 4, s1);
+             v_store(dst + i, s0);
+             v_store(dst + i + v_float32::nlanes, s1);
+         }
+         if( i <= width - v_float32::nlanes )
+         {
+             const short* src = (const short*)_src + i;
+             v_float32 s0 = vx_setzero_f32();
+             for( k = 0; k < _ksize; k++, src += cn )
+                 s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0);
+             v_store(dst + i, s0);
+             i += v_float32::nlanes;
          }
          return i;
      }
  
      Mat kernel;
-     bool sse2_supported;
  };
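
RowVec_16s32f above widens 16-bit samples with v_expand_low/v_expand_high, converts them to float with v_cvt_f32 and accumulates with v_muladd. A small illustrative sketch of that widening step, with an invented single tap and input ramp, might look like this:

// Illustrative sketch only: the int16 -> float32 widening used by
// RowVec_16s32f above. The tap value and data are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    short src[v_int16::nlanes];
    for (int i = 0; i < v_int16::nlanes; i++) src[i] = (short)(i * 10);

    const float k = 0.25f;                          // one arbitrary kernel tap
    v_int16 x = vx_load(src);
    v_float32 s0 = vx_setzero_f32(), s1 = vx_setzero_f32();
    // widen the low/high halves to int32, convert to float, accumulate with FMA
    s0 = v_muladd(v_cvt_f32(v_expand_low(x)),  vx_setall_f32(k), s0);
    s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(k), s1);

    float dst[v_int16::nlanes];
    v_store(dst, s0);
    v_store(dst + v_float32::nlanes, s1);
    printf("%g %g\n", dst[0], dst[v_int16::nlanes - 1]);
#endif
    return 0;
}
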
  
  
  struct SymmColumnVec_32f16s
  {
-     SymmColumnVec_32f16s() { symmetryType=0; delta = 0; sse2_supported = false; }
+     SymmColumnVec_32f16s() { symmetryType=0; delta = 0; }
      SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
      {
          symmetryType = _symmetryType;
          kernel = _kernel;
          delta = (float)_delta;
          CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-         sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
      }
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-         if( !sse2_supported )
-             return 0;
-         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+         int _ksize = kernel.rows + kernel.cols - 1;
+         int ksize2 = _ksize / 2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0, k;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          const float** src = (const float**)_src;
-         const float *S, *S2;
          short* dst = (short*)_dst;
-         __m128 d4 = _mm_set1_ps(delta);
  
+         v_float32 d4 = vx_setall_f32(delta);
          if( symmetrical )
          {
-             for( ; i <= width - 16; i += 16 )
+             if (_ksize == 1)
+                 return 0;
+             v_float32 k0 = vx_setall_f32(ky[0]);
+             for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
              {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128 s0, s1, s2, s3;
-                 __m128 x0, x1;
-                 S = src[0] + i;
-                 s0 = _mm_load_ps(S);
-                 s1 = _mm_load_ps(S+4);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                 s2 = _mm_load_ps(S+8);
-                 s3 = _mm_load_ps(S+12);
-                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
+                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
+                 v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                     v_float32 k1 = vx_setall_f32(ky[k]);
+                     s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
+                     s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                  }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 __m128i s1i = _mm_cvtps_epi32(s1);
-                 __m128i s2i = _mm_cvtps_epi32(s2);
-                 __m128i s3i = _mm_cvtps_epi32(s3);
-                 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
-                 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_float32::nlanes )
              {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                  for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+                     s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_pack_store(dst + i, v_round(s0));
+                 i += v_float32::nlanes;
              }
          }
          else
          {
-             for( ; i <= width - 16; i += 16 )
+             for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
              {
-                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                 __m128 x0, x1;
-                 S = src[0] + i;
+                 v_float32 s0 = d4;
+                 v_float32 s1 = d4;
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                     v_float32 k1 = vx_setall_f32(ky[k]);
+                     s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0);
+                     s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                  }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 __m128i s1i = _mm_cvtps_epi32(s1);
-                 __m128i s2i = _mm_cvtps_epi32(s2);
-                 __m128i s3i = _mm_cvtps_epi32(s3);
-                 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
-                 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_float32::nlanes )
              {
-                 __m128 f, x0, s0 = d4;
+                 v_float32 s0 = d4;
                  for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+                     s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_pack_store(dst + i, v_round(s0));
+                 i += v_float32::nlanes;
              }
          }
  
      int symmetryType;
      float delta;
      Mat kernel;
-     bool sse2_supported;
  };
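
SymmColumnVec_32f16s above narrows its float accumulators to 16-bit with v_pack after v_round, and uses the half-width v_pack_store for the tail of the row. A hedged, standalone illustration of those two stores (values are invented and chosen only to show the int16 saturation):

// Illustrative sketch only: the float -> short stores used by
// SymmColumnVec_32f16s above. Data is invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    float buf[2 * v_float32::nlanes];
    for (int i = 0; i < 2 * v_float32::nlanes; i++)
        buf[i] = 40000.f - 10000.f * i;             // first values exceed SHRT_MAX

    short full[v_int16::nlanes], tail[v_int32::nlanes];
    // full-width store: two rounded int32 vectors packed into one int16 vector
    v_store(full, v_pack(v_round(vx_load(buf)),
                         v_round(vx_load(buf + v_float32::nlanes))));
    // half-width tail store, as used at the end of the row
    v_pack_store(tail, v_round(vx_load(buf)));
    printf("%d %d\n", full[0], tail[0]);            // both saturate to 32767
#endif
    return 0;
}
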
  
  
@@@ -1357,7 -1479,6 +1479,6 @@@ struct RowVec_32f
  {
      RowVec_32f()
      {
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
  #if defined USE_IPP_SEP_FILTERS
          bufsz = -1;
      RowVec_32f( const Mat& _kernel )
      {
          kernel = _kernel;
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
  #if defined USE_IPP_SEP_FILTERS
          bufsz = -1;
          float* dst = (float*)_dst;
          const float* _kx = kernel.ptr<float>();
  
-         if( !haveSSE )
-             return 0;
          int i = 0, k;
          width *= cn;
  
          if (haveAVX2)
              return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
  #endif
-         for( ; i <= width - 8; i += 8 )
+         for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
          {
              const float* src = src0 + i;
-             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+             v_float32 s0 = vx_setzero_f32();
              for( k = 0; k < _ksize; k++, src += cn )
-             {
-                 f = _mm_set1_ps(_kx[k]);
-                 x0 = _mm_loadu_ps(src);
-                 x1 = _mm_loadu_ps(src + 4);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-             }
-             _mm_store_ps(dst + i, s0);
-             _mm_store_ps(dst + i + 4, s1);
+                 s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0);
+             v_store(dst + i, s0);
          }
          return i;
      }
  
      Mat kernel;
-     bool haveSSE;
      bool haveAVX2;
  #if defined USE_IPP_SEP_FILTERS
  private:
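
RowVec_32f above walks the row in steps of v_float32::nlanes, so the same source serves SSE, AVX2 and NEON register widths. As a rough sketch of that loop shape, here is a simplified single-channel row correlation with an explicit scalar tail; the function name, kernel and sizes are invented, and it omits the multi-channel and AVX2 paths of the real code.

// Illustrative sketch only: a width-agnostic single-channel row correlation
// in the same loop shape as RowVec_32f above. Names and sizes are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <vector>
#include <cstdio>

static void row_correlate(const float* src, float* dst, int width,
                          const float* kx, int ksize)
{
    int i = 0;
#if CV_SIMD
    using namespace cv;
    // main loop: one v_float32 register of outputs per iteration
    for (; i <= width - v_float32::nlanes; i += v_float32::nlanes)
    {
        v_float32 s = vx_setzero_f32();
        for (int k = 0; k < ksize; k++)
            s = v_muladd(vx_load(src + i + k), vx_setall_f32(kx[k]), s);
        v_store(dst + i, s);
    }
#endif
    for (; i < width; i++)                          // scalar tail
    {
        float s = 0.f;
        for (int k = 0; k < ksize; k++)
            s += src[i + k] * kx[k];
        dst[i] = s;
    }
}

int main()
{
    const int width = 37;
    const float kx[3] = { 0.25f, 0.5f, 0.25f };
    std::vector<float> src(width + 2, 1.f), dst(width);
    row_correlate(src.data(), dst.data(), width, kx, 3);
    printf("%g %g\n", dst[0], dst[width - 1]);      // a constant row stays 1
    return 0;
}
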
@@@ -1475,9 -1583,6 +1583,6 @@@ struct SymmRowSmallVec_32f
  
      int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE) )
-             return 0;
          int i = 0, _ksize = kernel.rows + kernel.cols - 1;
          float* dst = (float*)_dst;
          const float* src = (const float*)_src + (_ksize/2)*cn;
                  return 0;
              if( _ksize == 3 )
              {
-                 if( kx[0] == 2 && kx[1] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
-                         y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
-                 else if( kx[0] == -2 && kx[1] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
-                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                 if( fabs(kx[0]) == 2 && kx[1] == 1 )
+                 {
+                     v_float32 k0 = vx_setall_f32(kx[0]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn)));
+                 }
                  else
                  {
-                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
-                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
-                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
-                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1));
                  }
              }
              else if( _ksize == 5 )
              {
                  if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn*2);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn*2);
-                         y0 = _mm_loadu_ps(src - cn*2 + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn*2 + 4);
-                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
-                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                 {
+                     v_float32 k0 = vx_setall_f32(-2);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn)));
+                 }
                  else
                  {
-                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
-                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
-                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
-                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
-                         x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
-                         y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
-                         x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
-                         y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)));
                  }
              }
          }
              if( _ksize == 3 )
              {
                  if( kx[0] == 0 && kx[1] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x2, y0, y2;
-                         x0 = _mm_loadu_ps(src + cn);
-                         x2 = _mm_loadu_ps(src - cn);
-                         y0 = _mm_loadu_ps(src + cn + 4);
-                         y2 = _mm_loadu_ps(src - cn + 4);
-                         x0 = _mm_sub_ps(x0, x2);
-                         y0 = _mm_sub_ps(y0, y2);
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, vx_load(src + cn) - vx_load(src - cn));
                  else
                  {
-                     __m128 k1 = _mm_set1_ps(kx[1]);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x2, y0, y2;
-                         x0 = _mm_loadu_ps(src + cn);
-                         x2 = _mm_loadu_ps(src - cn);
-                         y0 = _mm_loadu_ps(src + cn + 4);
-                         y2 = _mm_loadu_ps(src - cn + 4);
-                         x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
-                         y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     v_float32 k1 = vx_setall_f32(kx[1]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1);
                  }
              }
              else if( _ksize == 5 )
              {
-                 __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
-                 for( ; i <= width - 8; i += 8, src += 8 )
-                 {
-                     __m128 x0, x2, y0, y2;
-                     x0 = _mm_loadu_ps(src + cn);
-                     x2 = _mm_loadu_ps(src - cn);
-                     y0 = _mm_loadu_ps(src + cn + 4);
-                     y2 = _mm_loadu_ps(src - cn + 4);
-                     x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
-                     y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
-                     x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
-                     y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
-                     x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
-                     y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-                     _mm_store_ps(dst + i, x0);
-                     _mm_store_ps(dst + i + 4, y0);
-                 }
+                 v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1));
              }
          }
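
SymmRowSmallVec_32f above special-cases small fixed kernels; in particular the antisymmetric kx = {0, 1} case reduces to a plain difference of the two neighbouring samples, with no multiplications at all. A standalone sketch of that case for one channel (border handling, buffer names and data are invented for the example):

// Illustrative sketch only: the antisymmetric [-1, 0, 1] special case of the
// row filter above, for a single channel. Data and sizes are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <vector>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    const int width = 32, cn = 1;
    std::vector<float> buf(width + 2);
    for (int i = 0; i < width + 2; i++) buf[i] = (float)(i * i);  // quadratic ramp
    const float* src = buf.data() + cn;             // skip the left border sample
    std::vector<float> dst(width);

    int i = 0;
    // kx = {0, 1}: dst[i] = src[i+cn] - src[i-cn]
    for (; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes)
        v_store(dst.data() + i, vx_load(src + cn) - vx_load(src - cn));
    for (; i < width; i++, src++)                   // scalar tail
        dst[i] = src[cn] - src[-cn];
    printf("%g %g\n", dst[1], dst[2]);              // central differences: 8, 12
#endif
    return 0;
}
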
  
@@@ -1661,7 -1659,6 +1659,6 @@@ struct SymmColumnVec_32f
  {
      SymmColumnVec_32f() {
          symmetryType=0;
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
          delta = 0;
      }
          symmetryType = _symmetryType;
          kernel = _kernel;
          delta = (float)_delta;
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
          CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
      }
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-         if( !haveSSE )
-             return 0;
          int ksize2 = (kernel.rows + kernel.cols - 1)/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0, k;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          const float** src = (const float**)_src;
-         const float *S, *S2;
          float* dst = (float*)_dst;
  
          if( symmetrical )
              if (haveAVX2)
                  return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
  #endif
-             const __m128 d4 = _mm_set1_ps(delta);
-             for( ; i <= width - 16; i += 16 )
-             {
-                 __m128 f = _mm_set1_ps(ky[0]);
-                 __m128 s0, s1, s2, s3;
-                 __m128 x0, x1;
-                 S = src[0] + i;
-                 s0 = _mm_load_ps(S);
-                 s1 = _mm_load_ps(S+4);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                 s2 = _mm_load_ps(S+8);
-                 s3 = _mm_load_ps(S+12);
-                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-                 for( k = 1; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_set1_ps(ky[k]);
-                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
-                 _mm_storeu_ps(dst + i + 4, s1);
-                 _mm_storeu_ps(dst + i + 8, s2);
-                 _mm_storeu_ps(dst + i + 12, s3);
-             }
-             for( ; i <= width - 4; i += 4 )
-             {
-                 __m128 f = _mm_set1_ps(ky[0]);
-                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_set1_ps(ky[k]);
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
-             }
-         }
-         else
-         {
- #if CV_TRY_AVX2
-             if (haveAVX2)
-                 return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
- #endif
-             const __m128 d4 = _mm_set1_ps(delta);
-             for( ; i <= width - 16; i += 16 )
-             {
-                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                 __m128 x0, x1;
-                 S = src[0] + i;
-                 for( k = 1; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_set1_ps(ky[k]);
-                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
-                 _mm_storeu_ps(dst + i + 4, s1);
-                 _mm_storeu_ps(dst + i + 8, s2);
-                 _mm_storeu_ps(dst + i + 12, s3);
-             }
-             for( ; i <= width - 4; i += 4 )
+             const v_float32 d4 = vx_setall_f32(delta);
+             for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
              {
-                 __m128 f, x0, s0 = d4;
+                 v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4);
                  for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_set1_ps(ky[k]);
-                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
+                     s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_store(dst + i, s0);
              }
          }
-         return i;
-     }
-     int symmetryType;
-     float delta;
-     Mat kernel;
-     bool haveSSE;
-     bool haveAVX2;
- };
- struct SymmColumnSmallVec_32f
- {
-     SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; }
-     SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
-     {
-         symmetryType = _symmetryType;
-         kernel = _kernel;
-         delta = (float)_delta;
-         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-     }
-     int operator()(const uchar** _src, uchar* _dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE) )
-             return 0;
-         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
-         const float* ky = kernel.ptr<float>() + ksize2;
-         int i = 0;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const float** src = (const float**)_src;
-         const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
-         float* dst = (float*)_dst;
-         __m128 d4 = _mm_set1_ps(delta);
-         if( symmetrical )
-         {
-             if( ky[0] == 2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_ps(S0 + i);
-                     s1 = _mm_load_ps(S0 + i + 4);
-                     s2 = _mm_load_ps(S1 + i);
-                     s3 = _mm_load_ps(S1 + i + 4);
-                     s4 = _mm_load_ps(S2 + i);
-                     s5 = _mm_load_ps(S2 + i + 4);
-                     s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
-                     s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
-                     s0 = _mm_add_ps(s0, d4);
-                     s1 = _mm_add_ps(s1, d4);
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-             else if( ky[0] == -2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_ps(S0 + i);
-                     s1 = _mm_load_ps(S0 + i + 4);
-                     s2 = _mm_load_ps(S1 + i);
-                     s3 = _mm_load_ps(S1 + i + 4);
-                     s4 = _mm_load_ps(S2 + i);
-                     s5 = _mm_load_ps(S2 + i + 4);
-                     s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
-                     s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
-                     s0 = _mm_add_ps(s0, d4);
-                     s1 = _mm_add_ps(s1, d4);
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-             else
-             {
-                 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, x0, x1;
-                     s0 = _mm_load_ps(S1 + i);
-                     s1 = _mm_load_ps(S1 + i + 4);
-                     s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
-                     s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
-                     x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
-                     x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-         }
-         else
-         {
-             if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
-             {
-                 if( ky[1] < 0 )
-                     std::swap(S0, S2);
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, s2, s3;
-                     s0 = _mm_load_ps(S2 + i);
-                     s1 = _mm_load_ps(S2 + i + 4);
-                     s2 = _mm_load_ps(S0 + i);
-                     s3 = _mm_load_ps(S0 + i + 4);
-                     s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
-                     s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-             else
-             {
-                 __m128 k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0 = d4, s1 = d4, x0, x1;
-                     x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
-                     x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-         }
-         return i;
-     }
-     int symmetryType;
-     float delta;
-     Mat kernel;
- };
- /////////////////////////////// non-separable filters ///////////////////////////////
- ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
- struct FilterVec_8u
- {
-     FilterVec_8u() { delta = 0; _nz = 0; }
-     FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
-     {
-         Mat kernel;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
-         std::vector<Point> coords;
-         preprocess2DKernel(kernel, coords, coeffs);
-         _nz = (int)coords.size();
-     }
-     int operator()(const uchar** src, uchar* dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
-         const float* kf = (const float*)&coeffs[0];
-         int i = 0, k, nz = _nz;
-         __m128 d4 = _mm_set1_ps(delta);
-         for( ; i <= width - 16; i += 16 )
-         {
-             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-             __m128i x0, x1, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0, t1;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
-                 x1 = _mm_unpackhi_epi8(x0, z);
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
-                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-             x0 = _mm_packus_epi16(x0, x1);
-             _mm_storeu_si128((__m128i*)(dst + i), x0);
-         }
-         for( ; i <= width - 4; i += 4 )
-         {
-             __m128 s0 = d4;
-             __m128i x0, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
-             x0 = _mm_packus_epi16(x0, x0);
-             *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
-         }
-         return i;
-     }
-     int _nz;
-     std::vector<uchar> coeffs;
-     float delta;
- };
- struct FilterVec_8u16s
- {
-     FilterVec_8u16s() { delta = 0; _nz = 0; }
-     FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
-     {
-         Mat kernel;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
-         std::vector<Point> coords;
-         preprocess2DKernel(kernel, coords, coeffs);
-         _nz = (int)coords.size();
-     }
-     int operator()(const uchar** src, uchar* _dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
-         const float* kf = (const float*)&coeffs[0];
-         short* dst = (short*)_dst;
-         int i = 0, k, nz = _nz;
-         __m128 d4 = _mm_set1_ps(delta);
-         for( ; i <= width - 16; i += 16 )
-         {
-             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-             __m128i x0, x1, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0, t1;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
-                 x1 = _mm_unpackhi_epi8(x0, z);
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
-                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-             _mm_storeu_si128((__m128i*)(dst + i), x0);
-             _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
-         }
-         for( ; i <= width - 4; i += 4 )
-         {
-             __m128 s0 = d4;
-             __m128i x0, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
-             _mm_storel_epi64((__m128i*)(dst + i), x0);
-         }
-         return i;
-     }
-     int _nz;
-     std::vector<uchar> coeffs;
-     float delta;
- };
- struct FilterVec_32f
- {
-     FilterVec_32f() { delta = 0; _nz = 0; }
-     FilterVec_32f(const Mat& _kernel, int, double _delta)
-     {
-         delta = (float)_delta;
-         std::vector<Point> coords;
-         preprocess2DKernel(_kernel, coords, coeffs);
-         _nz = (int)coords.size();
-     }
-     int operator()(const uchar** _src, uchar* _dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE) )
-             return 0;
-         const float* kf = (const float*)&coeffs[0];
-         const float** src = (const float**)_src;
-         float* dst = (float*)_dst;
-         int i = 0, k, nz = _nz;
-         __m128 d4 = _mm_set1_ps(delta);
-         for( ; i <= width - 16; i += 16 )
-         {
-             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0, t1;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 const float* S = src[k] + i;
-                 t0 = _mm_loadu_ps(S);
-                 t1 = _mm_loadu_ps(S + 4);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-                 t0 = _mm_loadu_ps(S + 8);
-                 t1 = _mm_loadu_ps(S + 12);
-                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
-             }
-             _mm_storeu_ps(dst + i, s0);
-             _mm_storeu_ps(dst + i + 4, s1);
-             _mm_storeu_ps(dst + i + 8, s2);
-             _mm_storeu_ps(dst + i + 12, s3);
-         }
-         for( ; i <= width - 4; i += 4 )
-         {
-             __m128 s0 = d4;
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 t0 = _mm_loadu_ps(src[k] + i);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-             }
-             _mm_storeu_ps(dst + i, s0);
-         }
-         return i;
-     }
-     int _nz;
-     std::vector<uchar> coeffs;
-     float delta;
- };
- #elif CV_NEON
- struct SymmRowSmallVec_8u32s
- {
-     SymmRowSmallVec_8u32s() { smallValues = false; }
-     SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
-     {
-         kernel = _kernel;
-         symmetryType = _symmetryType;
-         smallValues = true;
-         int k, ksize = kernel.rows + kernel.cols - 1;
-         for( k = 0; k < ksize; k++ )
-         {
-             int v = kernel.ptr<int>()[k];
-             if( v < SHRT_MIN || v > SHRT_MAX )
-             {
-                 smallValues = false;
-                 break;
-             }
-         }
-     }
-     int operator()(const uchar* src, uchar* _dst, int width, int cn) const
-     {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
-         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
-         int* dst = (int*)_dst;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const int* kx = kernel.ptr<int>() + _ksize/2;
-         if( !smallValues )
-             return 0;
-         src += (_ksize/2)*cn;
-         width *= cn;
-         if( symmetrical )
-         {
-             if( _ksize == 1 )
-                 return 0;
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 2 && kx[1] == 1 )
-                 {
-                     uint16x8_t zq = vdupq_n_u16(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1, x2;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src) );
-                         x2 = vld1_u8( (uint8_t *) (src + cn) );
-                         uint16x8_t y0, y1, y2;
-                         y0 = vaddl_u8(x0, x2);
-                         y1 = vshll_n_u8(x1, 1);
-                         y2 = vaddq_u16(y0, y1);
-                         uint16x8x2_t str;
-                         str.val[0] = y2; str.val[1] = zq;
-                         vst2q_u16( (uint16_t *) (dst + i), str );
-                     }
-                 }
-                 else if( kx[0] == -2 && kx[1] == 1 )
-                     return 0;
-                 else
-                 {
-                     int32x4_t k32 = vdupq_n_s32(0);
-                     k32 = vld1q_lane_s32(kx, k32, 0);
-                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                     int16x4_t k = vqmovn_s32(k32);
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1, x2;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src) );
-                         x2 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0, y1;
-                         int32x4_t y2, y3;
-                         y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
-                         y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
-                         y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
-                         y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
-                         y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
-                         y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);
-                         vst1q_s32((int32_t *)(dst + i), y2);
-                         vst1q_s32((int32_t *)(dst + i + 4), y3);
-                     }
-                 }
-             }
-             else if( _ksize == 5 )
-             {
-                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     return 0;
-                 else
-                 {
-                     int32x4_t k32 = vdupq_n_s32(0);
-                     k32 = vld1q_lane_s32(kx, k32, 0);
-                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                     k32 = vld1q_lane_s32(kx + 2, k32, 2);
-                     int16x4_t k = vqmovn_s32(k32);
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1, x2, x3, x4;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src) );
-                         x2 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0, y1;
-                         int32x4_t accl, acch;
-                         y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
-                         y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
-                         accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
-                         accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
-                         acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
-                         acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);
-                         int16x8_t y2;
-                         x3 = vld1_u8( (uint8_t *) (src - cn*2) );
-                         x4 = vld1_u8( (uint8_t *) (src + cn*2) );
-                         y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
-                         accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
-                         acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);
-                         vst1q_s32((int32_t *)(dst + i), accl);
-                         vst1q_s32((int32_t *)(dst + i + 4), acch);
-                     }
-                 }
-             }
-         }
-         else
-         {
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 0 && kx[1] == 1 )
-                 {
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0;
-                         y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                                 vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                         vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
-                         vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
-                     }
-                 }
-                 else
-                 {
-                     int32x4_t k32 = vdupq_n_s32(0);
-                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                     int16x4_t k = vqmovn_s32(k32);
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0;
-                         int32x4_t y1, y2;
-                         y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                             vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                         y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
-                         y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);
-                         vst1q_s32((int32_t *)(dst + i), y1);
-                         vst1q_s32((int32_t *)(dst + i + 4), y2);
-                     }
-                 }
-             }
-             else if( _ksize == 5 )
-             {
-                 int32x4_t k32 = vdupq_n_s32(0);
-                 k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                 k32 = vld1q_lane_s32(kx + 2, k32, 2);
-                 int16x4_t k = vqmovn_s32(k32);
-                 uint8x8_t z = vdup_n_u8(0);
-                 for( ; i <= width - 8; i += 8, src += 8 )
-                 {
-                     uint8x8_t x0, x1;
-                     x0 = vld1_u8( (uint8_t *) (src - cn) );
-                     x1 = vld1_u8( (uint8_t *) (src + cn) );
-                     int32x4_t accl, acch;
-                     int16x8_t y0;
-                     y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                         vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                     accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
-                     acch = vmull_lane_s16(vget_high_s16(y0), k, 1);
-                     uint8x8_t x2, x3;
-                     x2 = vld1_u8( (uint8_t *) (src - cn*2) );
-                     x3 = vld1_u8( (uint8_t *) (src + cn*2) );
-                     int16x8_t y1;
-                     y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
-                         vreinterpretq_s16_u16(vaddl_u8(x2, z)));
-                     accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
-                     acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);
-                     vst1q_s32((int32_t *)(dst + i), accl);
-                     vst1q_s32((int32_t *)(dst + i + 4), acch);
-                 }
-             }
-         }
-         return i;
-     }
-     Mat kernel;
-     int symmetryType;
-     bool smallValues;
- };
- struct SymmColumnVec_32s8u
- {
-     SymmColumnVec_32s8u() { symmetryType=0; }
-     SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
-     {
-         symmetryType = _symmetryType;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
-         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-     }
-     int operator()(const uchar** _src, uchar* dst, int width) const
-     {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
-         int _ksize = kernel.rows + kernel.cols - 1;
-         int ksize2 = _ksize / 2;
-         const float* ky = kernel.ptr<float>() + ksize2;
-         int i = 0, k;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const int** src = (const int**)_src;
-         const int *S, *S2;
-         float32x4_t d4 = vdupq_n_f32(delta);
-         if( symmetrical )
-         {
-             if( _ksize == 1 )
-                 return 0;
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky, k32, 0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
-             {
-                 float32x4_t accl, acch;
-                 float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;
-                 S = src[0] + i;
-                 f0l = vcvtq_f32_s32( vld1q_s32(S) );
-                 f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 f1l = vcvtq_f32_s32( vld1q_s32(S) );
-                 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                 f2l = vcvtq_f32_s32( vld1q_s32(S2) );
-                 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, f0l, k32, 0);
-                 acch = vmlaq_lane_f32(acch, f0h, k32, 0);
-                 accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t f3l, f3h, f4l, f4h;
-                     f3l = vcvtq_f32_s32( vld1q_s32(S) );
-                     f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                     f4l = vcvtq_f32_s32( vld1q_s32(S2) );
-                     f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                     accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 uint8x8_t u8;
-                 u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
-                 vst1_u8((uint8_t *)(dst + i), u8);
-             }
-         }
-         else
-         {
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
-             {
-                 float32x4_t accl, acch;
-                 float32x4_t f1l, f1h, f2l, f2h;
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 f1l = vcvtq_f32_s32( vld1q_s32(S) );
-                 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                 f2l = vcvtq_f32_s32( vld1q_s32(S2) );
-                 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t f3l, f3h, f4l, f4h;
-                     f3l = vcvtq_f32_s32( vld1q_s32(S) );
-                     f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                     f4l = vcvtq_f32_s32( vld1q_s32(S2) );
-                     f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                     accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 uint8x8_t u8;
-                 u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
-                 vst1_u8((uint8_t *)(dst + i), u8);
+         else
+         {
+ #if CV_TRY_AVX2
+             if (haveAVX2)
+                 return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
+ #endif
+             const v_float32 d4 = vx_setall_f32(delta);
+             for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+             {
+                 v_float32 s0 = d4;
+                 for( k = 1; k <= ksize2; k++ )
+                     s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_store(dst + i, s0);
              }
          }
  
      int symmetryType;
      float delta;
      Mat kernel;
+     bool haveAVX2;
  };
  
  
- struct SymmColumnSmallVec_32s16s
+ struct SymmColumnSmallVec_32f
  {
-     SymmColumnSmallVec_32s16s() { symmetryType=0; }
-     SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
+     SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; }
+     SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
      {
          symmetryType = _symmetryType;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
+         kernel = _kernel;
+         delta = (float)_delta;
          CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
      }
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
          int ksize2 = (kernel.rows + kernel.cols - 1)/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const int** src = (const int**)_src;
-         const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
-         short* dst = (short*)_dst;
-         float32x4_t df4 = vdupq_n_f32(delta);
-         int32x4_t d4 = vcvtq_s32_f32(df4);
+         const float** src = (const float**)_src;
+         const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
+         float* dst = (float*)_dst;
  
+         v_float32 d4 = vx_setall_f32(delta);
          if( symmetrical )
          {
-             if( ky[0] == 2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     int32x4_t y0, y1, y2, y3;
-                     y0 = vaddq_s32(x0, x2);
-                     y1 = vqshlq_n_s32(x1, 1);
-                     y2 = vaddq_s32(y0, y1);
-                     y3 = vaddq_s32(y2, d4);
-                     int16x4_t t;
-                     t = vqmovn_s32(y3);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
-             }
-             else if( ky[0] == -2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     int32x4_t y0, y1, y2, y3;
-                     y0 = vaddq_s32(x0, x2);
-                     y1 = vqshlq_n_s32(x1, 1);
-                     y2 = vsubq_s32(y0, y1);
-                     y3 = vaddq_s32(y2, d4);
-                     int16x4_t t;
-                     t = vqmovn_s32(y3);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
-             }
-             else if( ky[0] == 10 && ky[1] == 3 )
+             if( fabs(ky[0]) == 2 && ky[1] == 1 )
              {
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2, x3;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     x3 = vaddq_s32(x0, x2);
-                     int32x4_t y0;
-                     y0 = vmlaq_n_s32(d4, x1, 10);
-                     y0 = vmlaq_n_s32(y0, x3, 3);
-                     int16x4_t t;
-                     t = vqmovn_s32(y0);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
+                 v_float32 k0 = vx_setall_f32(ky[0]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4));
              }
              else
              {
-                 float32x2_t k32 = vdup_n_f32(0);
-                 k32 = vld1_lane_f32(ky, k32, 0);
-                 k32 = vld1_lane_f32(ky + 1, k32, 1);
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2, x3, x4;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     x3 = vaddq_s32(x0, x2);
-                     float32x4_t s0, s1, s2;
-                     s0 = vcvtq_f32_s32(x1);
-                     s1 = vcvtq_f32_s32(x3);
-                     s2 = vmlaq_lane_f32(df4, s0, k32, 0);
-                     s2 = vmlaq_lane_f32(s2, s1, k32, 1);
-                     x4 = vcvtq_s32_f32(s2);
-                     int16x4_t x5;
-                     x5 = vqmovn_s32(x4);
-                     vst1_s16((int16_t *)(dst + i), x5);
-                 }
+                 v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
              }
          }
          else
              {
                  if( ky[1] < 0 )
                      std::swap(S0, S2);
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S2 + i));
-                     int32x4_t y0, y1;
-                     y0 = vsubq_s32(x1, x0);
-                     y1 = vqaddq_s32(y0, d4);
-                     int16x4_t t;
-                     t = vqmovn_s32(y1);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
              }
              else
              {
-                 float32x2_t k32 = vdup_n_f32(0);
-                 k32 = vld1_lane_f32(ky + 1, k32, 1);
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2, x3;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S2 + i));
-                     x2 = vsubq_s32(x1, x0);
-                     float32x4_t s0, s1;
-                     s0 = vcvtq_f32_s32(x2);
-                     s1 = vmlaq_lane_f32(df4, s0, k32, 1);
-                     x3 = vcvtq_s32_f32(s1);
-                     int16x4_t x4;
-                     x4 = vqmovn_s32(x3);
-                     vst1_s16((int16_t *)(dst + i), x4);
-                 }
+                 v_float32 k1 = vx_setall_f32(ky[1]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4));
              }
          }
  
  };
  
  
- struct SymmColumnVec_32f16s
+ /////////////////////////////// non-separable filters ///////////////////////////////
+ ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
+ struct FilterVec_8u
  {
-     SymmColumnVec_32f16s() { symmetryType=0; }
-     SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
+     FilterVec_8u() { delta = 0; _nz = 0; }
+     FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
      {
-         symmetryType = _symmetryType;
-         kernel = _kernel;
-         delta = (float)_delta;
-         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-          neon_supported = checkHardwareSupport(CV_CPU_NEON);
+         Mat kernel;
+         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+         delta = (float)(_delta/(1 << _bits));
+         std::vector<Point> coords;
+         preprocess2DKernel(kernel, coords, coeffs);
+         _nz = (int)coords.size();
      }
  
-     int operator()(const uchar** _src, uchar* _dst, int width) const
+     int operator()(const uchar** src, uchar* dst, int width) const
      {
-          if( !neon_supported )
-              return 0;
-         int _ksize = kernel.rows + kernel.cols - 1;
-         int ksize2 = _ksize / 2;
-         const float* ky = kernel.ptr<float>() + ksize2;
-         int i = 0, k;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const float** src = (const float**)_src;
-         const float *S, *S2;
-         short* dst = (short*)_dst;
-         float32x4_t d4 = vdupq_n_f32(delta);
+         const float* kf = (const float*)&coeffs[0];
+         int i = 0, k, nz = _nz;
  
-         if( symmetrical )
+         v_float32 d4 = vx_setall_f32(delta);
+         for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
          {
-             if( _ksize == 1 )
-                 return 0;
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky, k32, 0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
+             v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
-                 float32x4_t accl, acch;
-                 S = src[0] + i;
-                 x0l = vld1q_f32(S);
-                 x0h = vld1q_f32(S + 4);
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 x1l = vld1q_f32(S);
-                 x1h = vld1q_f32(S + 4);
-                 x2l = vld1q_f32(S2);
-                 x2h = vld1q_f32(S2 + 4);
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, x0l, k32, 0);
-                 acch = vmlaq_lane_f32(acch, x0h, k32, 0);
-                 accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t x3l, x3h, x4l, x4h;
-                     x3l = vld1q_f32(S);
-                     x3h = vld1q_f32(S + 4);
-                     x4l = vld1q_f32(S2);
-                     x4h = vld1q_f32(S2 + 4);
-                     accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 vst1_s16((int16_t *)(dst + i), s16l);
-                 vst1_s16((int16_t *)(dst + i + 4), s16h);
-             }
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint16 xl, xh;
+                 v_expand(vx_load(src[k] + i), xl, xh);
+                 v_uint32 x0, x1, x2, x3;
+                 v_expand(xl, x0, x1);
+                 v_expand(xh, x2, x3);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
+                 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f, s2);
+                 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f, s3);
+             }
+             v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
          }
-         else
+         if( i <= width - v_uint16::nlanes )
          {
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
+             v_float32 s0 = d4, s1 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 float32x4_t x1l, x1h, x2l, x2h;
-                 float32x4_t accl, acch;
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 x1l = vld1q_f32(S);
-                 x1h = vld1q_f32(S + 4);
-                 x2l = vld1q_f32(S2);
-                 x2h = vld1q_f32(S2 + 4);
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t x3l, x3h, x4l, x4h;
-                     x3l = vld1q_f32(S);
-                     x3h = vld1q_f32(S + 4);
-                     x4l = vld1q_f32(S2);
-                     x4h = vld1q_f32(S2 + 4);
-                     accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 vst1_s16((int16_t *)(dst + i), s16l);
-                 vst1_s16((int16_t *)(dst + i + 4), s16h);
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint32 x0, x1;
+                 v_expand(vx_load_expand(src[k] + i), x0, x1);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
              }
+             v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+             i += v_uint16::nlanes;
+         }
+ #if CV_SIMD_WIDTH > 16
+         while( i <= width - v_int32x4::nlanes )
+ #else
+         if( i <= width - v_int32x4::nlanes )
+ #endif
+         {
+             v_float32x4 s0 = v_setall_f32(delta);
+             for( k = 0; k < nz; k++ )
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0);
+             v_int32x4 s32 = v_round(s0);
+             v_int16x8 s16 = v_pack(s32, s32);
+             *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+             i += v_int32x4::nlanes;
          }
  
          return i;
      }
  
-     int symmetryType;
+     int _nz;
+     std::vector<uchar> coeffs;
      float delta;
-     Mat kernel;
-     bool neon_supported;
  };
  
  
- struct SymmRowSmallVec_32f
+ struct FilterVec_8u16s
  {
-     SymmRowSmallVec_32f() {}
-     SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
+     FilterVec_8u16s() { delta = 0; _nz = 0; }
+     FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
      {
-         kernel = _kernel;
-         symmetryType = _symmetryType;
+         Mat kernel;
+         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+         delta = (float)(_delta/(1 << _bits));
+         std::vector<Point> coords;
+         preprocess2DKernel(kernel, coords, coeffs);
+         _nz = (int)coords.size();
      }
  
-     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
+     int operator()(const uchar** src, uchar* _dst, int width) const
      {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
-         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
-         float* dst = (float*)_dst;
-         const float* src = (const float*)_src + (_ksize/2)*cn;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const float* kx = kernel.ptr<float>() + _ksize/2;
-         width *= cn;
+         const float* kf = (const float*)&coeffs[0];
+         short* dst = (short*)_dst;
+         int i = 0, k, nz = _nz;
  
-         if( symmetrical )
+         v_float32 d4 = vx_setall_f32(delta);
+         for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
          {
-             if( _ksize == 1 )
-                 return 0;
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 2 && kx[1] == 1 )
-                     return 0;
-                 else if( kx[0] == -2 && kx[1] == 1 )
-                     return 0;
-                 else
-                 {
-                     return 0;
-                 }
-             }
-             else if( _ksize == 5 )
+             v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     return 0;
-                 else
-                 {
-                     float32x2_t k0, k1;
-                     k0 = k1 = vdup_n_f32(0);
-                     k0 = vld1_lane_f32(kx + 0, k0, 0);
-                     k0 = vld1_lane_f32(kx + 1, k0, 1);
-                     k1 = vld1_lane_f32(kx + 2, k1, 0);
-                     for( ; i <= width - 4; i += 4, src += 4 )
-                     {
-                         float32x4_t x0, x1, x2, x3, x4;
-                         x0 = vld1q_f32(src);
-                         x1 = vld1q_f32(src - cn);
-                         x2 = vld1q_f32(src + cn);
-                         x3 = vld1q_f32(src - cn*2);
-                         x4 = vld1q_f32(src + cn*2);
-                         float32x4_t y0;
-                         y0 = vmulq_lane_f32(x0, k0, 0);
-                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
-                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
-                         vst1q_f32(dst + i, y0);
-                     }
-                 }
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint16 xl, xh;
+                 v_expand(vx_load(src[k] + i), xl, xh);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1);
+                 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f, s2);
+                 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3);
              }
+             v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+             v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
          }
-         else
+         if( i <= width - v_uint16::nlanes )
          {
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 0 && kx[1] == 1 )
-                     return 0;
-                 else
-                 {
-                     return 0;
-                 }
-             }
-             else if( _ksize == 5 )
+             v_float32 s0 = d4, s1 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 float32x2_t k;
-                 k = vdup_n_f32(0);
-                 k = vld1_lane_f32(kx + 1, k, 0);
-                 k = vld1_lane_f32(kx + 2, k, 1);
-                 for( ; i <= width - 4; i += 4, src += 4 )
-                 {
-                     float32x4_t x0, x1, x2, x3;
-                     x0 = vld1q_f32(src - cn);
-                     x1 = vld1q_f32(src + cn);
-                     x2 = vld1q_f32(src - cn*2);
-                     x3 = vld1q_f32(src + cn*2);
-                     float32x4_t y0;
-                     y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
-                     y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
-                     vst1q_f32(dst + i, y0);
-                 }
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint16 x = vx_load_expand(src[k] + i);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1);
              }
+             v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+             i += v_uint16::nlanes;
+         }
+         if( i <= width - v_int32::nlanes )
+         {
+             v_float32 s0 = d4;
+             for( k = 0; k < nz; k++ )
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
+             v_pack_store(dst + i, v_round(s0));
+             i += v_int32::nlanes;
          }
  
          return i;
      }
  
-     Mat kernel;
-     int symmetryType;
+     int _nz;
+     std::vector<uchar> coeffs;
+     float delta;
  };
  
  
- typedef RowNoVec RowVec_8u32s;
- typedef RowNoVec RowVec_16s32f;
- typedef RowNoVec RowVec_32f;
- typedef ColumnNoVec SymmColumnVec_32f;
- typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
- typedef FilterNoVec FilterVec_8u;
- typedef FilterNoVec FilterVec_8u16s;
- typedef FilterNoVec FilterVec_32f;
+ struct FilterVec_32f
+ {
+     FilterVec_32f() { delta = 0; _nz = 0; }
+     FilterVec_32f(const Mat& _kernel, int, double _delta)
+     {
+         delta = (float)_delta;
+         std::vector<Point> coords;
+         preprocess2DKernel(_kernel, coords, coeffs);
+         _nz = (int)coords.size();
+     }
+     int operator()(const uchar** _src, uchar* _dst, int width) const
+     {
+         const float* kf = (const float*)&coeffs[0];
+         const float** src = (const float**)_src;
+         float* dst = (float*)_dst;
+         int i = 0, k, nz = _nz;
+         v_float32 d4 = vx_setall_f32(delta);
+         for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+         {
+             v_float32 s0 = d4;
+             for( k = 0; k < nz; k++ )
+                 s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0);
+             v_store(dst + i, s0);
+         }
+         return i;
+     }
  
+     int _nz;
+     std::vector<uchar> coeffs;
+     float delta;
+ };
  
  #else
  
@@@ -4557,7 -3503,7 +3503,7 @@@ static bool replacementFilter2D(int sty
      return success;
  }
  
 -#ifdef HAVE_IPP
 +#if 0 //defined HAVE_IPP
  static bool ippFilter2D(int stype, int dtype, int kernel_type,
                uchar * src_data, size_t src_step,
                uchar * dst_data, size_t dst_step,
@@@ -4655,15 -3601,9 +3601,9 @@@ static bool dftFilter2D(int stype, int 
                          double delta, int borderType)
  {
      {
- #if CV_SSE2
          int sdepth = CV_MAT_DEPTH(stype);
          int ddepth = CV_MAT_DEPTH(dtype);
-         int dft_filter_size = ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50;
- #else
-         CV_UNUSED(stype);
-         CV_UNUSED(dtype);
-         int dft_filter_size = 50;
- #endif
+         int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50;
          if (kernel_width * kernel_height < dft_filter_size)
              return false;
      }
@@@ -4821,7 -3761,7 +3761,7 @@@ void filter2D(int stype, int dtype, in
      if (res)
          return;
  
 -    CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type,
 +    /*CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type,
                                src_data, src_step,
                                dst_data, dst_step,
                                width, height,
                                kernel_data, kernel_step,
                                kernel_width, kernel_height,
                                anchor_x, anchor_y,
 -                              delta, borderType, isSubmatrix))
 +                              delta, borderType, isSubmatrix))*/
  
      res = dftFilter2D(stype, dtype, kernel_type,
                        src_data, src_step,
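
The filter.cpp hunks above replace hand-written SSE and NEON kernels with OpenCV's width-agnostic
universal intrinsics (v_float32, vx_load, v_muladd, v_store). A minimal sketch of that
load / multiply-accumulate / store pattern, assuming <opencv2/core/hal/intrin.hpp> and a
CV_SIMD-enabled build; the helper name weightedSum_32f is hypothetical and not part of the patch:

#include <opencv2/core/hal/intrin.hpp>

// Hypothetical helper: dst[i] = (a[i] + b[i]) * w + bias for i in [0, width).
static void weightedSum_32f(const float* a, const float* b, float w, float bias,
                            float* dst, int width)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    const v_float32 vw = vx_setall_f32(w), vb = vx_setall_f32(bias);
    // One iteration covers v_float32::nlanes elements (4 with SSE/NEON, 8 with AVX2),
    // so the same source serves every enabled instruction set.
    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
        v_store(dst + i, v_muladd(vx_load(a + i) + vx_load(b + i), vw, vb));
    vx_cleanup();
#endif
    for( ; i < width; i++ )   // scalar tail for the leftover elements
        dst[i] = (a[i] + b[i]) * w + bias;
}
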
@@@ -282,10 -282,10 +282,10 @@@ medianBlur_8u_O1( const Mat& _src, Mat
                          for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
                          {
  #if CV_SIMD256
-                             v_fine += v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                             v_fine = v_fine + v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
  #elif CV_SIMD128
-                             v_finel += v_load(px + 16 * MIN(luc[c][k], n - 1)    ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
-                             v_fineh += v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
+                             v_finel = v_finel + v_load(px + 16 * MIN(luc[c][k], n - 1)    ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                             v_fineh = v_fineh + v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
  #else
                              for (int ind = 0; ind < 16; ++ind)
                                  H[c].fine[k][ind] += px[16 * MIN(luc[c][k], n - 1) + ind] - px[16 * MAX(luc[c][k] - 2 * r - 1, 0) + ind];
                      CV_Assert( b < 16 );
                  }
              }
+         }
  #if CV_SIMD
-             vx_cleanup();
+         vx_cleanup();
  #endif
-         }
      }
  
  #undef HOP
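
The medianBlur_8u_O1 hunk above keeps three code paths for the 16-bin fine-histogram update:
256-bit universal intrinsics, two 128-bit halves, and a scalar fallback. A sketch of that
compile-time dispatch under the same assumptions (ushort bins, 16 bins per row); the helper
addFineRow below is illustrative only:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: acc[0..15] += px[0..15], picking the widest available path.
static void addFineRow(unsigned short* acc, const unsigned short* px)
{
    using namespace cv;
#if CV_SIMD256
    v_store(acc, v256_load(acc) + v256_load(px));            // one 16-lane ushort register
#elif CV_SIMD128
    v_store(acc,     v_load(acc)     + v_load(px));          // two 8-lane halves
    v_store(acc + 8, v_load(acc + 8) + v_load(px + 8));
#else
    for( int ind = 0; ind < 16; ++ind )                      // plain scalar fallback
        acc[ind] += px[ind];
#endif
}
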
@@@ -1080,7 -1080,7 +1080,7 @@@ static bool openvx_medianFilter(InputAr
  }
  #endif
  
 -#ifdef HAVE_IPP
 +#if 0 //defined HAVE_IPP
  static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize)
  {
      CV_INSTRUMENT_REGION_IPP();
@@@ -1179,7 -1179,7 +1179,7 @@@ void medianBlur( InputArray _src0, Outp
      CV_OVX_RUN(true,
                 openvx_medianFilter(_src0, _dst, ksize))
  
 -    CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize));
 +    //CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize));
  
  #ifdef HAVE_TEGRA_OPTIMIZATION
      if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
@@@ -50,7 -50,7 +50,7 @@@ public
  
  protected:
      int prepare_test_case( int test_case_idx );
 -    int read_params( CvFileStorage* fs );
 +    int read_params( const cv::FileStorage& fs );
      void get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types );
      void get_minmax_bounds( int i, int j, int type, Scalar& low, Scalar& high );
      Size aperture_size;
@@@ -76,13 -76,13 +76,13 @@@ CV_FilterBaseTest::CV_FilterBaseTest( b
  }
  
  
 -int CV_FilterBaseTest::read_params( CvFileStorage* fs )
 +int CV_FilterBaseTest::read_params( const cv::FileStorage& fs )
  {
      int code = cvtest::ArrayTest::read_params( fs );
      if( code < 0 )
          return code;
  
 -    max_aperture_size = cvReadInt( find_param( fs, "max_aperture_size" ), max_aperture_size );
 +    read( find_param( fs, "max_aperture_size" ), max_aperture_size, max_aperture_size );
      max_aperture_size = cvtest::clipInt( max_aperture_size, 1, 100 );
  
      return code;
@@@ -1265,7 -1265,7 +1265,7 @@@ public
      CV_FeatureSelBaseTest( int width_factor );
  
  protected:
 -    int read_params( CvFileStorage* fs );
 +    int read_params( const FileStorage& fs );
      void get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types );
      void get_minmax_bounds( int i, int j, int type, Scalar& low, Scalar& high );
      double get_success_error_level( int test_case_idx, int i, int j );
@@@ -1289,15 -1289,15 +1289,15 @@@ CV_FeatureSelBaseTest::CV_FeatureSelBas
  }
  
  
 -int CV_FeatureSelBaseTest::read_params( CvFileStorage* fs )
 +int CV_FeatureSelBaseTest::read_params( const cv::FileStorage& fs )
  {
      int code = cvtest::BaseTest::read_params( fs );
      if( code < 0 )
          return code;
  
 -    max_aperture_size = cvReadInt( find_param( fs, "max_aperture_size" ), max_aperture_size );
 +    read( find_param( fs, "max_aperture_size" ), max_aperture_size, max_aperture_size );
      max_aperture_size = cvtest::clipInt( max_aperture_size, 1, 9 );
 -    max_block_size = cvReadInt( find_param( fs, "max_block_size" ), max_block_size );
 +    read( find_param( fs, "max_block_size" ), max_block_size, max_block_size );
      max_block_size = cvtest::clipInt( max_aperture_size, 1, 100 );
  
      return code;
@@@ -2200,4 -2200,15 +2200,15 @@@ TEST(Imgproc_Filter2D, dftFilter2d_regr
  
      EXPECT_LE(cvtest::norm(dst, expected, NORM_INF), 2);
  }
+ TEST(Imgproc_MedianBlur, hires_regression_13409)
+ {
+     Mat src(2048, 2048, CV_8UC1), dst_hires, dst_ref;
+     randu(src, 0, 256);
+     medianBlur(src, dst_hires, 9);
+     medianBlur(src(Rect(512, 512, 1024, 1024)), dst_ref, 9);
+     ASSERT_EQ(0.0, cvtest::norm(dst_hires(Rect(516, 516, 1016, 1016)), dst_ref(Rect(4, 4, 1016, 1016)), NORM_INF));
+ }
  }} // namespace
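
The test_filter.cpp hunks above migrate read_params() from the legacy CvFileStorage* / cvReadInt
API to const cv::FileStorage&. A minimal sketch of reading an optional integer with the C++
persistence API (the tests themselves go through cvtest's find_param(); the parameter name and
helper below are placeholders):

#include <opencv2/core.hpp>

// Hypothetical helper: fetch an optional integer setting, keeping the default
// when the node is missing from the storage.
static int readIntParam(const cv::FileStorage& fs, const std::string& name, int defaultValue)
{
    int value = defaultValue;
    cv::read(fs[name], value, defaultValue);   // an empty FileNode leaves the default in place
    return value;
}

// usage: int max_aperture_size = readIntParam(fs, "max_aperture_size", 5);
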
@@@ -782,6 -782,9 +782,9 @@@ bool QRCodeDetector::detect(InputArray 
      Mat inarr = in.getMat();
      CV_Assert(!inarr.empty());
      CV_Assert(inarr.depth() == CV_8U);
+     if (inarr.cols <= 20 || inarr.rows <= 20)
+         return false;  // image data is insufficient for a reliable result
      int incn = inarr.channels();
      if( incn == 3 || incn == 4 )
      {
      return true;
  }
  
 -bool detectQRCode(InputArray in, vector<Point> &points, double eps_x, double eps_y)
 -{
 -    QRCodeDetector qrdetector;
 -    qrdetector.setEpsX(eps_x);
 -    qrdetector.setEpsY(eps_y);
 -
 -    return qrdetector.detect(in, points);
 -}
 -
  class QRDecode
  {
  public:
@@@ -1048,12 -1060,21 +1051,14 @@@ bool QRDecode::fullDecodingProcess(
  #endif
  }
  
 -bool decodeQRCode(InputArray in, InputArray points, std::string &decoded_info, OutputArray straight_qrcode)
 -{
 -    QRCodeDetector qrcode;
 -    decoded_info = qrcode.decode(in, points, straight_qrcode);
 -    return !decoded_info.empty();
 -}
 -
 -cv::String QRCodeDetector::decode(InputArray in, InputArray points,
 -                                  OutputArray straight_qrcode)
 +std::string QRCodeDetector::decode(InputArray in, InputArray points,
 +                                   OutputArray straight_qrcode)
  {
      Mat inarr = in.getMat();
      CV_Assert(!inarr.empty());
      CV_Assert(inarr.depth() == CV_8U);
+     if (inarr.cols <= 20 || inarr.rows <= 20)
+         return cv::String();  // image data is insufficient for a reliable result
  
      int incn = inarr.channels();
      if( incn == 3 || incn == 4 )
      return ok ? decoded_info : std::string();
  }
  
 -cv::String QRCodeDetector::detectAndDecode(InputArray in,
 -                                           OutputArray points_,
 -                                           OutputArray straight_qrcode)
 +std::string QRCodeDetector::detectAndDecode(InputArray in,
 +                                            OutputArray points_,
 +                                            OutputArray straight_qrcode)
  {
      Mat inarr = in.getMat();
      CV_Assert(!inarr.empty());
      CV_Assert(inarr.depth() == CV_8U);
+     if (inarr.cols <= 20 || inarr.rows <= 20)
+         return cv::String();  // image data is insufficient for a reliable result
  
      int incn = inarr.channels();
      if( incn == 3 || incn == 4 )
      return decoded_info;
  }
  
 -
  }
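
The qrcode.cpp changes above reject inputs of 20 pixels or less per side and switch the decode
methods to returning std::string. A hedged usage sketch of the public QRCodeDetector API with the
same size guard; the input file name is a placeholder:

#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    cv::Mat img = cv::imread("qr_sample.png", cv::IMREAD_GRAYSCALE);  // placeholder file
    if (img.empty() || img.cols <= 20 || img.rows <= 20)
        return 1;                        // too little data for a reliable result

    cv::QRCodeDetector detector;
    cv::Mat corners;                     // receives the 4 vertices of the QR quadrangle
    std::string info = detector.detectAndDecode(img, corners);
    if (!info.empty())
        std::cout << "decoded: " << info << std::endl;
    return 0;
}
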
@@@ -237,6 -237,11 +237,11 @@@ make & enjoy
  #include <sys/videoio.h>
  #endif
  
+ // https://github.com/opencv/opencv/issues/13335
+ #ifndef V4L2_CID_ISO_SENSITIVITY
+ #define V4L2_CID_ISO_SENSITIVITY (V4L2_CID_CAMERA_CLASS_BASE+23)
+ #endif
  /* Defaults - If your board can do better, set it here.  Set for the most common input types. */
  #define DEFAULT_V4L_WIDTH  640
  #define DEFAULT_V4L_HEIGHT 480
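
The fallback #define above keeps cap_v4l.cpp compiling against older kernel headers that lack V4L2_CID_ISO_SENSITIVITY (issue 13335). A minimal sketch of driving that control through the generic property API, assuming the V4L2 backend maps CAP_PROP_ISO_SPEED to this control id; whether the driver actually exposes the control is hardware-dependent, so the set() call may legitimately fail:

#include <opencv2/videoio.hpp>
#include <iostream>

int main()
{
    cv::VideoCapture cap(cv::CAP_V4L2);            // camera 0 via the V4L2 backend, as in the tests below
    if (!cap.isOpened())
        return 1;
    std::cout << "ISO property: " << cap.get(cv::CAP_PROP_ISO_SPEED) << std::endl;
    if (!cap.set(cv::CAP_PROP_ISO_SPEED, 400))     // value is in driver units; may be rejected
        std::cout << "ISO control not supported by this driver" << std::endl;
    return 0;
}
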
@@@ -786,7 -791,7 +791,7 @@@ bool CvCaptureCAM_V4L::open(const char
      frame_allocated = false;
      deviceName = _deviceName;
      returnFrame = true;
 -    normalizePropRange = utils::getConfigurationParameterBool("OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED", true);
 +    normalizePropRange = utils::getConfigurationParameterBool("OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED", false);
      channelNumber = -1;
      bufferIndex = -1;
  
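
With the default above flipped to false, V4L2 property get()/set() now works in raw driver units unless OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED is set. A minimal sketch of opting back into the old normalized [0, 1] behaviour, assuming the flag is read from the environment when the capture is opened (POSIX setenv, so Linux only):

#include <opencv2/videoio.hpp>
#include <cstdlib>
#include <iostream>

int main()
{
    // Must be set before the capture is opened; the flag is read in open().
    setenv("OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED", "1", 1);
    cv::VideoCapture cap(cv::CAP_V4L2);
    if (!cap.isOpened())
        return 1;
    // Expected to lie in [0, 1] with normalization enabled; otherwise it is
    // whatever raw range the driver advertises for the control.
    std::cout << "brightness: " << cap.get(cv::CAP_PROP_BRIGHTNESS) << std::endl;
    return 0;
}
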
@@@ -1757,7 -1762,7 +1762,7 @@@ bool CvCaptureCAM_V4L::icvSetFrameSize(
      if (_width > 0)
          width_set = _width;
  
-     if (height > 0)
+     if (_height > 0)
          height_set = _height;
  
      /* two subsequent calls setting WIDTH and HEIGHT will change
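
The hunk above fixes a copy-paste slip (height instead of _height) so a requested height is actually recorded; as the surrounding comment notes, the backend applies WIDTH and HEIGHT as a pair once both are known. A minimal caller-side sketch of that two-call pattern, mirroring the validate_V4L2_FrameSize test below; the helper name is illustrative:

#include <opencv2/videoio.hpp>

// Requests a new frame size and confirms the driver accepted it exactly.
static bool requestSize(cv::VideoCapture& cap, int width, int height)
{
    bool ok = cap.set(cv::CAP_PROP_FRAME_WIDTH, width);
    ok = cap.set(cv::CAP_PROP_FRAME_HEIGHT, height) && ok;
    // The driver may round to the nearest size it supports, so read the values back.
    return ok
        && (int)cap.get(cv::CAP_PROP_FRAME_WIDTH) == width
        && (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT) == height;
}
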
@@@ -11,7 -11,7 +11,7 @@@
  
  namespace opencv_test { namespace {
  
- static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100)
+ static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100, Mat* lastFrame = NULL)
  {
      Mat frame;
      int64 time0 = cv::getTickCount();
@@@ -26,6 -26,7 +26,7 @@@
      }
      int64 time1 = cv::getTickCount();
      printf("Processed %d frames at %.2f FPS\n", N, (N * cv::getTickFrequency()) / (time1 - time0 + 1));
+     if (lastFrame) *lastFrame = frame.clone();
  }
  
  TEST(DISABLED_VideoIO_Camera, basic)
@@@ -55,18 -56,39 +56,53 @@@ TEST(DISABLED_VideoIO_Camera, validate_
      capture.release();
  }
  
 +// The following test is for a capture device using PhysConn_Video_SerialDigital as the crossbar input pin
 +TEST(DISABLED_VideoIO_Camera, dshow_avermedia_capture)
 +{
 +    VideoCapture capture(0);
 +    ASSERT_TRUE(capture.isOpened());
 +    capture.set(CAP_PROP_CHANNEL, 6);
 +    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
 +    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
 +    std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
 +    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
 +    test_readFrames(capture);
 +    capture.release();
 +}
 +
+ TEST(DISABLED_VideoIO_Camera, validate_V4L2_FrameSize)
+ {
+     VideoCapture capture(CAP_V4L2);
+     ASSERT_TRUE(capture.isOpened());
+     std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+     std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+     std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+     std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+     int fourcc = (int)capture.get(CAP_PROP_FOURCC);
+     std::cout << "FOURCC code: " << cv::format("0x%08x", fourcc) << std::endl;
+     test_readFrames(capture, 30);
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_WIDTH, 640));
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_HEIGHT, 480));
+     std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+     std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+     std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+     Mat frame640x480;
+     test_readFrames(capture, 30, &frame640x480);
+     EXPECT_EQ(640, frame640x480.cols);
+     EXPECT_EQ(480, frame640x480.rows);
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_WIDTH, 1280));
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_HEIGHT, 720));
+     std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+     std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+     std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+     Mat frame1280x720;
+     test_readFrames(capture, 30, &frame1280x720);
+     EXPECT_EQ(1280, frame1280x720.cols);
+     EXPECT_EQ(720, frame1280x720.rows);
+     capture.release();
+ }
  }} // namespace