Merge branch '3.4' into merge-3.4
author     Alexander Alekhin <alexander.a.alekhin@gmail.com>
           Sat, 22 Dec 2018 05:40:15 +0000 (05:40 +0000)
committer  Alexander Alekhin <alexander.a.alekhin@gmail.com>
           Sat, 22 Dec 2018 05:40:15 +0000 (05:40 +0000)
12 files changed:
modules/dnn/include/opencv2/dnn/dnn.hpp
modules/dnn/include/opencv2/dnn/version.hpp
modules/dnn/src/onnx/onnx_importer.cpp
modules/dnn/src/op_inf_engine.hpp
modules/dnn/src/torch/torch_importer.cpp
modules/dnn/test/test_backends.cpp
modules/imgproc/src/filter.cpp
modules/imgproc/src/median_blur.cpp
modules/imgproc/test/test_filter.cpp
modules/objdetect/src/qrcode.cpp
modules/videoio/src/cap_v4l.cpp
modules/videoio/test/test_camera.cpp

  #include <vector>
  #include <opencv2/core.hpp>
  
 -#if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_EXPERIMENTAL_NS
 -#define CV__DNN_EXPERIMENTAL_NS_BEGIN namespace experimental_dnn_34_v11 {
 -#define CV__DNN_EXPERIMENTAL_NS_END }
 -namespace cv { namespace dnn { namespace experimental_dnn_34_v11 { } using namespace experimental_dnn_34_v11; }}
 -#else
 -#define CV__DNN_EXPERIMENTAL_NS_BEGIN
 -#define CV__DNN_EXPERIMENTAL_NS_END
 -#endif
 +#include "../dnn/version.hpp"
  
  #include <opencv2/dnn/dict.hpp>
  
  namespace cv {
  namespace dnn {
 -CV__DNN_EXPERIMENTAL_NS_BEGIN
 +CV__DNN_INLINE_NS_BEGIN
  //! @addtogroup dnn
  //! @{
  
@@@ -69,8 -76,7 +69,8 @@@
          DNN_BACKEND_DEFAULT,
          DNN_BACKEND_HALIDE,
          DNN_BACKEND_INFERENCE_ENGINE,
 -        DNN_BACKEND_OPENCV
 +        DNN_BACKEND_OPENCV,
 +        DNN_BACKEND_VKCOM
      };
  
      /**
@@@ -83,7 -89,6 +83,7 @@@
          DNN_TARGET_OPENCL,
          DNN_TARGET_OPENCL_FP16,
          DNN_TARGET_MYRIAD,
 +        DNN_TARGET_VULKAN,
          //! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
          DNN_TARGET_FPGA
      };
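
For orientation, a minimal sketch of how the new Vulkan backend and target could be requested from user code; the helper name preferVulkan and the already-loaded Net are assumptions, while Net::setPreferableBackend/setPreferableTarget are the existing dnn API:

    #include <opencv2/dnn.hpp>

    // Sketch: ask dnn to run the network on the Vulkan compute backend.
    // Assumes an OpenCV build with Vulkan support.
    void preferVulkan(cv::dnn::Net& net)
    {
        net.setPreferableBackend(cv::dnn::DNN_BACKEND_VKCOM);
        net.setPreferableTarget(cv::dnn::DNN_TARGET_VULKAN);
    }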
  
          virtual Ptr<BackendNode> initInfEngine(const std::vector<Ptr<BackendWrapper> > &inputs);
  
 +        virtual Ptr<BackendNode> initVkCom(const std::vector<Ptr<BackendWrapper> > &inputs);
         /**
          * @brief Automatic Halide scheduling based on layer hyper-parameters.
          * @param[in] node Backend node with Halide functions.
       *  @brief Reads a network model stored in <a href="http://torch.ch">Torch7</a> framework's format.
      *  @param model    path to the file, dumped from Torch by using the torch.save() function.
      *  @param isBinary specifies whether the network was serialized in ASCII mode or binary.
+      *  @param evaluate specifies the testing phase of the network. If true, it is similar to the evaluate() method in Torch.
       *  @returns Net object.
       *
      *  @note ASCII mode of the Torch serializer is preferable, because binary mode extensively uses the `long` type of C language,
       *
       * Also some equivalents of these classes from cunn, cudnn, and fbcunn may be successfully imported.
       */
-      CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true);
+      CV_EXPORTS_W Net readNetFromTorch(const String &model, bool isBinary = true, bool evaluate = true);
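
A short usage sketch of the new evaluate parameter ("model.t7" is a placeholder path, not from this diff):

    #include <opencv2/dnn.hpp>

    // Default: import in test mode, as if evaluate() had been called in Torch.
    cv::dnn::Net netTest  = cv::dnn::readNetFromTorch("model.t7", /*isBinary=*/true, /*evaluate=*/true);
    // Pass false to keep modules that were serialized with train=true in their training configuration.
    cv::dnn::Net netTrain = cv::dnn::readNetFromTorch("model.t7", /*isBinary=*/true, /*evaluate=*/false);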
  
       /**
        * @brief Read deep learning network represented in one of the supported formats.
      CV_EXPORTS_W void resetMyriadDevice();
  
  //! @}
 -CV__DNN_EXPERIMENTAL_NS_END
 +CV__DNN_INLINE_NS_END
  }
  }
  
index 7d0f125,0000000..b41efda
mode 100644,000000..100644
--- /dev/null
+++ b/modules/dnn/include/opencv2/dnn/version.hpp
@@@ -1,21 -1,0 +1,21 @@@
- #define OPENCV_DNN_API_VERSION 20181205
 +// This file is part of OpenCV project.
 +// It is subject to the license terms in the LICENSE file found in the top-level directory
 +// of this distribution and at http://opencv.org/license.html.
 +
 +#ifndef OPENCV_DNN_VERSION_HPP
 +#define OPENCV_DNN_VERSION_HPP
 +
 +/// Use with major OpenCV version only.
++#define OPENCV_DNN_API_VERSION 20181221
 +
 +#if !defined CV_DOXYGEN && !defined CV_DNN_DONT_ADD_INLINE_NS
 +#define CV__DNN_INLINE_NS __CV_CAT(dnn4_v, OPENCV_DNN_API_VERSION)
 +#define CV__DNN_INLINE_NS_BEGIN namespace CV__DNN_INLINE_NS {
 +#define CV__DNN_INLINE_NS_END }
 +namespace cv { namespace dnn { namespace CV__DNN_INLINE_NS { } using namespace CV__DNN_INLINE_NS; }}
 +#else
 +#define CV__DNN_INLINE_NS_BEGIN
 +#define CV__DNN_INLINE_NS_END
 +#endif
 +
 +#endif  // OPENCV_DNN_VERSION_HPP
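
As a hedged illustration, downstream code could gate on the new API version macro at compile time (the threshold simply repeats the value defined above):

    #include <opencv2/dnn/version.hpp>

    #if OPENCV_DNN_API_VERSION >= 20181221
    // Code relying on the dnn4_v20181221 inline-namespace layout.
    #endif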
@@@ -28,7 -28,7 +28,7 @@@
  
  namespace cv {
  namespace dnn {
 -CV__DNN_EXPERIMENTAL_NS_BEGIN
 +CV__DNN_INLINE_NS_BEGIN
  
  
  class ONNXImporter
@@@ -420,31 -420,30 +420,30 @@@ void ONNXImporter::populateNet(Net dstN
          }
          else if (layer_type == "Sub")
          {
-             Mat blob = (-1.0f) * getBlob(node_proto, constBlobs, 1);
-             blob = blob.reshape(1, 1);
+             Mat blob = getBlob(node_proto, constBlobs, 1);
              if (blob.total() == 1) {
                  layerParams.type = "Power";
-                 layerParams.set("shift", blob.at<float>(0));
+                 layerParams.set("shift", -blob.at<float>(0));
              }
              else {
                  layerParams.type = "Scale";
                  layerParams.set("has_bias", true);
-                 layerParams.blobs.push_back(blob);
+                 layerParams.blobs.push_back(-1.0f * blob.reshape(1, 1));
              }
          }
          else if (layer_type == "Div")
          {
              Mat blob = getBlob(node_proto, constBlobs, 1);
              CV_Assert_N(blob.type() == CV_32F, blob.total());
-             divide(1.0, blob, blob);
              if (blob.total() == 1)
              {
-                 layerParams.set("scale", blob.at<float>(0));
+                 layerParams.set("scale", 1.0f / blob.at<float>(0));
                  layerParams.type = "Power";
              }
              else
              {
                  layerParams.type = "Scale";
+                 divide(1.0, blob, blob);
                  layerParams.blobs.push_back(blob);
                  layerParams.set("bias_term", false);
              }
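
A sketch of the equivalences the importer relies on above, assuming OpenCV's Power layer computes y = (shift + scale*x)^power and writing c for a scalar constant:

    // Sub by a scalar c:  y = x - c  ==  Power(scale = 1,   shift = -c, power = 1)
    // Div by a scalar c:  y = x / c  ==  Power(scale = 1/c, shift = 0,  power = 1)
    // Tensor constants instead go through a Scale layer whose blob is
    // -1 * blob (Sub) or 1 / blob (Div), as in the branches above.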
@@@ -760,7 -759,7 +759,7 @@@ Mat readTensorFromONNX(const String& pa
      return mat;
  }
  
 -CV__DNN_EXPERIMENTAL_NS_END
 +CV__DNN_INLINE_NS_END
  }} // namespace
  
  #endif
  #define INF_ENGINE_RELEASE_2018R2 2018020000
  #define INF_ENGINE_RELEASE_2018R3 2018030000
  #define INF_ENGINE_RELEASE_2018R4 2018040000
+ #define INF_ENGINE_RELEASE_2018R5 2018050000
  
  #ifndef INF_ENGINE_RELEASE
- #warning("IE version have not been provided via command-line. Using 2018R4 by default")
- #define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R4
+ #warning("IE version have not been provided via command-line. Using 2018R5 by default")
+ #define INF_ENGINE_RELEASE INF_ENGINE_RELEASE_2018R5
  #endif
  
  #define INF_ENGINE_VER_MAJOR_GT(ver) (((INF_ENGINE_RELEASE) / 10000) > ((ver) / 10000))
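
For reference, the release constants are consumed through this comparison helper, e.g. a guard that only compiles against 2018R5 or newer (the block contents here are hypothetical):

    #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R4)
    // ... code that needs Inference Engine 2018R5+ APIs ...
    #endif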
@@@ -48,69 -49,69 +49,69 @@@ public
  
      InfEngineBackendNet(InferenceEngine::CNNNetwork& net);
  
 -    virtual void Release() noexcept CV_OVERRIDE;
 +    virtual void Release() CV_NOEXCEPT CV_OVERRIDE;
  
 -    void setPrecision(InferenceEngine::Precision p) noexcept;
 +    void setPrecision(InferenceEngine::Precision p) CV_NOEXCEPT;
  
 -    virtual InferenceEngine::Precision getPrecision() noexcept;
 +    virtual InferenceEngine::Precision getPrecision() CV_NOEXCEPT;
  
 -    virtual InferenceEngine::Precision getPrecision() const noexcept;
 +    virtual InferenceEngine::Precision getPrecision() const CV_NOEXCEPT;
  
 -    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) noexcept /*CV_OVERRIDE*/;
 +    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) const noexcept /*CV_OVERRIDE*/;
 +    virtual void getOutputsInfo(InferenceEngine::OutputsDataMap &out) const CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) noexcept /*CV_OVERRIDE*/;
 +    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) const noexcept /*CV_OVERRIDE*/;
 +    virtual void getInputsInfo(InferenceEngine::InputsDataMap &inputs) const CV_NOEXCEPT /*CV_OVERRIDE*/;
  
 -    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) noexcept;
 +    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) CV_NOEXCEPT;
  
 -    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) const noexcept;
 +    virtual InferenceEngine::InputInfo::Ptr getInput(const std::string &inputName) const CV_NOEXCEPT;
  
 -    virtual InferenceEngine::StatusCode serialize(const std::string &xmlPath, const std::string &binPath, InferenceEngine::ResponseDesc* resp) const noexcept;
 +    virtual InferenceEngine::StatusCode serialize(const std::string &xmlPath, const std::string &binPath, InferenceEngine::ResponseDesc* resp) const CV_NOEXCEPT;
  
 -    virtual void getName(char *pName, size_t len) noexcept;
 +    virtual void getName(char *pName, size_t len) CV_NOEXCEPT;
  
 -    virtual void getName(char *pName, size_t len) const noexcept;
 +    virtual void getName(char *pName, size_t len) const CV_NOEXCEPT;
  
 -    virtual const std::string& getName() const noexcept;
 +    virtual const std::string& getName() const CV_NOEXCEPT;
  
 -    virtual size_t layerCount() noexcept;
 +    virtual size_t layerCount() CV_NOEXCEPT;
  
 -    virtual size_t layerCount() const noexcept;
 +    virtual size_t layerCount() const CV_NOEXCEPT;
  
 -    virtual InferenceEngine::DataPtr& getData(const char *dname) noexcept CV_OVERRIDE;
 +    virtual InferenceEngine::DataPtr& getData(const char *dname) CV_NOEXCEPT CV_OVERRIDE;
  
 -    virtual void addLayer(const InferenceEngine::CNNLayerPtr &layer) noexcept CV_OVERRIDE;
 +    virtual void addLayer(const InferenceEngine::CNNLayerPtr &layer) CV_NOEXCEPT CV_OVERRIDE;
  
      virtual InferenceEngine::StatusCode addOutput(const std::string &layerName,
                                                    size_t outputIndex = 0,
 -                                                  InferenceEngine::ResponseDesc *resp = nullptr) noexcept;
 +                                                  InferenceEngine::ResponseDesc *resp = nullptr) CV_NOEXCEPT;
  
      virtual InferenceEngine::StatusCode getLayerByName(const char *layerName,
                                                         InferenceEngine::CNNLayerPtr &out,
 -                                                       InferenceEngine::ResponseDesc *resp) noexcept;
 +                                                       InferenceEngine::ResponseDesc *resp) CV_NOEXCEPT;
  
      virtual InferenceEngine::StatusCode getLayerByName(const char *layerName,
                                                         InferenceEngine::CNNLayerPtr &out,
 -                                                       InferenceEngine::ResponseDesc *resp) const noexcept;
 +                                                       InferenceEngine::ResponseDesc *resp) const CV_NOEXCEPT;
  
 -    virtual void setTargetDevice(InferenceEngine::TargetDevice device) noexcept CV_OVERRIDE;
 +    virtual void setTargetDevice(InferenceEngine::TargetDevice device) CV_NOEXCEPT CV_OVERRIDE;
  
 -    virtual InferenceEngine::TargetDevice getTargetDevice() noexcept;
 +    virtual InferenceEngine::TargetDevice getTargetDevice() CV_NOEXCEPT;
  
 -    virtual InferenceEngine::TargetDevice getTargetDevice() const noexcept;
 +    virtual InferenceEngine::TargetDevice getTargetDevice() const CV_NOEXCEPT;
  
 -    virtual InferenceEngine::StatusCode setBatchSize(const size_t size) noexcept CV_OVERRIDE;
 +    virtual InferenceEngine::StatusCode setBatchSize(const size_t size) CV_NOEXCEPT CV_OVERRIDE;
  
 -    virtual InferenceEngine::StatusCode setBatchSize(size_t size, InferenceEngine::ResponseDesc* responseDesc) noexcept;
 +    virtual InferenceEngine::StatusCode setBatchSize(size_t size, InferenceEngine::ResponseDesc* responseDesc) CV_NOEXCEPT;
  
 -    virtual size_t getBatchSize() const noexcept CV_OVERRIDE;
 +    virtual size_t getBatchSize() const CV_NOEXCEPT CV_OVERRIDE;
  
  #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R2)
 -    virtual InferenceEngine::StatusCode AddExtension(const InferenceEngine::IShapeInferExtensionPtr& extension, InferenceEngine::ResponseDesc* resp) noexcept;
 -    virtual InferenceEngine::StatusCode reshape(const InputShapes& inputShapes, InferenceEngine::ResponseDesc* resp) noexcept;
 +    virtual InferenceEngine::StatusCode AddExtension(const InferenceEngine::IShapeInferExtensionPtr& extension, InferenceEngine::ResponseDesc* resp) CV_NOEXCEPT;
 +    virtual InferenceEngine::StatusCode reshape(const InputShapes& inputShapes, InferenceEngine::ResponseDesc* resp) CV_NOEXCEPT;
  #endif
  
      void init(int targetId);
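
The noexcept specifiers above are replaced with a CV_NOEXCEPT macro whose definition is not part of this diff; a minimal sketch of how such a compatibility macro is commonly spelled, given only as an assumption for illustration:

    // Hypothetical shim: expands to noexcept only when C++11 is available.
    #ifndef CV_NOEXCEPT
    #  if defined(__cplusplus) && __cplusplus >= 201103L
    #    define CV_NOEXCEPT noexcept
    #  else
    #    define CV_NOEXCEPT
    #  endif
    #endif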
@@@ -51,7 -51,7 +51,7 @@@
  
  namespace cv {
  namespace dnn {
 -CV__DNN_EXPERIMENTAL_NS_BEGIN
 +CV__DNN_INLINE_NS_BEGIN
  
  using namespace TH;
  
@@@ -129,13 -129,15 +129,15 @@@ struct TorchImporte
      Module *rootModule;
      Module *curModule;
      int moduleCounter;
+     bool testPhase;
  
-     TorchImporter(String filename, bool isBinary)
+     TorchImporter(String filename, bool isBinary, bool evaluate)
      {
          CV_TRACE_FUNCTION();
  
          rootModule = curModule = NULL;
          moduleCounter = 0;
+         testPhase = evaluate;
  
          file = cv::Ptr<THFile>(THDiskFile_new(filename, "r", 0), THFile_free);
          CV_Assert(file && THFile_isOpened(file));
                      layerParams.blobs.push_back(tensorParams["bias"].second);
                  }
  
-                 if (nnName == "InstanceNormalization")
+                 bool trainPhase = scalarParams.get<bool>("train", false);
+                 if (nnName == "InstanceNormalization" || (trainPhase && !testPhase))
                  {
                      cv::Ptr<Module> mvnModule(new Module(nnName));
                      mvnModule->apiType = "MVN";
  
  Mat readTorchBlob(const String &filename, bool isBinary)
  {
-     TorchImporter importer(filename, isBinary);
+     TorchImporter importer(filename, isBinary, true);
      importer.readObject();
      CV_Assert(importer.tensors.size() == 1);
  
      return importer.tensors.begin()->second;
  }
  
- Net readNetFromTorch(const String &model, bool isBinary)
+ Net readNetFromTorch(const String &model, bool isBinary, bool evaluate)
  {
      CV_TRACE_FUNCTION();
  
-     TorchImporter importer(model, isBinary);
+     TorchImporter importer(model, isBinary, evaluate);
      Net net;
      importer.populateNet(net);
      return net;
  }
  
 -CV__DNN_EXPERIMENTAL_NS_END
 +CV__DNN_INLINE_NS_END
  }} // namespace
@@@ -226,9 -226,9 +226,9 @@@ TEST_P(DNNTestNetwork, OpenPose_pose_mp
  TEST_P(DNNTestNetwork, OpenFace)
  {
  #if defined(INF_ENGINE_RELEASE)
- #if INF_ENGINE_RELEASE < 2018030000
+ #if (INF_ENGINE_RELEASE < 2018030000 || INF_ENGINE_RELEASE == 2018050000)
      if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
-         throw SkipTestException("Test is enabled starts from OpenVINO 2018R3");
+         throw SkipTestException("");
  #elif INF_ENGINE_RELEASE < 2018040000
      if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_OPENCL_FP16)
          throw SkipTestException("Test is enabled starts from OpenVINO 2018R4");
@@@ -292,6 -292,6 +292,6 @@@ TEST_P(DNNTestNetwork, FastNeuralStyle_
      processNet("dnn/fast_neural_style_eccv16_starry_night.t7", "", inp, "", "", l1, lInf);
  }
  
 -INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false));
 +INSTANTIATE_TEST_CASE_P(/*nothing*/, DNNTestNetwork, dnnBackendsAndTargets(true, true, false, true));
  
  }} // namespace
@@@ -44,6 -44,7 +44,7 @@@
  #include "opencv2/core/opencl/ocl_defs.hpp"
  #include "opencl_kernels_imgproc.hpp"
  #include "hal_replacement.hpp"
+ #include "opencv2/core/hal/intrin.hpp"
  #include "filter.hpp"
  
  
@@@ -477,7 -478,7 +478,7 @@@ struct FilterNoVe
  };
  
  
- #if CV_SSE2
+ #if CV_SIMD
  
  ///////////////////////////////////// 8u-16s & 8u-8u //////////////////////////////////
  
@@@ -502,9 -503,6 +503,6 @@@ struct RowVec_8u32
  
      int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
          int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
          int* dst = (int*)_dst;
          const int* _kx = kernel.ptr<int>();
  
          if( smallValues )
          {
-             __m128i z = _mm_setzero_si128();
-             for( ; i <= width - 8; i += 8 )
+             for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
              {
                  const uchar* src = _src + i;
-                 __m128i s0 = z, s1 = z;
-                 for( k = 0; k < _ksize; k++, src += cn )
+                 v_int32 s0 = vx_setzero_s32();
+                 v_int32 s1 = vx_setzero_s32();
+                 v_int32 s2 = vx_setzero_s32();
+                 v_int32 s3 = vx_setzero_s32();
+                 k = 0;
+                 for (; k <= _ksize - 2; k += 2, src += 2 * cn)
+                 {
+                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                     v_uint8 x0, x1;
+                     v_zip(vx_load(src), vx_load(src + cn), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+                     s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+                     s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+                 }
+                 if (k < _ksize)
+                 {
+                     v_int32 f = vx_setall_s32(_kx[k]);
+                     v_uint16 x0, x1;
+                     v_expand(vx_load(src), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x0)), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x0)), v_reinterpret_as_s16(f));
+                     s2 += v_dotprod(v_reinterpret_as_s16(v_expand_low(x1)), v_reinterpret_as_s16(f));
+                     s3 += v_dotprod(v_reinterpret_as_s16(v_expand_high(x1)), v_reinterpret_as_s16(f));
+                 }
+                 v_store(dst + i, s0);
+                 v_store(dst + i + v_int32::nlanes, s1);
+                 v_store(dst + i + 2*v_int32::nlanes, s2);
+                 v_store(dst + i + 3*v_int32::nlanes, s3);
+             }
+             if( i <= width - v_uint16::nlanes )
+             {
+                 const uchar* src = _src + i;
+                 v_int32 s0 = vx_setzero_s32();
+                 v_int32 s1 = vx_setzero_s32();
+                 k = 0;
+                 for( ; k <= _ksize - 2; k += 2, src += 2*cn )
                  {
-                     __m128i f = _mm_cvtsi32_si128(_kx[k]);
-                     f = _mm_shuffle_epi32(f, 0);
-                     __m128i x0 = _mm_loadl_epi64((const __m128i*)src);
-                     x0 = _mm_unpacklo_epi8(x0, z);
-                     __m128i x1 = _mm_unpackhi_epi16(x0, z);
-                     x0 = _mm_unpacklo_epi16(x0, z);
-                     x0 = _mm_madd_epi16(x0, f);
-                     x1 = _mm_madd_epi16(x1, f);
-                     s0 = _mm_add_epi32(s0, x0);
-                     s1 = _mm_add_epi32(s1, x1);
+                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                     v_uint16 x0, x1;
+                     v_zip(vx_load_expand(src), vx_load_expand(src + cn), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
                  }
-                 _mm_store_si128((__m128i*)(dst + i), s0);
-                 _mm_store_si128((__m128i*)(dst + i + 4), s1);
+                 if( k < _ksize )
+                 {
+                     v_int32 f = vx_setall_s32(_kx[k]);
+                     v_uint32 x0, x1;
+                     v_expand(vx_load_expand(src), x0, x1);
+                     s0 += v_dotprod(v_reinterpret_as_s16(x0), v_reinterpret_as_s16(f));
+                     s1 += v_dotprod(v_reinterpret_as_s16(x1), v_reinterpret_as_s16(f));
+                 }
+                 v_store(dst + i, s0);
+                 v_store(dst + i + v_int32::nlanes, s1);
+                 i += v_uint16::nlanes;
              }
-             if( i <= width - 4 )
+             if( i <= width - v_uint32::nlanes )
              {
+                 v_int32 d = vx_setzero_s32();
+                 k = 0;
                  const uchar* src = _src + i;
-                 __m128i s0 = z;
-                 for( k = 0; k < _ksize; k++, src += cn )
+                 for (; k <= _ksize - 2; k += 2, src += 2*cn)
                  {
-                     __m128i f = _mm_cvtsi32_si128(_kx[k]);
-                     f = _mm_shuffle_epi32(f, 0);
-                     __m128i x0 = _mm_cvtsi32_si128(*(const int*)src);
-                     x0 = _mm_unpacklo_epi8(x0, z);
-                     x0 = _mm_unpacklo_epi16(x0, z);
-                     x0 = _mm_madd_epi16(x0, f);
-                     s0 = _mm_add_epi32(s0, x0);
+                     v_int32 f = vx_setall_s32((_kx[k] & 0xFFFF) | (_kx[k + 1] << 16));
+                     v_uint32 x0, x1;
+                     v_zip(vx_load_expand_q(src), vx_load_expand_q(src + cn), x0, x1);
+                     d += v_dotprod(v_pack(v_reinterpret_as_s32(x0), v_reinterpret_as_s32(x1)), v_reinterpret_as_s16(f));
                  }
-                 _mm_store_si128((__m128i*)(dst + i), s0);
-                 i += 4;
+                 if (k < _ksize)
+                     d += v_dotprod(v_reinterpret_as_s16(vx_load_expand_q(src)), v_reinterpret_as_s16(vx_setall_s32(_kx[k])));
+                 v_store(dst + i, d);
+                 i += v_uint32::nlanes;
              }
          }
          return i;
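
The SSE2 intrinsics above are rewritten with OpenCV's universal intrinsics (opencv2/core/hal/intrin.hpp), so a single source path covers whatever SIMD width the build enables. Below is a minimal standalone sketch of the load/expand/multiply/store pattern used throughout this file; scaleRow and the single-coefficient kernel are illustrative assumptions, not the actual filter code:

    #include <opencv2/core/hal/intrin.hpp>

    #if CV_SIMD
    // Multiply 8-bit pixels by one 16-bit coefficient and store 32-bit results,
    // processing v_uint8::nlanes pixels per iteration, with a scalar tail loop.
    static void scaleRow(const unsigned char* src, int* dst, int width, short coeff)
    {
        cv::v_int16 k = cv::vx_setall_s16(coeff);
        int i = 0;
        for( ; i <= width - cv::v_uint8::nlanes; i += cv::v_uint8::nlanes )
        {
            cv::v_uint16 lo, hi;
            cv::v_expand(cv::vx_load(src + i), lo, hi);                 // 8u -> 16u
            cv::v_int32 s0, s1, s2, s3;
            cv::v_mul_expand(cv::v_reinterpret_as_s16(lo), k, s0, s1);  // 16s x 16s -> 32s
            cv::v_mul_expand(cv::v_reinterpret_as_s16(hi), k, s2, s3);
            cv::v_store(dst + i, s0);
            cv::v_store(dst + i +   cv::v_int32::nlanes, s1);
            cv::v_store(dst + i + 2*cv::v_int32::nlanes, s2);
            cv::v_store(dst + i + 3*cv::v_int32::nlanes, s3);
        }
        for( ; i < width; i++ )
            dst[i] = src[i] * coeff;
    }
    #endif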
@@@ -590,9 -617,6 +617,6 @@@ struct SymmRowSmallVec_8u32
  
      int operator()(const uchar* src, uchar* _dst, int width, int cn) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
          int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1;
          int* dst = (int*)_dst;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          src += (_ksize/2)*cn;
          width *= cn;
  
-         __m128i z = _mm_setzero_si128();
          if( symmetrical )
          {
              if( _ksize == 1 )
              if( _ksize == 3 )
              {
                  if( kx[0] == 2 && kx[1] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         x1l = v_add_wrap(v_add_wrap(x1l, x1l), v_add_wrap(x0l, x2l));
+                         x1h = v_add_wrap(v_add_wrap(x1h, x1h), v_add_wrap(x0h, x2h));
+                         v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x1l)));
+                         v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1l)));
+                         v_store(dst + i + 2*v_int32::nlanes, v_reinterpret_as_s32(v_expand_low(x1h)));
+                         v_store(dst + i + 3*v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x1h)));
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_uint16 x = vx_load_expand(src);
+                         x = v_add_wrap(v_add_wrap(x, x), v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)));
+                         v_store(dst + i, v_reinterpret_as_s32(v_expand_low(x)));
+                         v_store(dst + i + v_int32::nlanes, v_reinterpret_as_s32(v_expand_high(x)));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if( i <= width - v_uint32::nlanes )
                      {
-                         __m128i x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_si128((__m128i*)(src - cn));
-                         x1 = _mm_loadu_si128((__m128i*)src);
-                         x2 = _mm_loadu_si128((__m128i*)(src + cn));
-                         y0 = _mm_unpackhi_epi8(x0, z);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         y1 = _mm_unpackhi_epi8(x1, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         y2 = _mm_unpackhi_epi8(x2, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2));
-                         y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z));
+                         v_uint32 x = vx_load_expand_q(src);
+                         x = (x + x) + vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn);
+                         v_store(dst + i, v_reinterpret_as_s32(x));
+                         i += v_uint32::nlanes;
                      }
+                 }
                  else if( kx[0] == -2 && kx[1] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+                         x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+                         v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_uint16 x = vx_load_expand(src);
+                         x = v_sub_wrap(v_add_wrap(vx_load_expand(src - cn), vx_load_expand(src + cn)), v_add_wrap(x, x));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if( i <= width - v_uint32::nlanes )
                      {
-                         __m128i x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_si128((__m128i*)(src - cn));
-                         x1 = _mm_loadu_si128((__m128i*)src);
-                         x2 = _mm_loadu_si128((__m128i*)(src + cn));
-                         y0 = _mm_unpackhi_epi8(x0, z);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         y1 = _mm_unpackhi_epi8(x1, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         y2 = _mm_unpackhi_epi8(x2, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
-                         y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                         v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+                         x = v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) - (x + x);
+                         v_store(dst + i, x);
+                         i += v_uint32::nlanes;
                      }
+                 }
                  else
                  {
-                     __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
-                             k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0);
-                     k1 = _mm_packs_epi32(k1, k1);
-                     for( ; i <= width - 8; i += 8, src += 8 )
+                     v_int16 k0 = vx_setall_s16((short)kx[0]);
+                     v_int16 k1 = vx_setall_s16((short)kx[1]);
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         v_int32 dl, dh;
+                         v_int16 x0, x1;
+                         v_mul_expand(v_reinterpret_as_s16(x1l), k0, dl, dh);
+                         v_zip(v_reinterpret_as_s16(x0l), v_reinterpret_as_s16(x2l), x0, x1);
+                         dl += v_dotprod(x0, k1);
+                         dh += v_dotprod(x1, k1);
+                         v_store(dst + i, dl);
+                         v_store(dst + i + v_int32::nlanes, dh);
+                         v_mul_expand(v_reinterpret_as_s16(x1h), k0, dl, dh);
+                         v_zip(v_reinterpret_as_s16(x0h), v_reinterpret_as_s16(x2h), x0, x1);
+                         dl += v_dotprod(x0, k1);
+                         dh += v_dotprod(x1, k1);
+                         v_store(dst + i + 2*v_int32::nlanes, dl);
+                         v_store(dst + i + 3*v_int32::nlanes, dh);
+                     }
+                     if ( i <= width - v_uint16::nlanes )
+                     {
+                         v_int32 dl, dh;
+                         v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, dl, dh);
+                         v_int16 x0, x1;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn)), v_reinterpret_as_s16(vx_load_expand(src + cn)), x0, x1);
+                         dl += v_dotprod(x0, k1);
+                         dh += v_dotprod(x1, k1);
+                         v_store(dst + i, dl);
+                         v_store(dst + i + v_int32::nlanes, dh);
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if ( i <= width - v_uint32::nlanes )
                      {
-                         __m128i x0 = _mm_loadl_epi64((__m128i*)(src - cn));
-                         __m128i x1 = _mm_loadl_epi64((__m128i*)src);
-                         __m128i x2 = _mm_loadl_epi64((__m128i*)(src + cn));
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         __m128i x3 = _mm_unpacklo_epi16(x0, x2);
-                         __m128i x4 = _mm_unpackhi_epi16(x0, x2);
-                         __m128i x5 = _mm_unpacklo_epi16(x1, z);
-                         __m128i x6 = _mm_unpackhi_epi16(x1, z);
-                         x3 = _mm_madd_epi16(x3, k1);
-                         x4 = _mm_madd_epi16(x4, k1);
-                         x5 = _mm_madd_epi16(x5, k0);
-                         x6 = _mm_madd_epi16(x6, k0);
-                         x3 = _mm_add_epi32(x3, x5);
-                         x4 = _mm_add_epi32(x4, x6);
-                         _mm_store_si128((__m128i*)(dst + i), x3);
-                         _mm_store_si128((__m128i*)(dst + i + 4), x4);
+                         v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]), v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)) * vx_setall_s32(kx[1])));
+                         i += v_uint32::nlanes;
                      }
                  }
              }
              else if( _ksize == 5 )
              {
                  if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h;
+                         v_expand(vx_load(src - 2*cn), x0l, x0h);
+                         v_expand(vx_load(src), x1l, x1h);
+                         v_expand(vx_load(src + 2*cn), x2l, x2h);
+                         x1l = v_sub_wrap(v_add_wrap(x0l, x2l), v_add_wrap(x1l, x1l));
+                         x1h = v_sub_wrap(v_add_wrap(x0h, x2h), v_add_wrap(x1h, x1h));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1l)));
+                         v_store(dst + i + 2*v_int32::nlanes, v_expand_low(v_reinterpret_as_s16(x1h)));
+                         v_store(dst + i + 3*v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x1h)));
+                     }
+                     if( i <= width - v_uint16::nlanes )
                      {
-                         __m128i x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_si128((__m128i*)(src - cn*2));
-                         x1 = _mm_loadu_si128((__m128i*)src);
-                         x2 = _mm_loadu_si128((__m128i*)(src + cn*2));
-                         y0 = _mm_unpackhi_epi8(x0, z);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         y1 = _mm_unpackhi_epi8(x1, z);
-                         x1 = _mm_unpacklo_epi8(x1, z);
-                         y2 = _mm_unpackhi_epi8(x2, z);
-                         x2 = _mm_unpacklo_epi8(x2, z);
-                         x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1)));
-                         y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1)));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                         v_uint16 x = vx_load_expand(src);
+                         x = v_sub_wrap(v_add_wrap(vx_load_expand(src - 2*cn), vx_load_expand(src + 2*cn)), v_add_wrap(x, x));
+                         v_store(dst + i, v_expand_low(v_reinterpret_as_s16(x)));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(v_reinterpret_as_s16(x)));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
                      }
+                     if( i <= width - v_uint32::nlanes )
+                     {
+                         v_int32 x = v_reinterpret_as_s32(vx_load_expand_q(src));
+                         x = v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) - (x + x);
+                         v_store(dst + i, x);
+                         i += v_uint32::nlanes;
+                     }
+                 }
                  else
                  {
-                     __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0),
-                             k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0),
-                             k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0);
-                     k1 = _mm_packs_epi32(k1, k1);
-                     k2 = _mm_packs_epi32(k2, k2);
+                     v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                     v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_int32 x0, x1, x2, x3;
+                         v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+                         v_int16 xl, xh;
+                         v_expand(vx_load(src), x0l, x0h);
+                         v_mul_expand(v_reinterpret_as_s16(x0l), k0, x0, x1);
+                         v_mul_expand(v_reinterpret_as_s16(x0h), k0, x2, x3);
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src + cn), x1l, x1h);
+                         v_expand(vx_load(src - 2*cn), x2l, x2h);
+                         v_expand(vx_load(src + 2*cn), x3l, x3h);
+                         v_zip(v_reinterpret_as_s16(x0l + x1l), v_reinterpret_as_s16(x2l + x3l), xl, xh);
+                         x0 += v_dotprod(xl, k12);
+                         x1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(x0h + x1h), v_reinterpret_as_s16(x2h + x3h), xl, xh);
+                         x2 += v_dotprod(xl, k12);
+                         x3 += v_dotprod(xh, k12);
+                         v_store(dst + i, x0);
+                         v_store(dst + i + v_int32::nlanes, x1);
+                         v_store(dst + i + 2*v_int32::nlanes, x2);
+                         v_store(dst + i + 3*v_int32::nlanes, x3);
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_int32 x1, x2;
+                         v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, x1, x2);
  
-                     for( ; i <= width - 8; i += 8, src += 8 )
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - cn) + vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - 2*cn) + vx_load_expand(src + 2*cn)), xl, xh);
+                         x1 += v_dotprod(xl, k12);
+                         x2 += v_dotprod(xh, k12);
+                         v_store(dst + i, x1);
+                         v_store(dst + i + v_int32::nlanes, x2);
+                         i += v_uint16::nlanes, src += v_uint16::nlanes;
+                     }
+                     if( i <= width - v_uint32::nlanes )
+                     {
+                         v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src)), vx_setall_s32(kx[0]),
+                                          v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - cn) + vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]),
+                                                   v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn) + vx_load_expand_q(src + 2*cn)) * vx_setall_s32(kx[2]))));
+                         i += v_uint32::nlanes;
+                     }
+                 }
+             }
+             else
+             {
+                 v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                 for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                 {
+                     v_uint8 v_src = vx_load(src);
+                     v_int32 s0, s1, s2, s3;
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+                     for (k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn)
+                     {
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src - j - cn);
+                         v_uint8 v_src2 = vx_load(src + j);
+                         v_uint8 v_src3 = vx_load(src + j + cn);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src0) + v_expand_low(v_src2)), v_reinterpret_as_s16(v_expand_low(v_src1) + v_expand_low(v_src3)), xl, xh);
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src0) + v_expand_high(v_src2)), v_reinterpret_as_s16(v_expand_high(v_src1) + v_expand_high(v_src3)), xl, xh);
+                         s2 += v_dotprod(xl, k12);
+                         s3 += v_dotprod(xh, k12);
+                     }
+                     if( k < _ksize / 2 + 1 )
+                     {
+                         v_int16 k1 = vx_setall_s16((short)(kx[k]));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src + j);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src0)), v_reinterpret_as_s16(v_expand_low(v_src1)), xl, xh);
+                         s0 += v_dotprod(xl, k1);
+                         s1 += v_dotprod(xh, k1);
+                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src0)), v_reinterpret_as_s16(v_expand_high(v_src1)), xl, xh);
+                         s2 += v_dotprod(xl, k1);
+                         s3 += v_dotprod(xh, k1);
+                     }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     v_store(dst + i + 2*v_int32::nlanes, s2);
+                     v_store(dst + i + 3*v_int32::nlanes, s3);
+                 }
+                 if( i <= width - v_uint16::nlanes )
+                 {
+                     v_int32 s0, s1;
+                     v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+                     for (k = 1, j = cn; k <= _ksize / 2 - 1; k+=2, j += 2*cn)
+                     {
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - j) + vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j - cn) + vx_load_expand(src + j + cn)), xl, xh);
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k+1] << 16)));
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                     }
+                     if ( k < _ksize / 2 + 1 )
                      {
-                         __m128i x0 = _mm_loadl_epi64((__m128i*)src);
-                         x0 = _mm_unpacklo_epi8(x0, z);
-                         __m128i x1 = _mm_unpacklo_epi16(x0, z);
-                         __m128i x2 = _mm_unpackhi_epi16(x0, z);
-                         x1 = _mm_madd_epi16(x1, k0);
-                         x2 = _mm_madd_epi16(x2, k0);
-                         __m128i x3 = _mm_loadl_epi64((__m128i*)(src - cn));
-                         __m128i x4 = _mm_loadl_epi64((__m128i*)(src + cn));
-                         x3 = _mm_unpacklo_epi8(x3, z);
-                         x4 = _mm_unpacklo_epi8(x4, z);
-                         __m128i x5 = _mm_unpacklo_epi16(x3, x4);
-                         __m128i x6 = _mm_unpackhi_epi16(x3, x4);
-                         x5 = _mm_madd_epi16(x5, k1);
-                         x6 = _mm_madd_epi16(x6, k1);
-                         x1 = _mm_add_epi32(x1, x5);
-                         x2 = _mm_add_epi32(x2, x6);
-                         x3 = _mm_loadl_epi64((__m128i*)(src - cn*2));
-                         x4 = _mm_loadl_epi64((__m128i*)(src + cn*2));
-                         x3 = _mm_unpacklo_epi8(x3, z);
-                         x4 = _mm_unpacklo_epi8(x4, z);
-                         x5 = _mm_unpacklo_epi16(x3, x4);
-                         x6 = _mm_unpackhi_epi16(x3, x4);
-                         x5 = _mm_madd_epi16(x5, k2);
-                         x6 = _mm_madd_epi16(x6, k2);
-                         x1 = _mm_add_epi32(x1, x5);
-                         x2 = _mm_add_epi32(x2, x6);
-                         _mm_store_si128((__m128i*)(dst + i), x1);
-                         _mm_store_si128((__m128i*)(dst + i + 4), x2);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src - j)), v_reinterpret_as_s16(vx_load_expand(src + j)), xl, xh);
+                         v_int16 k1 = vx_setall_s16((short)(kx[k]));
+                         s0 += v_dotprod(xl, k1);
+                         s1 += v_dotprod(xh, k1);
                      }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     i += v_uint16::nlanes; src += v_uint16::nlanes;
+                 }
+                 if( i <= width - v_uint32::nlanes )
+                 {
+                     v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                     for( k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn )
+                         s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src - j) + vx_load_expand_q(src + j)), vx_setall_s32(kx[k]), s0);
+                     v_store(dst + i, s0);
+                     i += v_uint32::nlanes;
                  }
              }
          }
              if( _ksize == 3 )
              {
                  if( kx[0] == 0 && kx[1] == 1 )
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                 {
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(x2l, x0l));
+                         v_int16 dh = v_reinterpret_as_s16(v_sub_wrap(x2h, x0h));
+                         v_store(dst + i, v_expand_low(dl));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+                         v_store(dst + i + 2*v_int32::nlanes, v_expand_low(dh));
+                         v_store(dst + i + 3*v_int32::nlanes, v_expand_high(dh));
+                     }
+                     if( i <= width - v_uint16::nlanes )
                      {
-                         __m128i x0, x1, y0;
-                         x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                         x1 = _mm_loadu_si128((__m128i*)(src - cn));
-                         y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z));
-                         x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z));
-                         _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16));
-                         _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16));
+                         v_int16 dl = v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn)));
+                         v_store(dst + i, v_expand_low(dl));
+                         v_store(dst + i + v_int32::nlanes, v_expand_high(dl));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
                      }
+                     if (i <= width - v_uint32::nlanes)
+                     {
+                         v_store(dst + i, v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)));
+                         i += v_uint32::nlanes;
+                     }
+                 }
                  else
                  {
-                     __m128i k0 = _mm_set_epi32(-kx[1], kx[1], -kx[1], kx[1]);
-                     k0 = _mm_packs_epi32(k0, k0);
-                     for( ; i <= width - 16; i += 16, src += 16 )
+                     v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (-kx[1] << 16)));
+                     for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                     {
+                         v_uint16 x0l, x0h, x2l, x2h;
+                         v_expand(vx_load(src - cn), x0l, x0h);
+                         v_expand(vx_load(src + cn), x2l, x2h);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(x2l), v_reinterpret_as_s16(x0l), xl, xh);
+                         v_store(dst + i, v_dotprod(xl, k0));
+                         v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+                         v_zip(v_reinterpret_as_s16(x2h), v_reinterpret_as_s16(x0h), xl, xh);
+                         v_store(dst + i + 2*v_int32::nlanes, v_dotprod(xl, k0));
+                         v_store(dst + i + 3*v_int32::nlanes, v_dotprod(xh, k0));
+                     }
+                     if( i <= width - v_uint16::nlanes )
+                     {
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src + cn)), v_reinterpret_as_s16(vx_load_expand(src - cn)), xl, xh);
+                         v_store(dst + i, v_dotprod(xl, k0));
+                         v_store(dst + i + v_int32::nlanes, v_dotprod(xh, k0));
+                         i += v_uint16::nlanes; src += v_uint16::nlanes;
+                     }
+                     if (i <= width - v_uint32::nlanes)
                      {
-                         __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                         __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-                         __m128i x2 = _mm_unpacklo_epi8(x0, z);
-                         __m128i x3 = _mm_unpacklo_epi8(x1, z);
-                         __m128i x4 = _mm_unpackhi_epi8(x0, z);
-                         __m128i x5 = _mm_unpackhi_epi8(x1, z);
-                         __m128i x6 = _mm_unpacklo_epi16(x2, x3);
-                         __m128i x7 = _mm_unpacklo_epi16(x4, x5);
-                         __m128i x8 = _mm_unpackhi_epi16(x2, x3);
-                         __m128i x9 = _mm_unpackhi_epi16(x4, x5);
-                         x6 = _mm_madd_epi16(x6, k0);
-                         x7 = _mm_madd_epi16(x7, k0);
-                         x8 = _mm_madd_epi16(x8, k0);
-                         x9 = _mm_madd_epi16(x9, k0);
-                         _mm_store_si128((__m128i*)(dst + i), x6);
-                         _mm_store_si128((__m128i*)(dst + i + 4), x8);
-                         _mm_store_si128((__m128i*)(dst + i + 8), x7);
-                         _mm_store_si128((__m128i*)(dst + i + 12), x9);
+                         v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)), vx_setall_s32(kx[1]), v_reinterpret_as_s32(vx_load_expand_q(src - cn)) * vx_setall_s32(-kx[1])));
+                         i += v_uint32::nlanes;
                      }
                  }
              }
              else if( _ksize == 5 )
              {
-                 __m128i k0 = _mm_loadl_epi64((__m128i*)(kx + 1));
-                 k0 = _mm_unpacklo_epi64(k0, k0);
-                 k0 = _mm_packs_epi32(k0, k0);
-                 for( ; i <= width - 16; i += 16, src += 16 )
+                 v_int16 k0 = v_reinterpret_as_s16(vx_setall_s32((kx[1] & 0xFFFF) | (kx[2] << 16)));
+                 for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                 {
+                     v_uint16 x0l, x0h, x1l, x1h, x2l, x2h, x3l, x3h;
+                     v_expand(vx_load(src - cn), x0l, x0h);
+                     v_expand(vx_load(src - 2*cn), x1l, x1h);
+                     v_expand(vx_load(src + cn), x2l, x2h);
+                     v_expand(vx_load(src + 2*cn), x3l, x3h);
+                     v_int16 x0, x1;
+                     v_zip(v_reinterpret_as_s16(v_sub_wrap(x2l, x0l)), v_reinterpret_as_s16(v_sub_wrap(x3l, x1l)), x0, x1);
+                     v_store(dst + i, v_dotprod(x0, k0));
+                     v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+                     v_zip(v_reinterpret_as_s16(v_sub_wrap(x2h, x0h)), v_reinterpret_as_s16(v_sub_wrap(x3h, x1h)), x0, x1);
+                     v_store(dst + i + 2*v_int32::nlanes, v_dotprod(x0, k0));
+                     v_store(dst + i + 3*v_int32::nlanes, v_dotprod(x1, k0));
+                 }
+                 if( i <= width - v_uint16::nlanes )
+                 {
+                     v_int16 x0, x1;
+                     v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + cn), vx_load_expand(src - cn))),
+                           v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + 2*cn), vx_load_expand(src - 2*cn))), x0, x1);
+                     v_store(dst + i, v_dotprod(x0, k0));
+                     v_store(dst + i + v_int32::nlanes, v_dotprod(x1, k0));
+                     i += v_uint16::nlanes; src += v_uint16::nlanes;
+                 }
+                 if( i <= width - v_uint32::nlanes )
                  {
-                     __m128i x0 = _mm_loadu_si128((__m128i*)(src + cn));
-                     __m128i x1 = _mm_loadu_si128((__m128i*)(src - cn));
-                     __m128i x2 = _mm_unpackhi_epi8(x0, z);
-                     __m128i x3 = _mm_unpackhi_epi8(x1, z);
-                     x0 = _mm_unpacklo_epi8(x0, z);
-                     x1 = _mm_unpacklo_epi8(x1, z);
-                     __m128i x5 = _mm_sub_epi16(x2, x3);
-                     __m128i x4 = _mm_sub_epi16(x0, x1);
-                     __m128i x6 = _mm_loadu_si128((__m128i*)(src + cn * 2));
-                     __m128i x7 = _mm_loadu_si128((__m128i*)(src - cn * 2));
-                     __m128i x8 = _mm_unpackhi_epi8(x6, z);
-                     __m128i x9 = _mm_unpackhi_epi8(x7, z);
-                     x6 = _mm_unpacklo_epi8(x6, z);
-                     x7 = _mm_unpacklo_epi8(x7, z);
-                     __m128i x11 = _mm_sub_epi16(x8, x9);
-                     __m128i x10 = _mm_sub_epi16(x6, x7);
-                     __m128i x13 = _mm_unpackhi_epi16(x5, x11);
-                     __m128i x12 = _mm_unpackhi_epi16(x4, x10);
-                     x5 = _mm_unpacklo_epi16(x5, x11);
-                     x4 = _mm_unpacklo_epi16(x4, x10);
-                     x5 = _mm_madd_epi16(x5, k0);
-                     x4 = _mm_madd_epi16(x4, k0);
-                     x13 = _mm_madd_epi16(x13, k0);
-                     x12 = _mm_madd_epi16(x12, k0);
-                     _mm_store_si128((__m128i*)(dst + i), x4);
-                     _mm_store_si128((__m128i*)(dst + i + 4), x12);
-                     _mm_store_si128((__m128i*)(dst + i + 8), x5);
-                     _mm_store_si128((__m128i*)(dst + i + 12), x13);
+                     v_store(dst + i, v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - cn)), vx_setall_s32(kx[1]),
+                                              (v_reinterpret_as_s32(vx_load_expand_q(src + 2*cn)) - v_reinterpret_as_s32(vx_load_expand_q(src - 2*cn))) * vx_setall_s32(kx[2])));
+                     i += v_uint32::nlanes;
                  }
              }
-         }
-         src -= (_ksize/2)*cn;
-         kx -= _ksize/2;
-         for( ; i <= width - 4; i += 4, src += 4 )
-         {
-             __m128i s0 = z;
-             for( k = j = 0; k < _ksize; k++, j += cn )
+             else
              {
-                 __m128i f = _mm_cvtsi32_si128(kx[k]);
-                 f = _mm_shuffle_epi32(f, 0);
-                 __m128i x0 = _mm_cvtsi32_si128(*(const int*)(src + j));
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 x0 = _mm_unpacklo_epi16(x0, z);
-                 x0 = _mm_madd_epi16(x0, f);
-                 s0 = _mm_add_epi32(s0, x0);
+                 v_int16 k0 = vx_setall_s16((short)(kx[0]));
+                 for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes, src += v_uint8::nlanes )
+                 {
+                     v_uint8 v_src = vx_load(src);
+                     v_int32 s0, s1, s2, s3;
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_low(v_src)), k0, s0, s1);
+                     v_mul_expand(v_reinterpret_as_s16(v_expand_high(v_src)), k0, s2, s3);
+                     for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+                     {
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src - j - cn);
+                         v_uint8 v_src2 = vx_load(src + j);
+                         v_uint8 v_src3 = vx_load(src + j + cn);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src2), v_expand_low(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_low(v_src3), v_expand_low(v_src1))), xl, xh);
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src2), v_expand_high(v_src0))), v_reinterpret_as_s16(v_sub_wrap(v_expand_high(v_src3), v_expand_high(v_src1))), xl, xh);
+                         s2 += v_dotprod(xl, k12);
+                         s3 += v_dotprod(xh, k12);
+                     }
+                     if( k < _ksize / 2 + 1 )
+                     {
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+                         v_uint8 v_src0 = vx_load(src - j);
+                         v_uint8 v_src1 = vx_load(src + j);
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_expand_low(v_src1)), v_reinterpret_as_s16(v_expand_low(v_src0)), xl, xh);
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                         v_zip(v_reinterpret_as_s16(v_expand_high(v_src1)), v_reinterpret_as_s16(v_expand_high(v_src0)), xl, xh);
+                         s2 += v_dotprod(xl, k12);
+                         s3 += v_dotprod(xh, k12);
+                     }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     v_store(dst + i + 2*v_int32::nlanes, s2);
+                     v_store(dst + i + 3*v_int32::nlanes, s3);
+                 }
+                 if( i <= width - v_uint16::nlanes )
+                 {
+                     v_int32 s0, s1;
+                     v_mul_expand(v_reinterpret_as_s16(vx_load_expand(src)), k0, s0, s1);
+                     for( k = 1, j = cn; k <= _ksize / 2 - 1; k += 2, j += 2 * cn )
+                     {
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j), vx_load_expand(src - j))), v_reinterpret_as_s16(v_sub_wrap(vx_load_expand(src + j + cn), vx_load_expand(src - j - cn))), xl, xh);
+                         v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (kx[k + 1] << 16)));
+                         s0 += v_dotprod(xl, k12);
+                         s1 += v_dotprod(xh, k12);
+                     }
+                     if( k < _ksize / 2 + 1 )
+                     {
+                         v_int16 k1 = v_reinterpret_as_s16(vx_setall_s32((kx[k] & 0xFFFF) | (-kx[k] << 16)));
+                         v_int16 xl, xh;
+                         v_zip(v_reinterpret_as_s16(vx_load_expand(src + j)), v_reinterpret_as_s16(vx_load_expand(src - j)), xl, xh);
+                         s0 += v_dotprod(xl, k1);
+                         s1 += v_dotprod(xh, k1);
+                     }
+                     v_store(dst + i, s0);
+                     v_store(dst + i + v_int32::nlanes, s1);
+                     i += v_uint16::nlanes; src += v_uint16::nlanes;
+                 }
+                 if( i <= width - v_uint32::nlanes )
+                 {
+                     v_int32 s0 = v_reinterpret_as_s32(vx_load_expand_q(src)) * vx_setall_s32(kx[0]);
+                     for (k = 1, j = cn; k < _ksize / 2 + 1; k++, j += cn)
+                         s0 = v_muladd(v_reinterpret_as_s32(vx_load_expand_q(src + j)) - v_reinterpret_as_s32(vx_load_expand_q(src - j)), vx_setall_s32(kx[k]), s0);
+                     v_store(dst + i, s0);
+                     i += v_uint32::nlanes;
+                 }
              }
-             _mm_store_si128((__m128i*)(dst + i), s0);
          }
  
          return i;
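
The row filter above relies on a paired-tap trick: two 16-bit kernel coefficients are broadcast into every 32-bit lane, the pixel differences for the two taps are interleaved with v_zip, and v_dotprod then yields d0*kx[k] + d1*kx[k+1] per 32-bit output lane. Below is a minimal, self-contained sketch of the same pattern; it is illustrative only and not part of the patch — the tap values, buffer names and sizes are invented for the example.

// Illustrative sketch only (not part of the patch): the kx[k]/kx[k+1]
// pairing used by the row filter above. Taps and data are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    const int k1 = 3, k2 = 5;                       // two arbitrary 16-bit taps
    short a[v_int16::nlanes], b[v_int16::nlanes];   // per-tap pixel differences
    for (int i = 0; i < v_int16::nlanes; i++) { a[i] = (short)i; b[i] = (short)(i + 1); }

    // Broadcast the pair (k1, k2) into every 32-bit lane, viewed as 16-bit lanes.
    v_int16 k12 = v_reinterpret_as_s16(vx_setall_s32((k1 & 0xFFFF) | (k2 << 16)));

    // Interleave a and b so every adjacent 16-bit pair is (a_i, b_i) ...
    v_int16 xl, xh;
    v_zip(vx_load(a), vx_load(b), xl, xh);

    // ... then each 32-bit lane of v_dotprod is a_i*k1 + b_i*k2.
    int out[v_int32::nlanes];
    v_store(out, v_dotprod(xl, k12));
    printf("%d (expected %d)\n", out[0], a[0] * k1 + b[0] * k2);
#endif
    return 0;
}
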
@@@ -885,129 -1105,117 +1105,117 @@@ struct SymmColumnVec_32s8u
  
      int operator()(const uchar** _src, uchar* dst, int width) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
-         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+         int _ksize = kernel.rows + kernel.cols - 1;
+         int ksize2 = _ksize/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0, k;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          const int** src = (const int**)_src;
-         const __m128i *S, *S2;
-         __m128 d4 = _mm_set1_ps(delta);
  
+         v_float32 d4 = vx_setall_f32(delta);
          if( symmetrical )
          {
-             for( ; i <= width - 16; i += 16 )
-             {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128 s0, s1, s2, s3;
-                 __m128i x0, x1;
-                 S = (const __m128i*)(src[0] + i);
-                 s0 = _mm_cvtepi32_ps(_mm_load_si128(S));
-                 s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1));
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                 s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2));
-                 s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3));
-                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
+             if (_ksize == 1)
+                 return 0;
+             v_float32 f0 = vx_setall_f32(ky[0]);
+             for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
+             {
+                 const int* S = src[0] + i;
+                 v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+                 v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
+                 v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4);
+                 v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4);
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
-                     x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
-                     x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
+                     s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+                     s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f, s3);
                  }
-                 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-                 x0 = _mm_packus_epi16(x0, x1);
-                 _mm_storeu_si128((__m128i*)(dst + i), x0);
+                 v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_uint16::nlanes )
              {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128i x0;
-                 __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i)));
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+                 const int* S = src[0] + i;
+                 v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
+                 v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
                  }
-                 x0 = _mm_cvtps_epi32(s0);
-                 x0 = _mm_packs_epi32(x0, x0);
-                 x0 = _mm_packus_epi16(x0, x0);
-                 *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+                 i += v_uint16::nlanes;
+             }
+ #if CV_SIMD_WIDTH > 16
+             while( i <= width - v_int32x4::nlanes )
+ #else
+             if( i <= width - v_int32x4::nlanes )
+ #endif
+             {
+                 v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
+                 for( k = 1; k <= ksize2; k++ )
+                     s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+                 v_int32x4 s32 = v_round(s0);
+                 v_int16x8 s16 = v_pack(s32, s32);
+                 *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+                 i += v_int32x4::nlanes;
              }
          }
          else
          {
-             for( ; i <= width - 16; i += 16 )
+             for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
              {
-                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                 __m128i x0, x1;
-                 for( k = 1; k <= ksize2; k++ )
+                 v_float32 s0 = d4;
+                 v_float32 s1 = d4;
+                 v_float32 s2 = d4;
+                 v_float32 s3 = d4;
+                 for ( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
-                     x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2));
-                     x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
+                     s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2);
+                     s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f, s3);
                  }
-                 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-                 x0 = _mm_packus_epi16(x0, x1);
-                 _mm_storeu_si128((__m128i*)(dst + i), x0);
+                 v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_uint16::nlanes )
              {
-                 __m128 f, s0 = d4;
-                 __m128i x0;
-                 for( k = 1; k <= ksize2; k++ )
+                 v_float32 s0 = d4;
+                 v_float32 s1 = d4;
+                 for ( k = 1; k <= ksize2; k++ )
                  {
-                     S = (const __m128i*)(src[k] + i);
-                     S2 = (const __m128i*)(src[-k] + i);
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f));
+                     v_float32 f = vx_setall_f32(ky[k]);
+                     const int* S0 = src[k] + i;
+                     const int* S1 = src[-k] + i;
+                     s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
+                     s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
                  }
-                 x0 = _mm_cvtps_epi32(s0);
-                 x0 = _mm_packs_epi32(x0, x0);
-                 x0 = _mm_packus_epi16(x0, x0);
-                 *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
+                 v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+                 i += v_uint16::nlanes;
+             }
+ #if CV_SIMD_WIDTH > 16
+             while( i <= width - v_int32x4::nlanes )
+ #else
+             if( i <= width - v_int32x4::nlanes )
+ #endif
+             {
+                 v_float32x4 s0 = v_setall_f32(delta);
+                 for (k = 1; k <= ksize2; k++)
+                     s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
+                 v_int32x4 s32 = v_round(s0);
+                 v_int16x8 s16 = v_pack(s32, s32);
+                 *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+                 i += v_int32x4::nlanes;
              }
          }
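
The column filter above accumulates in float and narrows back to 8-bit with a round/pack/saturate chain (v_round -> v_pack -> v_pack_u). The standalone sketch below shows only that chain on arbitrary data; the buffer name and values are invented for illustration.

// Illustrative sketch only: the float -> uchar saturation chain used by the
// column filter above (v_round -> v_pack -> v_pack_u). Data is invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    float buf[4 * v_float32::nlanes];
    for (int i = 0; i < 4 * v_float32::nlanes; i++)
        buf[i] = 100.f * i - 50.f;                  // some values outside [0, 255]

    v_float32 s0 = vx_load(buf);
    v_float32 s1 = vx_load(buf +     v_float32::nlanes);
    v_float32 s2 = vx_load(buf + 2 * v_float32::nlanes);
    v_float32 s3 = vx_load(buf + 3 * v_float32::nlanes);

    uchar dst[v_uint8::nlanes];
    // round to int32, narrow to int16 (signed saturation), then to uint8
    // (unsigned saturation) -- the same store pattern as in the hunk above
    v_store(dst, v_pack_u(v_pack(v_round(s0), v_round(s1)),
                          v_pack(v_round(s2), v_round(s3))));
    printf("%d %d\n", dst[0], dst[v_uint8::nlanes - 1]);   // 0 and 255
#endif
    return 0;
}
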
  
@@@ -1033,9 -1241,6 +1241,6 @@@ struct SymmColumnSmallVec_32s16s
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
          int ksize2 = (kernel.rows + kernel.cols - 1)/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0;
          const int** src = (const int**)_src;
          const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
          short* dst = (short*)_dst;
-         __m128 df4 = _mm_set1_ps(delta);
-         __m128i d4 = _mm_cvtps_epi32(df4);
  
+         v_float32 df4 = vx_setall_f32(delta);
+         v_int32 d4 = v_round(df4);
          if( symmetrical )
          {
              if( ky[0] == 2 && ky[1] == 1 )
              {
-                 for( ; i <= width - 8; i += 8 )
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                 {
+                     v_int32 sl = vx_load(S1 + i);
+                     v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+                     v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh)));
+                 }
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128i s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_si128((__m128i*)(S0 + i));
-                     s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                     s2 = _mm_load_si128((__m128i*)(S1 + i));
-                     s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
-                     s4 = _mm_load_si128((__m128i*)(S2 + i));
-                     s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                     s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2)));
-                     s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3)));
-                     s0 = _mm_add_epi32(s0, d4);
-                     s1 = _mm_add_epi32(s1, d4);
-                     _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                     v_int32 s = vx_load(S1 + i);
+                     v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s));
+                     i += v_int32::nlanes;
                  }
              }
              else if( ky[0] == -2 && ky[1] == 1 )
              {
-                 for( ; i <= width - 8; i += 8 )
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                 {
+                     v_int32 sl = vx_load(S1 + i);
+                     v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
+                     v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh)));
+                 }
+                 if( i <= width - v_int32::nlanes )
+                 {
+                     v_int32 s = vx_load(S1 + i);
+                     v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s));
+                     i += v_int32::nlanes;
+                 }
+             }
+             else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) )
+             {
+                 v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)),
+                                             v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128i s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_si128((__m128i*)(S0 + i));
-                     s1 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                     s2 = _mm_load_si128((__m128i*)(S1 + i));
-                     s3 = _mm_load_si128((__m128i*)(S1 + i + 4));
-                     s4 = _mm_load_si128((__m128i*)(S2 + i));
-                     s5 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                     s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2)));
-                     s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3)));
-                     s0 = _mm_add_epi32(s0, d4);
-                     s1 = _mm_add_epi32(s1, d4);
-                     _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                     v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
+                     i += v_int32::nlanes;
                  }
              }
              else
              {
-                 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
+                 v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))),
+                                             v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128 s0, s1;
-                     s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i)));
-                     s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4)));
-                     s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4);
-                     s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4);
-                     __m128i x0, x1;
-                     x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)),
-                                        _mm_load_si128((__m128i*)(S2 + i)));
-                     x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)),
-                                        _mm_load_si128((__m128i*)(S2 + i + 4)));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
-                     x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                     _mm_storeu_si128((__m128i*)(dst + i), x0);
+                     v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
+                     i += v_int32::nlanes;
                  }
              }
          }
              {
                  if( ky[1] < 0 )
                      std::swap(S0, S2);
-                 for( ; i <= width - 8; i += 8 )
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128i s0, s1, s2, s3;
-                     s0 = _mm_load_si128((__m128i*)(S2 + i));
-                     s1 = _mm_load_si128((__m128i*)(S2 + i + 4));
-                     s2 = _mm_load_si128((__m128i*)(S0 + i));
-                     s3 = _mm_load_si128((__m128i*)(S0 + i + 4));
-                     s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4);
-                     s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4);
-                     _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1));
+                     v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
+                     i += v_int32::nlanes;
                  }
              }
              else
              {
-                 __m128 k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
+                 v_float32 k1 = vx_setall_f32(ky[1]);
+                 for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
+                     v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)),
+                                             v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
+                 if( i <= width - v_int32::nlanes )
                  {
-                     __m128 s0 = df4, s1 = df4;
-                     __m128i x0, x1;
-                     x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)),
-                                        _mm_load_si128((__m128i*)(S0 + i)));
-                     x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)),
-                                        _mm_load_si128((__m128i*)(S0 + i + 4)));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1));
-                     x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-                     _mm_storeu_si128((__m128i*)(dst + i), x0);
+                     v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)));
+                     i += v_int32::nlanes;
                  }
              }
          }
  
  struct RowVec_16s32f
  {
-     RowVec_16s32f() { sse2_supported = false; }
+     RowVec_16s32f() {}
      RowVec_16s32f( const Mat& _kernel )
      {
          kernel = _kernel;
-         sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
      }
  
      int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
      {
-         if( !sse2_supported )
-             return 0;
          int i = 0, k, _ksize = kernel.rows + kernel.cols - 1;
          float* dst = (float*)_dst;
          const float* _kx = kernel.ptr<float>();
          width *= cn;
  
-         for( ; i <= width - 8; i += 8 )
+         for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
          {
              const short* src = (const short*)_src + i;
-             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+             v_float32 s0 = vx_setzero_f32();
+             v_float32 s1 = vx_setzero_f32();
              for( k = 0; k < _ksize; k++, src += cn )
              {
-                 f = _mm_load_ss(_kx+k);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128i x0i = _mm_loadu_si128((const __m128i*)src);
-                 __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16);
-                 x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16);
-                 x0 = _mm_cvtepi32_ps(x0i);
-                 x1 = _mm_cvtepi32_ps(x1i);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
+                 v_int16 x = vx_load(src);
+                 s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0);
+                 s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1);
              }
-             _mm_store_ps(dst + i, s0);
-             _mm_store_ps(dst + i + 4, s1);
+             v_store(dst + i, s0);
+             v_store(dst + i + v_float32::nlanes, s1);
+         }
+         if( i <= width - v_float32::nlanes )
+         {
+             const short* src = (const short*)_src + i;
+             v_float32 s0 = vx_setzero_f32();
+             for( k = 0; k < _ksize; k++, src += cn )
+                 s0 = v_muladd(v_cvt_f32(vx_load_expand(src)), vx_setall_f32(_kx[k]), s0);
+             v_store(dst + i, s0);
+             i += v_float32::nlanes;
          }
          return i;
      }
  
      Mat kernel;
-     bool sse2_supported;
  };
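
RowVec_16s32f above widens 16-bit samples with v_expand_low/v_expand_high, converts them to float with v_cvt_f32 and accumulates with v_muladd. A small illustrative sketch of that widening step, with an invented single tap and input ramp, might look like this:

// Illustrative sketch only: the int16 -> float32 widening used by
// RowVec_16s32f above. The tap value and data are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    short src[v_int16::nlanes];
    for (int i = 0; i < v_int16::nlanes; i++) src[i] = (short)(i * 10);

    const float k = 0.25f;                          // one arbitrary kernel tap
    v_int16 x = vx_load(src);
    v_float32 s0 = vx_setzero_f32(), s1 = vx_setzero_f32();
    // widen the low/high halves to int32, convert to float, accumulate with FMA
    s0 = v_muladd(v_cvt_f32(v_expand_low(x)),  vx_setall_f32(k), s0);
    s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(k), s1);

    float dst[v_int16::nlanes];
    v_store(dst, s0);
    v_store(dst + v_float32::nlanes, s1);
    printf("%g %g\n", dst[0], dst[v_int16::nlanes - 1]);
#endif
    return 0;
}
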
  
  
  struct SymmColumnVec_32f16s
  {
-     SymmColumnVec_32f16s() { symmetryType=0; delta = 0; sse2_supported = false; }
+     SymmColumnVec_32f16s() { symmetryType=0; delta = 0; }
      SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
      {
          symmetryType = _symmetryType;
          kernel = _kernel;
          delta = (float)_delta;
          CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-         sse2_supported = checkHardwareSupport(CV_CPU_SSE2);
      }
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-         if( !sse2_supported )
-             return 0;
-         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
+         int _ksize = kernel.rows + kernel.cols - 1;
+         int ksize2 = _ksize / 2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0, k;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          const float** src = (const float**)_src;
-         const float *S, *S2;
          short* dst = (short*)_dst;
-         __m128 d4 = _mm_set1_ps(delta);
  
+         v_float32 d4 = vx_setall_f32(delta);
          if( symmetrical )
          {
-             for( ; i <= width - 16; i += 16 )
+             if (_ksize == 1)
+                 return 0;
+             v_float32 k0 = vx_setall_f32(ky[0]);
+             for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
              {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128 s0, s1, s2, s3;
-                 __m128 x0, x1;
-                 S = src[0] + i;
-                 s0 = _mm_load_ps(S);
-                 s1 = _mm_load_ps(S+4);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                 s2 = _mm_load_ps(S+8);
-                 s3 = _mm_load_ps(S+12);
-                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
+                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
+                 v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                     v_float32 k1 = vx_setall_f32(ky[k]);
+                     s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
+                     s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                  }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 __m128i s1i = _mm_cvtps_epi32(s1);
-                 __m128i s2i = _mm_cvtps_epi32(s2);
-                 __m128i s3i = _mm_cvtps_epi32(s3);
-                 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
-                 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_float32::nlanes )
              {
-                 __m128 f = _mm_load_ss(ky);
-                 f = _mm_shuffle_ps(f, f, 0);
-                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
+                 v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                  for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+                     s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_pack_store(dst + i, v_round(s0));
+                 i += v_float32::nlanes;
              }
          }
          else
          {
-             for( ; i <= width - 16; i += 16 )
+             for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
              {
-                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                 __m128 x0, x1;
-                 S = src[0] + i;
+                 v_float32 s0 = d4;
+                 v_float32 s1 = d4;
                  for( k = 1; k <= ksize2; k++ )
                  {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
+                     v_float32 k1 = vx_setall_f32(ky[k]);
+                     s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0);
+                     s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                  }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 __m128i s1i = _mm_cvtps_epi32(s1);
-                 __m128i s2i = _mm_cvtps_epi32(s2);
-                 __m128i s3i = _mm_cvtps_epi32(s3);
-                 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i));
-                 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i));
+                 v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
              }
-             for( ; i <= width - 4; i += 4 )
+             if( i <= width - v_float32::nlanes )
              {
-                 __m128 f, x0, s0 = d4;
+                 v_float32 s0 = d4;
                  for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_load_ss(ky+k);
-                     f = _mm_shuffle_ps(f, f, 0);
-                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 __m128i s0i = _mm_cvtps_epi32(s0);
-                 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i));
+                     s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_pack_store(dst + i, v_round(s0));
+                 i += v_float32::nlanes;
              }
          }
  
      int symmetryType;
      float delta;
      Mat kernel;
-     bool sse2_supported;
  };
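
SymmColumnVec_32f16s above narrows its float accumulators to 16-bit with v_pack after v_round, and uses the half-width v_pack_store for the tail of the row. A hedged, standalone illustration of those two stores (values are invented and chosen only to show the int16 saturation):

// Illustrative sketch only: the float -> short stores used by
// SymmColumnVec_32f16s above. Data is invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    float buf[2 * v_float32::nlanes];
    for (int i = 0; i < 2 * v_float32::nlanes; i++)
        buf[i] = 40000.f - 10000.f * i;             // first values exceed SHRT_MAX

    short full[v_int16::nlanes], tail[v_int32::nlanes];
    // full-width store: two rounded int32 vectors packed into one int16 vector
    v_store(full, v_pack(v_round(vx_load(buf)),
                         v_round(vx_load(buf + v_float32::nlanes))));
    // half-width tail store, as used at the end of the row
    v_pack_store(tail, v_round(vx_load(buf)));
    printf("%d %d\n", full[0], tail[0]);            // both saturate to 32767
#endif
    return 0;
}
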
  
  
@@@ -1357,7 -1479,6 +1479,6 @@@ struct RowVec_32f
  {
      RowVec_32f()
      {
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
  #if defined USE_IPP_SEP_FILTERS
          bufsz = -1;
      RowVec_32f( const Mat& _kernel )
      {
          kernel = _kernel;
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
  #if defined USE_IPP_SEP_FILTERS
          bufsz = -1;
          float* dst = (float*)_dst;
          const float* _kx = kernel.ptr<float>();
  
-         if( !haveSSE )
-             return 0;
          int i = 0, k;
          width *= cn;
  
          if (haveAVX2)
              return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
  #endif
-         for( ; i <= width - 8; i += 8 )
+         for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
          {
              const float* src = src0 + i;
-             __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1;
+             v_float32 s0 = vx_setzero_f32();
              for( k = 0; k < _ksize; k++, src += cn )
-             {
-                 f = _mm_set1_ps(_kx[k]);
-                 x0 = _mm_loadu_ps(src);
-                 x1 = _mm_loadu_ps(src + 4);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-             }
-             _mm_store_ps(dst + i, s0);
-             _mm_store_ps(dst + i + 4, s1);
+                 s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0);
+             v_store(dst + i, s0);
          }
          return i;
      }
  
      Mat kernel;
-     bool haveSSE;
      bool haveAVX2;
  #if defined USE_IPP_SEP_FILTERS
  private:
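
RowVec_32f above walks the row in steps of v_float32::nlanes, so the same source serves SSE, AVX2 and NEON register widths. As a rough sketch of that loop shape, here is a simplified single-channel row correlation with an explicit scalar tail; the function name, kernel and sizes are invented, and it omits the multi-channel and AVX2 paths of the real code.

// Illustrative sketch only: a width-agnostic single-channel row correlation
// in the same loop shape as RowVec_32f above. Names and sizes are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <vector>
#include <cstdio>

static void row_correlate(const float* src, float* dst, int width,
                          const float* kx, int ksize)
{
    int i = 0;
#if CV_SIMD
    using namespace cv;
    // main loop: one v_float32 register of outputs per iteration
    for (; i <= width - v_float32::nlanes; i += v_float32::nlanes)
    {
        v_float32 s = vx_setzero_f32();
        for (int k = 0; k < ksize; k++)
            s = v_muladd(vx_load(src + i + k), vx_setall_f32(kx[k]), s);
        v_store(dst + i, s);
    }
#endif
    for (; i < width; i++)                          // scalar tail
    {
        float s = 0.f;
        for (int k = 0; k < ksize; k++)
            s += src[i + k] * kx[k];
        dst[i] = s;
    }
}

int main()
{
    const int width = 37;
    const float kx[3] = { 0.25f, 0.5f, 0.25f };
    std::vector<float> src(width + 2, 1.f), dst(width);
    row_correlate(src.data(), dst.data(), width, kx, 3);
    printf("%g %g\n", dst[0], dst[width - 1]);      // a constant row stays 1
    return 0;
}
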
@@@ -1475,9 -1583,6 +1583,6 @@@ struct SymmRowSmallVec_32f
  
      int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
      {
-         if( !checkHardwareSupport(CV_CPU_SSE) )
-             return 0;
          int i = 0, _ksize = kernel.rows + kernel.cols - 1;
          float* dst = (float*)_dst;
          const float* src = (const float*)_src + (_ksize/2)*cn;
                  return 0;
              if( _ksize == 3 )
              {
-                 if( kx[0] == 2 && kx[1] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2));
-                         y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
-                 else if( kx[0] == -2 && kx[1] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
-                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                 if( fabs(kx[0]) == 2 && kx[1] == 1 )
+                 {
+                     v_float32 k0 = vx_setall_f32(kx[0]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn)));
+                 }
                  else
                  {
-                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
-                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
-                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
-                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1));
                  }
              }
              else if( _ksize == 5 )
              {
                  if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn*2);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn*2);
-                         y0 = _mm_loadu_ps(src - cn*2 + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn*2 + 4);
-                         x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1)));
-                         y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1)));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                 {
+                     v_float32 k0 = vx_setall_f32(-2);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn)));
+                 }
                  else
                  {
-                     __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x1, x2, y0, y1, y2;
-                         x0 = _mm_loadu_ps(src - cn);
-                         x1 = _mm_loadu_ps(src);
-                         x2 = _mm_loadu_ps(src + cn);
-                         y0 = _mm_loadu_ps(src - cn + 4);
-                         y1 = _mm_loadu_ps(src + 4);
-                         y2 = _mm_loadu_ps(src + cn + 4);
-                         x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1);
-                         y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1);
-                         x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0));
-                         y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0));
-                         x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
-                         y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
-                         x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
-                         y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     v_float32 k0 = vx_setall_f32(kx[0]), k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, v_muladd(vx_load(src + 2*cn) + vx_load(src - 2*cn), k2, v_muladd(vx_load(src), k0, (vx_load(src - cn) + vx_load(src + cn)) * k1)));
                  }
              }
          }
              if( _ksize == 3 )
              {
                  if( kx[0] == 0 && kx[1] == 1 )
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x2, y0, y2;
-                         x0 = _mm_loadu_ps(src + cn);
-                         x2 = _mm_loadu_ps(src - cn);
-                         y0 = _mm_loadu_ps(src + cn + 4);
-                         y2 = _mm_loadu_ps(src - cn + 4);
-                         x0 = _mm_sub_ps(x0, x2);
-                         y0 = _mm_sub_ps(y0, y2);
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, vx_load(src + cn) - vx_load(src - cn));
                  else
                  {
-                     __m128 k1 = _mm_set1_ps(kx[1]);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         __m128 x0, x2, y0, y2;
-                         x0 = _mm_loadu_ps(src + cn);
-                         x2 = _mm_loadu_ps(src - cn);
-                         y0 = _mm_loadu_ps(src + cn + 4);
-                         y2 = _mm_loadu_ps(src - cn + 4);
-                         x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
-                         y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
-                         _mm_store_ps(dst + i, x0);
-                         _mm_store_ps(dst + i + 4, y0);
-                     }
+                     v_float32 k1 = vx_setall_f32(kx[1]);
+                     for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                         v_store(dst + i, (vx_load(src + cn) - vx_load(src - cn)) * k1);
                  }
              }
              else if( _ksize == 5 )
              {
-                 __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]);
-                 for( ; i <= width - 8; i += 8, src += 8 )
-                 {
-                     __m128 x0, x2, y0, y2;
-                     x0 = _mm_loadu_ps(src + cn);
-                     x2 = _mm_loadu_ps(src - cn);
-                     y0 = _mm_loadu_ps(src + cn + 4);
-                     y2 = _mm_loadu_ps(src - cn + 4);
-                     x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1);
-                     y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1);
-                     x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2));
-                     y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4));
-                     x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2));
-                     y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2));
-                     _mm_store_ps(dst + i, x0);
-                     _mm_store_ps(dst + i + 4, y0);
-                 }
+                 v_float32 k1 = vx_setall_f32(kx[1]), k2 = vx_setall_f32(kx[2]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(src + 2*cn) - vx_load(src - 2*cn), k2, (vx_load(src + cn) - vx_load(src - cn)) * k1));
              }
          }
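
SymmRowSmallVec_32f above special-cases small fixed kernels; in particular the antisymmetric kx = {0, 1} case reduces to a plain difference of the two neighbouring samples, with no multiplications at all. A standalone sketch of that case for one channel (border handling, buffer names and data are invented for the example):

// Illustrative sketch only: the antisymmetric [-1, 0, 1] special case of the
// row filter above, for a single channel. Data and sizes are invented.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <vector>
#include <cstdio>

int main()
{
#if CV_SIMD
    using namespace cv;
    const int width = 32, cn = 1;
    std::vector<float> buf(width + 2);
    for (int i = 0; i < width + 2; i++) buf[i] = (float)(i * i);  // quadratic ramp
    const float* src = buf.data() + cn;             // skip the left border sample
    std::vector<float> dst(width);

    int i = 0;
    // kx = {0, 1}: dst[i] = src[i+cn] - src[i-cn]
    for (; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes)
        v_store(dst.data() + i, vx_load(src + cn) - vx_load(src - cn));
    for (; i < width; i++, src++)                   // scalar tail
        dst[i] = src[cn] - src[-cn];
    printf("%g %g\n", dst[1], dst[2]);              // central differences: 8, 12
#endif
    return 0;
}
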
  
@@@ -1661,7 -1659,6 +1659,6 @@@ struct SymmColumnVec_32f
  {
      SymmColumnVec_32f() {
          symmetryType=0;
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
          delta = 0;
      }
          symmetryType = _symmetryType;
          kernel = _kernel;
          delta = (float)_delta;
-         haveSSE = checkHardwareSupport(CV_CPU_SSE);
          haveAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
          CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
      }
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-         if( !haveSSE )
-             return 0;
          int ksize2 = (kernel.rows + kernel.cols - 1)/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0, k;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
          const float** src = (const float**)_src;
-         const float *S, *S2;
          float* dst = (float*)_dst;
  
          if( symmetrical )
              if (haveAVX2)
                  return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
  #endif
-             const __m128 d4 = _mm_set1_ps(delta);
-             for( ; i <= width - 16; i += 16 )
-             {
-                 __m128 f = _mm_set1_ps(ky[0]);
-                 __m128 s0, s1, s2, s3;
-                 __m128 x0, x1;
-                 S = src[0] + i;
-                 s0 = _mm_load_ps(S);
-                 s1 = _mm_load_ps(S+4);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4);
-                 s2 = _mm_load_ps(S+8);
-                 s3 = _mm_load_ps(S+12);
-                 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4);
-                 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4);
-                 for( k = 1; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_set1_ps(ky[k]);
-                     x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
-                 _mm_storeu_ps(dst + i + 4, s1);
-                 _mm_storeu_ps(dst + i + 8, s2);
-                 _mm_storeu_ps(dst + i + 12, s3);
-             }
-             for( ; i <= width - 4; i += 4 )
-             {
-                 __m128 f = _mm_set1_ps(ky[0]);
-                 __m128 x0, s0 = _mm_load_ps(src[0] + i);
-                 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);
-                 for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_set1_ps(ky[k]);
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
-             }
-         }
-         else
-         {
- #if CV_TRY_AVX2
-             if (haveAVX2)
-                 return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
- #endif
-             const __m128 d4 = _mm_set1_ps(delta);
-             for( ; i <= width - 16; i += 16 )
-             {
-                 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-                 __m128 x0, x1;
-                 S = src[0] + i;
-                 for( k = 1; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     f = _mm_set1_ps(ky[k]);
-                     x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f));
-                     x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8));
-                     x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12));
-                     s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f));
-                     s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
-                 _mm_storeu_ps(dst + i + 4, s1);
-                 _mm_storeu_ps(dst + i + 8, s2);
-                 _mm_storeu_ps(dst + i + 12, s3);
-             }
-             for( ; i <= width - 4; i += 4 )
+             const v_float32 d4 = vx_setall_f32(delta);
+             for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
              {
-                 __m128 f, x0, s0 = d4;
+                 v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4);
                  for( k = 1; k <= ksize2; k++ )
-                 {
-                     f = _mm_set1_ps(ky[k]);
-                     x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
-                 }
-                 _mm_storeu_ps(dst + i, s0);
+                     s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_store(dst + i, s0);
              }
          }
-         return i;
-     }
-     int symmetryType;
-     float delta;
-     Mat kernel;
-     bool haveSSE;
-     bool haveAVX2;
- };
- struct SymmColumnSmallVec_32f
- {
-     SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; }
-     SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
-     {
-         symmetryType = _symmetryType;
-         kernel = _kernel;
-         delta = (float)_delta;
-         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-     }
-     int operator()(const uchar** _src, uchar* _dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE) )
-             return 0;
-         int ksize2 = (kernel.rows + kernel.cols - 1)/2;
-         const float* ky = kernel.ptr<float>() + ksize2;
-         int i = 0;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const float** src = (const float**)_src;
-         const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
-         float* dst = (float*)_dst;
-         __m128 d4 = _mm_set1_ps(delta);
-         if( symmetrical )
-         {
-             if( ky[0] == 2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_ps(S0 + i);
-                     s1 = _mm_load_ps(S0 + i + 4);
-                     s2 = _mm_load_ps(S1 + i);
-                     s3 = _mm_load_ps(S1 + i + 4);
-                     s4 = _mm_load_ps(S2 + i);
-                     s5 = _mm_load_ps(S2 + i + 4);
-                     s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2)));
-                     s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3)));
-                     s0 = _mm_add_ps(s0, d4);
-                     s1 = _mm_add_ps(s1, d4);
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-             else if( ky[0] == -2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, s2, s3, s4, s5;
-                     s0 = _mm_load_ps(S0 + i);
-                     s1 = _mm_load_ps(S0 + i + 4);
-                     s2 = _mm_load_ps(S1 + i);
-                     s3 = _mm_load_ps(S1 + i + 4);
-                     s4 = _mm_load_ps(S2 + i);
-                     s5 = _mm_load_ps(S2 + i + 4);
-                     s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2)));
-                     s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3)));
-                     s0 = _mm_add_ps(s0, d4);
-                     s1 = _mm_add_ps(s1, d4);
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-             else
-             {
-                 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, x0, x1;
-                     s0 = _mm_load_ps(S1 + i);
-                     s1 = _mm_load_ps(S1 + i + 4);
-                     s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4);
-                     s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4);
-                     x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i));
-                     x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-         }
-         else
-         {
-             if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] )
-             {
-                 if( ky[1] < 0 )
-                     std::swap(S0, S2);
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0, s1, s2, s3;
-                     s0 = _mm_load_ps(S2 + i);
-                     s1 = _mm_load_ps(S2 + i + 4);
-                     s2 = _mm_load_ps(S0 + i);
-                     s3 = _mm_load_ps(S0 + i + 4);
-                     s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4);
-                     s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4);
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-             else
-             {
-                 __m128 k1 = _mm_set1_ps(ky[1]);
-                 for( ; i <= width - 8; i += 8 )
-                 {
-                     __m128 s0 = d4, s1 = d4, x0, x1;
-                     x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i));
-                     x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4));
-                     s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1));
-                     s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1));
-                     _mm_storeu_ps(dst + i, s0);
-                     _mm_storeu_ps(dst + i + 4, s1);
-                 }
-             }
-         }
-         return i;
-     }
-     int symmetryType;
-     float delta;
-     Mat kernel;
- };
- /////////////////////////////// non-separable filters ///////////////////////////////
- ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
- struct FilterVec_8u
- {
-     FilterVec_8u() { delta = 0; _nz = 0; }
-     FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
-     {
-         Mat kernel;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
-         std::vector<Point> coords;
-         preprocess2DKernel(kernel, coords, coeffs);
-         _nz = (int)coords.size();
-     }
-     int operator()(const uchar** src, uchar* dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
-         const float* kf = (const float*)&coeffs[0];
-         int i = 0, k, nz = _nz;
-         __m128 d4 = _mm_set1_ps(delta);
-         for( ; i <= width - 16; i += 16 )
-         {
-             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-             __m128i x0, x1, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0, t1;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
-                 x1 = _mm_unpackhi_epi8(x0, z);
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
-                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-             x0 = _mm_packus_epi16(x0, x1);
-             _mm_storeu_si128((__m128i*)(dst + i), x0);
-         }
-         for( ; i <= width - 4; i += 4 )
-         {
-             __m128 s0 = d4;
-             __m128i x0, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
-             x0 = _mm_packus_epi16(x0, x0);
-             *(int*)(dst + i) = _mm_cvtsi128_si32(x0);
-         }
-         return i;
-     }
-     int _nz;
-     std::vector<uchar> coeffs;
-     float delta;
- };
- struct FilterVec_8u16s
- {
-     FilterVec_8u16s() { delta = 0; _nz = 0; }
-     FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
-     {
-         Mat kernel;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
-         std::vector<Point> coords;
-         preprocess2DKernel(kernel, coords, coeffs);
-         _nz = (int)coords.size();
-     }
-     int operator()(const uchar** src, uchar* _dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE2) )
-             return 0;
-         const float* kf = (const float*)&coeffs[0];
-         short* dst = (short*)_dst;
-         int i = 0, k, nz = _nz;
-         __m128 d4 = _mm_set1_ps(delta);
-         for( ; i <= width - 16; i += 16 )
-         {
-             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-             __m128i x0, x1, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0, t1;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i));
-                 x1 = _mm_unpackhi_epi8(x0, z);
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z));
-                 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z));
-                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1));
-             x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3));
-             _mm_storeu_si128((__m128i*)(dst + i), x0);
-             _mm_storeu_si128((__m128i*)(dst + i + 8), x1);
-         }
-         for( ; i <= width - 4; i += 4 )
-         {
-             __m128 s0 = d4;
-             __m128i x0, z = _mm_setzero_si128();
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i));
-                 x0 = _mm_unpacklo_epi8(x0, z);
-                 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z));
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-             }
-             x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z);
-             _mm_storel_epi64((__m128i*)(dst + i), x0);
-         }
-         return i;
-     }
-     int _nz;
-     std::vector<uchar> coeffs;
-     float delta;
- };
- struct FilterVec_32f
- {
-     FilterVec_32f() { delta = 0; _nz = 0; }
-     FilterVec_32f(const Mat& _kernel, int, double _delta)
-     {
-         delta = (float)_delta;
-         std::vector<Point> coords;
-         preprocess2DKernel(_kernel, coords, coeffs);
-         _nz = (int)coords.size();
-     }
-     int operator()(const uchar** _src, uchar* _dst, int width) const
-     {
-         if( !checkHardwareSupport(CV_CPU_SSE) )
-             return 0;
-         const float* kf = (const float*)&coeffs[0];
-         const float** src = (const float**)_src;
-         float* dst = (float*)_dst;
-         int i = 0, k, nz = _nz;
-         __m128 d4 = _mm_set1_ps(delta);
-         for( ; i <= width - 16; i += 16 )
-         {
-             __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0, t1;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 const float* S = src[k] + i;
-                 t0 = _mm_loadu_ps(S);
-                 t1 = _mm_loadu_ps(S + 4);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-                 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));
-                 t0 = _mm_loadu_ps(S + 8);
-                 t1 = _mm_loadu_ps(S + 12);
-                 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
-                 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
-             }
-             _mm_storeu_ps(dst + i, s0);
-             _mm_storeu_ps(dst + i + 4, s1);
-             _mm_storeu_ps(dst + i + 8, s2);
-             _mm_storeu_ps(dst + i + 12, s3);
-         }
-         for( ; i <= width - 4; i += 4 )
-         {
-             __m128 s0 = d4;
-             for( k = 0; k < nz; k++ )
-             {
-                 __m128 f = _mm_load_ss(kf+k), t0;
-                 f = _mm_shuffle_ps(f, f, 0);
-                 t0 = _mm_loadu_ps(src[k] + i);
-                 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
-             }
-             _mm_storeu_ps(dst + i, s0);
-         }
-         return i;
-     }
-     int _nz;
-     std::vector<uchar> coeffs;
-     float delta;
- };
- #elif CV_NEON
- struct SymmRowSmallVec_8u32s
- {
-     SymmRowSmallVec_8u32s() { smallValues = false; }
-     SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType )
-     {
-         kernel = _kernel;
-         symmetryType = _symmetryType;
-         smallValues = true;
-         int k, ksize = kernel.rows + kernel.cols - 1;
-         for( k = 0; k < ksize; k++ )
-         {
-             int v = kernel.ptr<int>()[k];
-             if( v < SHRT_MIN || v > SHRT_MAX )
-             {
-                 smallValues = false;
-                 break;
-             }
-         }
-     }
-     int operator()(const uchar* src, uchar* _dst, int width, int cn) const
-     {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
-         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
-         int* dst = (int*)_dst;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const int* kx = kernel.ptr<int>() + _ksize/2;
-         if( !smallValues )
-             return 0;
-         src += (_ksize/2)*cn;
-         width *= cn;
-         if( symmetrical )
-         {
-             if( _ksize == 1 )
-                 return 0;
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 2 && kx[1] == 1 )
-                 {
-                     uint16x8_t zq = vdupq_n_u16(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1, x2;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src) );
-                         x2 = vld1_u8( (uint8_t *) (src + cn) );
-                         uint16x8_t y0, y1, y2;
-                         y0 = vaddl_u8(x0, x2);
-                         y1 = vshll_n_u8(x1, 1);
-                         y2 = vaddq_u16(y0, y1);
-                         uint16x8x2_t str;
-                         str.val[0] = y2; str.val[1] = zq;
-                         vst2q_u16( (uint16_t *) (dst + i), str );
-                     }
-                 }
-                 else if( kx[0] == -2 && kx[1] == 1 )
-                     return 0;
-                 else
-                 {
-                     int32x4_t k32 = vdupq_n_s32(0);
-                     k32 = vld1q_lane_s32(kx, k32, 0);
-                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                     int16x4_t k = vqmovn_s32(k32);
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1, x2;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src) );
-                         x2 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0, y1;
-                         int32x4_t y2, y3;
-                         y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
-                         y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
-                         y2 = vmull_lane_s16(vget_low_s16(y0), k, 0);
-                         y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1);
-                         y3 = vmull_lane_s16(vget_high_s16(y0), k, 0);
-                         y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1);
-                         vst1q_s32((int32_t *)(dst + i), y2);
-                         vst1q_s32((int32_t *)(dst + i + 4), y3);
-                     }
-                 }
-             }
-             else if( _ksize == 5 )
-             {
-                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     return 0;
-                 else
-                 {
-                     int32x4_t k32 = vdupq_n_s32(0);
-                     k32 = vld1q_lane_s32(kx, k32, 0);
-                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                     k32 = vld1q_lane_s32(kx + 2, k32, 2);
-                     int16x4_t k = vqmovn_s32(k32);
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1, x2, x3, x4;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src) );
-                         x2 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0, y1;
-                         int32x4_t accl, acch;
-                         y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z));
-                         y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
-                         accl = vmull_lane_s16(vget_low_s16(y0), k, 0);
-                         accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1);
-                         acch = vmull_lane_s16(vget_high_s16(y0), k, 0);
-                         acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1);
-                         int16x8_t y2;
-                         x3 = vld1_u8( (uint8_t *) (src - cn*2) );
-                         x4 = vld1_u8( (uint8_t *) (src + cn*2) );
-                         y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4));
-                         accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2);
-                         acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2);
-                         vst1q_s32((int32_t *)(dst + i), accl);
-                         vst1q_s32((int32_t *)(dst + i + 4), acch);
-                     }
-                 }
-             }
-         }
-         else
-         {
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 0 && kx[1] == 1 )
-                 {
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0;
-                         y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                                 vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                         vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0)));
-                         vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0)));
-                     }
-                 }
-                 else
-                 {
-                     int32x4_t k32 = vdupq_n_s32(0);
-                     k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                     int16x4_t k = vqmovn_s32(k32);
-                     uint8x8_t z = vdup_n_u8(0);
-                     for( ; i <= width - 8; i += 8, src += 8 )
-                     {
-                         uint8x8_t x0, x1;
-                         x0 = vld1_u8( (uint8_t *) (src - cn) );
-                         x1 = vld1_u8( (uint8_t *) (src + cn) );
-                         int16x8_t y0;
-                         int32x4_t y1, y2;
-                         y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                             vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                         y1 = vmull_lane_s16(vget_low_s16(y0), k, 1);
-                         y2 = vmull_lane_s16(vget_high_s16(y0), k, 1);
-                         vst1q_s32((int32_t *)(dst + i), y1);
-                         vst1q_s32((int32_t *)(dst + i + 4), y2);
-                     }
-                 }
-             }
-             else if( _ksize == 5 )
-             {
-                 int32x4_t k32 = vdupq_n_s32(0);
-                 k32 = vld1q_lane_s32(kx + 1, k32, 1);
-                 k32 = vld1q_lane_s32(kx + 2, k32, 2);
-                 int16x4_t k = vqmovn_s32(k32);
-                 uint8x8_t z = vdup_n_u8(0);
-                 for( ; i <= width - 8; i += 8, src += 8 )
-                 {
-                     uint8x8_t x0, x1;
-                     x0 = vld1_u8( (uint8_t *) (src - cn) );
-                     x1 = vld1_u8( (uint8_t *) (src + cn) );
-                     int32x4_t accl, acch;
-                     int16x8_t y0;
-                     y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)),
-                         vreinterpretq_s16_u16(vaddl_u8(x0, z)));
-                     accl = vmull_lane_s16(vget_low_s16(y0), k, 1);
-                     acch = vmull_lane_s16(vget_high_s16(y0), k, 1);
-                     uint8x8_t x2, x3;
-                     x2 = vld1_u8( (uint8_t *) (src - cn*2) );
-                     x3 = vld1_u8( (uint8_t *) (src + cn*2) );
-                     int16x8_t y1;
-                     y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)),
-                         vreinterpretq_s16_u16(vaddl_u8(x2, z)));
-                     accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2);
-                     acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2);
-                     vst1q_s32((int32_t *)(dst + i), accl);
-                     vst1q_s32((int32_t *)(dst + i + 4), acch);
-                 }
-             }
-         }
-         return i;
-     }
-     Mat kernel;
-     int symmetryType;
-     bool smallValues;
- };
- struct SymmColumnVec_32s8u
- {
-     SymmColumnVec_32s8u() { symmetryType=0; }
-     SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
-     {
-         symmetryType = _symmetryType;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
-         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-     }
-     int operator()(const uchar** _src, uchar* dst, int width) const
-     {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
-         int _ksize = kernel.rows + kernel.cols - 1;
-         int ksize2 = _ksize / 2;
-         const float* ky = kernel.ptr<float>() + ksize2;
-         int i = 0, k;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const int** src = (const int**)_src;
-         const int *S, *S2;
-         float32x4_t d4 = vdupq_n_f32(delta);
-         if( symmetrical )
-         {
-             if( _ksize == 1 )
-                 return 0;
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky, k32, 0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
-             {
-                 float32x4_t accl, acch;
-                 float32x4_t f0l, f0h, f1l, f1h, f2l, f2h;
-                 S = src[0] + i;
-                 f0l = vcvtq_f32_s32( vld1q_s32(S) );
-                 f0h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 f1l = vcvtq_f32_s32( vld1q_s32(S) );
-                 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                 f2l = vcvtq_f32_s32( vld1q_s32(S2) );
-                 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, f0l, k32, 0);
-                 acch = vmlaq_lane_f32(acch, f0h, k32, 0);
-                 accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t f3l, f3h, f4l, f4h;
-                     f3l = vcvtq_f32_s32( vld1q_s32(S) );
-                     f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                     f4l = vcvtq_f32_s32( vld1q_s32(S2) );
-                     f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                     accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 uint8x8_t u8;
-                 u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
-                 vst1_u8((uint8_t *)(dst + i), u8);
-             }
-         }
-         else
-         {
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
-             {
-                 float32x4_t accl, acch;
-                 float32x4_t f1l, f1h, f2l, f2h;
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 f1l = vcvtq_f32_s32( vld1q_s32(S) );
-                 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                 f2l = vcvtq_f32_s32( vld1q_s32(S2) );
-                 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t f3l, f3h, f4l, f4h;
-                     f3l = vcvtq_f32_s32( vld1q_s32(S) );
-                     f3h = vcvtq_f32_s32( vld1q_s32(S + 4) );
-                     f4l = vcvtq_f32_s32( vld1q_s32(S2) );
-                     f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) );
-                     accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 uint8x8_t u8;
-                 u8 =  vqmovun_s16(vcombine_s16(s16l, s16h));
-                 vst1_u8((uint8_t *)(dst + i), u8);
+         else
+         {
+ #if CV_TRY_AVX2
+             if (haveAVX2)
+                 return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
+ #endif
+             const v_float32 d4 = vx_setall_f32(delta);
+             for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+             {
+                 v_float32 s0 = d4;
+                 for( k = 1; k <= ksize2; k++ )
+                     s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
+                 v_store(dst + i, s0);
              }
          }
  
      int symmetryType;
      float delta;
      Mat kernel;
+     bool haveAVX2;
  };
  
  
- struct SymmColumnSmallVec_32s16s
+ struct SymmColumnSmallVec_32f
  {
-     SymmColumnSmallVec_32s16s() { symmetryType=0; }
-     SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta)
+     SymmColumnSmallVec_32f() { symmetryType=0; delta = 0; }
+     SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta)
      {
          symmetryType = _symmetryType;
-         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
-         delta = (float)(_delta/(1 << _bits));
+         kernel = _kernel;
+         delta = (float)_delta;
          CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
      }
  
      int operator()(const uchar** _src, uchar* _dst, int width) const
      {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
          int ksize2 = (kernel.rows + kernel.cols - 1)/2;
          const float* ky = kernel.ptr<float>() + ksize2;
          int i = 0;
          bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const int** src = (const int**)_src;
-         const int *S0 = src[-1], *S1 = src[0], *S2 = src[1];
-         short* dst = (short*)_dst;
-         float32x4_t df4 = vdupq_n_f32(delta);
-         int32x4_t d4 = vcvtq_s32_f32(df4);
+         const float** src = (const float**)_src;
+         const float *S0 = src[-1], *S1 = src[0], *S2 = src[1];
+         float* dst = (float*)_dst;
  
+         v_float32 d4 = vx_setall_f32(delta);
          if( symmetrical )
          {
-             if( ky[0] == 2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     int32x4_t y0, y1, y2, y3;
-                     y0 = vaddq_s32(x0, x2);
-                     y1 = vqshlq_n_s32(x1, 1);
-                     y2 = vaddq_s32(y0, y1);
-                     y3 = vaddq_s32(y2, d4);
-                     int16x4_t t;
-                     t = vqmovn_s32(y3);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
-             }
-             else if( ky[0] == -2 && ky[1] == 1 )
-             {
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     int32x4_t y0, y1, y2, y3;
-                     y0 = vaddq_s32(x0, x2);
-                     y1 = vqshlq_n_s32(x1, 1);
-                     y2 = vsubq_s32(y0, y1);
-                     y3 = vaddq_s32(y2, d4);
-                     int16x4_t t;
-                     t = vqmovn_s32(y3);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
-             }
-             else if( ky[0] == 10 && ky[1] == 3 )
+             if( fabs(ky[0]) == 2 && ky[1] == 1 )
              {
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2, x3;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     x3 = vaddq_s32(x0, x2);
-                     int32x4_t y0;
-                     y0 = vmlaq_n_s32(d4, x1, 10);
-                     y0 = vmlaq_n_s32(y0, x3, 3);
-                     int16x4_t t;
-                     t = vqmovn_s32(y0);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
+                 v_float32 k0 = vx_setall_f32(ky[0]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4));
              }
              else
              {
-                 float32x2_t k32 = vdup_n_f32(0);
-                 k32 = vld1_lane_f32(ky, k32, 0);
-                 k32 = vld1_lane_f32(ky + 1, k32, 1);
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2, x3, x4;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S1 + i));
-                     x2 = vld1q_s32((int32_t const *)(S2 + i));
-                     x3 = vaddq_s32(x0, x2);
-                     float32x4_t s0, s1, s2;
-                     s0 = vcvtq_f32_s32(x1);
-                     s1 = vcvtq_f32_s32(x3);
-                     s2 = vmlaq_lane_f32(df4, s0, k32, 0);
-                     s2 = vmlaq_lane_f32(s2, s1, k32, 1);
-                     x4 = vcvtq_s32_f32(s2);
-                     int16x4_t x5;
-                     x5 = vqmovn_s32(x4);
-                     vst1_s16((int16_t *)(dst + i), x5);
-                 }
+                 v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
              }
          }
          else
              {
                  if( ky[1] < 0 )
                      std::swap(S0, S2);
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S2 + i));
-                     int32x4_t y0, y1;
-                     y0 = vsubq_s32(x1, x0);
-                     y1 = vqaddq_s32(y0, d4);
-                     int16x4_t t;
-                     t = vqmovn_s32(y1);
-                     vst1_s16((int16_t *)(dst + i), t);
-                 }
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
              }
              else
              {
-                 float32x2_t k32 = vdup_n_f32(0);
-                 k32 = vld1_lane_f32(ky + 1, k32, 1);
-                 for( ; i <= width - 4; i += 4 )
-                 {
-                     int32x4_t x0, x1, x2, x3;
-                     x0 = vld1q_s32((int32_t const *)(S0 + i));
-                     x1 = vld1q_s32((int32_t const *)(S2 + i));
-                     x2 = vsubq_s32(x1, x0);
-                     float32x4_t s0, s1;
-                     s0 = vcvtq_f32_s32(x2);
-                     s1 = vmlaq_lane_f32(df4, s0, k32, 1);
-                     x3 = vcvtq_s32_f32(s1);
-                     int16x4_t x4;
-                     x4 = vqmovn_s32(x3);
-                     vst1_s16((int16_t *)(dst + i), x4);
-                 }
+                 v_float32 k1 = vx_setall_f32(ky[1]);
+                 for ( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+                     v_store(dst + i, v_muladd(vx_load(S2 + i) - vx_load(S0 + i), k1, d4));
              }
          }
  
  };
  
  
- struct SymmColumnVec_32f16s
+ /////////////////////////////// non-separable filters ///////////////////////////////
+ ///////////////////////////////// 8u<->8u, 8u<->16s /////////////////////////////////
+ struct FilterVec_8u
  {
-     SymmColumnVec_32f16s() { symmetryType=0; }
-     SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta)
+     FilterVec_8u() { delta = 0; _nz = 0; }
+     FilterVec_8u(const Mat& _kernel, int _bits, double _delta)
      {
-         symmetryType = _symmetryType;
-         kernel = _kernel;
-         delta = (float)_delta;
-         CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 );
-          neon_supported = checkHardwareSupport(CV_CPU_NEON);
+         Mat kernel;
+         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+         delta = (float)(_delta/(1 << _bits));
+         std::vector<Point> coords;
+         preprocess2DKernel(kernel, coords, coeffs);
+         _nz = (int)coords.size();
      }
  
-     int operator()(const uchar** _src, uchar* _dst, int width) const
+     int operator()(const uchar** src, uchar* dst, int width) const
      {
-          if( !neon_supported )
-              return 0;
-         int _ksize = kernel.rows + kernel.cols - 1;
-         int ksize2 = _ksize / 2;
-         const float* ky = kernel.ptr<float>() + ksize2;
-         int i = 0, k;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const float** src = (const float**)_src;
-         const float *S, *S2;
-         short* dst = (short*)_dst;
-         float32x4_t d4 = vdupq_n_f32(delta);
+         const float* kf = (const float*)&coeffs[0];
+         int i = 0, k, nz = _nz;
  
-         if( symmetrical )
+         v_float32 d4 = vx_setall_f32(delta);
+         for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
          {
-             if( _ksize == 1 )
-                 return 0;
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky, k32, 0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
+             v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 float32x4_t x0l, x0h, x1l, x1h, x2l, x2h;
-                 float32x4_t accl, acch;
-                 S = src[0] + i;
-                 x0l = vld1q_f32(S);
-                 x0h = vld1q_f32(S + 4);
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 x1l = vld1q_f32(S);
-                 x1h = vld1q_f32(S + 4);
-                 x2l = vld1q_f32(S2);
-                 x2h = vld1q_f32(S2 + 4);
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, x0l, k32, 0);
-                 acch = vmlaq_lane_f32(acch, x0h, k32, 0);
-                 accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t x3l, x3h, x4l, x4h;
-                     x3l = vld1q_f32(S);
-                     x3h = vld1q_f32(S + 4);
-                     x4l = vld1q_f32(S2);
-                     x4h = vld1q_f32(S2 + 4);
-                     accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 vst1_s16((int16_t *)(dst + i), s16l);
-                 vst1_s16((int16_t *)(dst + i + 4), s16h);
-             }
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint16 xl, xh;
+                 v_expand(vx_load(src[k] + i), xl, xh);
+                 v_uint32 x0, x1, x2, x3;
+                 v_expand(xl, x0, x1);
+                 v_expand(xh, x2, x3);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
+                 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f, s2);
+                 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f, s3);
+             }
+             v_store(dst + i, v_pack_u(v_pack(v_round(s0), v_round(s1)), v_pack(v_round(s2), v_round(s3))));
          }
-         else
+         if( i <= width - v_uint16::nlanes )
          {
-             float32x2_t k32;
-             k32 = vdup_n_f32(0);
-             k32 = vld1_lane_f32(ky + 1, k32, 1);
-             for( ; i <= width - 8; i += 8 )
+             v_float32 s0 = d4, s1 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 float32x4_t x1l, x1h, x2l, x2h;
-                 float32x4_t accl, acch;
-                 S = src[1] + i;
-                 S2 = src[-1] + i;
-                 x1l = vld1q_f32(S);
-                 x1h = vld1q_f32(S + 4);
-                 x2l = vld1q_f32(S2);
-                 x2h = vld1q_f32(S2 + 4);
-                 accl = acch = d4;
-                 accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1);
-                 acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1);
-                 for( k = 2; k <= ksize2; k++ )
-                 {
-                     S = src[k] + i;
-                     S2 = src[-k] + i;
-                     float32x4_t x3l, x3h, x4l, x4h;
-                     x3l = vld1q_f32(S);
-                     x3h = vld1q_f32(S + 4);
-                     x4l = vld1q_f32(S2);
-                     x4h = vld1q_f32(S2 + 4);
-                     accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]);
-                     acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]);
-                 }
-                 int32x4_t s32l, s32h;
-                 s32l = vcvtq_s32_f32(accl);
-                 s32h = vcvtq_s32_f32(acch);
-                 int16x4_t s16l, s16h;
-                 s16l = vqmovn_s32(s32l);
-                 s16h = vqmovn_s32(s32h);
-                 vst1_s16((int16_t *)(dst + i), s16l);
-                 vst1_s16((int16_t *)(dst + i + 4), s16h);
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint32 x0, x1;
+                 v_expand(vx_load_expand(src[k] + i), x0, x1);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
              }
+             v_pack_u_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+             i += v_uint16::nlanes;
+         }
+ #if CV_SIMD_WIDTH > 16
+         while( i <= width - v_int32x4::nlanes )
+ #else
+         if( i <= width - v_int32x4::nlanes )
+ #endif
+         {
+             v_float32x4 s0 = v_setall_f32(delta);
+             for( k = 0; k < nz; k++ )
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0);
+             v_int32x4 s32 = v_round(s0);
+             v_int16x8 s16 = v_pack(s32, s32);
+             *(int*)(dst + i) = v_reinterpret_as_s32(v_pack_u(s16, s16)).get0();
+             i += v_int32x4::nlanes;
          }
  
          return i;
      }
  
-     int symmetryType;
+     int _nz;
+     std::vector<uchar> coeffs;
      float delta;
-     Mat kernel;
-     bool neon_supported;
  };
  
  
- struct SymmRowSmallVec_32f
+ struct FilterVec_8u16s
  {
-     SymmRowSmallVec_32f() {}
-     SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType )
+     FilterVec_8u16s() { delta = 0; _nz = 0; }
+     FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta)
      {
-         kernel = _kernel;
-         symmetryType = _symmetryType;
+         Mat kernel;
+         _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0);
+         delta = (float)(_delta/(1 << _bits));
+         std::vector<Point> coords;
+         preprocess2DKernel(kernel, coords, coeffs);
+         _nz = (int)coords.size();
      }
  
-     int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
+     int operator()(const uchar** src, uchar* _dst, int width) const
      {
-          if( !checkHardwareSupport(CV_CPU_NEON) )
-              return 0;
-         int i = 0, _ksize = kernel.rows + kernel.cols - 1;
-         float* dst = (float*)_dst;
-         const float* src = (const float*)_src + (_ksize/2)*cn;
-         bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
-         const float* kx = kernel.ptr<float>() + _ksize/2;
-         width *= cn;
+         const float* kf = (const float*)&coeffs[0];
+         short* dst = (short*)_dst;
+         int i = 0, k, nz = _nz;
  
-         if( symmetrical )
+         v_float32 d4 = vx_setall_f32(delta);
+         for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
          {
-             if( _ksize == 1 )
-                 return 0;
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 2 && kx[1] == 1 )
-                     return 0;
-                 else if( kx[0] == -2 && kx[1] == 1 )
-                     return 0;
-                 else
-                 {
-                     return 0;
-                 }
-             }
-             else if( _ksize == 5 )
+             v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
-                     return 0;
-                 else
-                 {
-                     float32x2_t k0, k1;
-                     k0 = k1 = vdup_n_f32(0);
-                     k0 = vld1_lane_f32(kx + 0, k0, 0);
-                     k0 = vld1_lane_f32(kx + 1, k0, 1);
-                     k1 = vld1_lane_f32(kx + 2, k1, 0);
-                     for( ; i <= width - 4; i += 4, src += 4 )
-                     {
-                         float32x4_t x0, x1, x2, x3, x4;
-                         x0 = vld1q_f32(src);
-                         x1 = vld1q_f32(src - cn);
-                         x2 = vld1q_f32(src + cn);
-                         x3 = vld1q_f32(src - cn*2);
-                         x4 = vld1q_f32(src + cn*2);
-                         float32x4_t y0;
-                         y0 = vmulq_lane_f32(x0, k0, 0);
-                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1);
-                         y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0);
-                         vst1q_f32(dst + i, y0);
-                     }
-                 }
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint16 xl, xh;
+                 v_expand(vx_load(src[k] + i), xl, xh);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1);
+                 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f, s2);
+                 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f, s3);
              }
+             v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+             v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
          }
-         else
+         if( i <= width - v_uint16::nlanes )
          {
-             if( _ksize == 3 )
-             {
-                 if( kx[0] == 0 && kx[1] == 1 )
-                     return 0;
-                 else
-                 {
-                     return 0;
-                 }
-             }
-             else if( _ksize == 5 )
+             v_float32 s0 = d4, s1 = d4;
+             for( k = 0; k < nz; k++ )
              {
-                 float32x2_t k;
-                 k = vdup_n_f32(0);
-                 k = vld1_lane_f32(kx + 1, k, 0);
-                 k = vld1_lane_f32(kx + 2, k, 1);
-                 for( ; i <= width - 4; i += 4, src += 4 )
-                 {
-                     float32x4_t x0, x1, x2, x3;
-                     x0 = vld1q_f32(src - cn);
-                     x1 = vld1q_f32(src + cn);
-                     x2 = vld1q_f32(src - cn*2);
-                     x3 = vld1q_f32(src + cn*2);
-                     float32x4_t y0;
-                     y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0);
-                     y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1);
-                     vst1q_f32(dst + i, y0);
-                 }
+                 v_float32 f = vx_setall_f32(kf[k]);
+                 v_uint16 x = vx_load_expand(src[k] + i);
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0);
+                 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1);
              }
+             v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
+             i += v_uint16::nlanes;
+         }
+         if( i <= width - v_int32::nlanes )
+         {
+             v_float32 s0 = d4;
+             for( k = 0; k < nz; k++ )
+                 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
+             v_pack_store(dst + i, v_round(s0));
+             i += v_int32::nlanes;
          }
  
          return i;
      }
  
-     Mat kernel;
-     int symmetryType;
+     int _nz;
+     std::vector<uchar> coeffs;
+     float delta;
  };
  
  
- typedef RowNoVec RowVec_8u32s;
- typedef RowNoVec RowVec_16s32f;
- typedef RowNoVec RowVec_32f;
- typedef ColumnNoVec SymmColumnVec_32f;
- typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f;
- typedef FilterNoVec FilterVec_8u;
- typedef FilterNoVec FilterVec_8u16s;
- typedef FilterNoVec FilterVec_32f;
+ struct FilterVec_32f
+ {
+     FilterVec_32f() { delta = 0; _nz = 0; }
+     FilterVec_32f(const Mat& _kernel, int, double _delta)
+     {
+         delta = (float)_delta;
+         std::vector<Point> coords;
+         preprocess2DKernel(_kernel, coords, coeffs);
+         _nz = (int)coords.size();
+     }
+     int operator()(const uchar** _src, uchar* _dst, int width) const
+     {
+         const float* kf = (const float*)&coeffs[0];
+         const float** src = (const float**)_src;
+         float* dst = (float*)_dst;
+         int i = 0, k, nz = _nz;
+         v_float32 d4 = vx_setall_f32(delta);
+         for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
+         {
+             v_float32 s0 = d4;
+             for( k = 0; k < nz; k++ )
+                 s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0);
+             v_store(dst + i, s0);
+         }
+         return i;
+     }
  
+     int _nz;
+     std::vector<uchar> coeffs;
+     float delta;
+ };
  
  #else
  
@@@ -4557,7 -3503,7 +3503,7 @@@ static bool replacementFilter2D(int sty
      return success;
  }
  
 -#ifdef HAVE_IPP
 +#if 0 //defined HAVE_IPP
  static bool ippFilter2D(int stype, int dtype, int kernel_type,
                uchar * src_data, size_t src_step,
                uchar * dst_data, size_t dst_step,
@@@ -4655,15 -3601,9 +3601,9 @@@ static bool dftFilter2D(int stype, int 
                          double delta, int borderType)
  {
      {
- #if CV_SSE2
          int sdepth = CV_MAT_DEPTH(stype);
          int ddepth = CV_MAT_DEPTH(dtype);
-         int dft_filter_size = ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3) ? 130 : 50;
- #else
-         CV_UNUSED(stype);
-         CV_UNUSED(dtype);
-         int dft_filter_size = 50;
- #endif
+         int dft_filter_size = checkHardwareSupport(CV_CPU_SSE3) && ((sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || (sdepth == CV_32F && ddepth == CV_32F)) ? 130 : 50;
          if (kernel_width * kernel_height < dft_filter_size)
              return false;
      }
@@@ -4821,7 -3761,7 +3761,7 @@@ void filter2D(int stype, int dtype, in
      if (res)
          return;
  
 -    CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type,
 +    /*CV_IPP_RUN_FAST(ippFilter2D(stype, dtype, kernel_type,
                                src_data, src_step,
                                dst_data, dst_step,
                                width, height,
                                kernel_data, kernel_step,
                                kernel_width, kernel_height,
                                anchor_x, anchor_y,
 -                              delta, borderType, isSubmatrix))
 +                              delta, borderType, isSubmatrix))*/
  
      res = dftFilter2D(stype, dtype, kernel_type,
                        src_data, src_step,
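
The filter.cpp hunks above replace hand-written SSE and NEON kernels with OpenCV's width-agnostic
universal intrinsics (v_float32, vx_load, v_muladd, v_store). A minimal sketch of that
load / multiply-accumulate / store pattern, assuming <opencv2/core/hal/intrin.hpp> and a
CV_SIMD-enabled build; the helper name weightedSum_32f is hypothetical and not part of the patch:

#include <opencv2/core/hal/intrin.hpp>

// Hypothetical helper: dst[i] = (a[i] + b[i]) * w + bias for i in [0, width).
static void weightedSum_32f(const float* a, const float* b, float w, float bias,
                            float* dst, int width)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    const v_float32 vw = vx_setall_f32(w), vb = vx_setall_f32(bias);
    // One iteration covers v_float32::nlanes elements (4 with SSE/NEON, 8 with AVX2),
    // so the same source serves every enabled instruction set.
    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
        v_store(dst + i, v_muladd(vx_load(a + i) + vx_load(b + i), vw, vb));
    vx_cleanup();
#endif
    for( ; i < width; i++ )   // scalar tail for the leftover elements
        dst[i] = (a[i] + b[i]) * w + bias;
}
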
@@@ -282,10 -282,10 +282,10 @@@ medianBlur_8u_O1( const Mat& _src, Mat
                          for ( ; luc[c][k] < j+r+1; ++luc[c][k] )
                          {
  #if CV_SIMD256
-                             v_fine += v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                             v_fine = v_fine + v256_load(px + 16 * MIN(luc[c][k], n - 1)) - v256_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
  #elif CV_SIMD128
-                             v_finel += v_load(px + 16 * MIN(luc[c][k], n - 1)    ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
-                             v_fineh += v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
+                             v_finel = v_finel + v_load(px + 16 * MIN(luc[c][k], n - 1)    ) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0));
+                             v_fineh = v_fineh + v_load(px + 16 * MIN(luc[c][k], n - 1) + 8) - v_load(px + 16 * MAX(luc[c][k] - 2 * r - 1, 0) + 8);
  #else
                              for (int ind = 0; ind < 16; ++ind)
                                  H[c].fine[k][ind] += px[16 * MIN(luc[c][k], n - 1) + ind] - px[16 * MAX(luc[c][k] - 2 * r - 1, 0) + ind];
                      CV_Assert( b < 16 );
                  }
              }
+         }
  #if CV_SIMD
-             vx_cleanup();
+         vx_cleanup();
  #endif
-         }
      }
  
  #undef HOP
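
The medianBlur_8u_O1 hunk above keeps three code paths for the 16-bin fine-histogram update:
256-bit universal intrinsics, two 128-bit halves, and a scalar fallback. A sketch of that
compile-time dispatch under the same assumptions (ushort bins, 16 bins per row); the helper
addFineRow below is illustrative only:

#include <opencv2/core/hal/intrin.hpp>

// Illustrative helper: acc[0..15] += px[0..15], picking the widest available path.
static void addFineRow(unsigned short* acc, const unsigned short* px)
{
    using namespace cv;
#if CV_SIMD256
    v_store(acc, v256_load(acc) + v256_load(px));            // one 16-lane ushort register
#elif CV_SIMD128
    v_store(acc,     v_load(acc)     + v_load(px));          // two 8-lane halves
    v_store(acc + 8, v_load(acc + 8) + v_load(px + 8));
#else
    for( int ind = 0; ind < 16; ++ind )                      // plain scalar fallback
        acc[ind] += px[ind];
#endif
}
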
@@@ -1080,7 -1080,7 +1080,7 @@@ static bool openvx_medianFilter(InputAr
  }
  #endif
  
 -#ifdef HAVE_IPP
 +#if 0 //defined HAVE_IPP
  static bool ipp_medianFilter(Mat &src0, Mat &dst, int ksize)
  {
      CV_INSTRUMENT_REGION_IPP();
@@@ -1179,7 -1179,7 +1179,7 @@@ void medianBlur( InputArray _src0, Outp
      CV_OVX_RUN(true,
                 openvx_medianFilter(_src0, _dst, ksize))
  
 -    CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize));
 +    //CV_IPP_RUN_FAST(ipp_medianFilter(src0, dst, ksize));
  
  #ifdef HAVE_TEGRA_OPTIMIZATION
      if (tegra::useTegra() && tegra::medianBlur(src0, dst, ksize))
@@@ -50,7 -50,7 +50,7 @@@ public
  
  protected:
      int prepare_test_case( int test_case_idx );
 -    int read_params( CvFileStorage* fs );
 +    int read_params( const cv::FileStorage& fs );
      void get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types );
      void get_minmax_bounds( int i, int j, int type, Scalar& low, Scalar& high );
      Size aperture_size;
@@@ -76,13 -76,13 +76,13 @@@ CV_FilterBaseTest::CV_FilterBaseTest( b
  }
  
  
 -int CV_FilterBaseTest::read_params( CvFileStorage* fs )
 +int CV_FilterBaseTest::read_params( const cv::FileStorage& fs )
  {
      int code = cvtest::ArrayTest::read_params( fs );
      if( code < 0 )
          return code;
  
 -    max_aperture_size = cvReadInt( find_param( fs, "max_aperture_size" ), max_aperture_size );
 +    read( find_param( fs, "max_aperture_size" ), max_aperture_size, max_aperture_size );
      max_aperture_size = cvtest::clipInt( max_aperture_size, 1, 100 );
  
      return code;
@@@ -1265,7 -1265,7 +1265,7 @@@ public
      CV_FeatureSelBaseTest( int width_factor );
  
  protected:
 -    int read_params( CvFileStorage* fs );
 +    int read_params( const FileStorage& fs );
      void get_test_array_types_and_sizes( int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types );
      void get_minmax_bounds( int i, int j, int type, Scalar& low, Scalar& high );
      double get_success_error_level( int test_case_idx, int i, int j );
@@@ -1289,15 -1289,15 +1289,15 @@@ CV_FeatureSelBaseTest::CV_FeatureSelBas
  }
  
  
 -int CV_FeatureSelBaseTest::read_params( CvFileStorage* fs )
 +int CV_FeatureSelBaseTest::read_params( const cv::FileStorage& fs )
  {
      int code = cvtest::BaseTest::read_params( fs );
      if( code < 0 )
          return code;
  
 -    max_aperture_size = cvReadInt( find_param( fs, "max_aperture_size" ), max_aperture_size );
 +    read( find_param( fs, "max_aperture_size" ), max_aperture_size, max_aperture_size );
      max_aperture_size = cvtest::clipInt( max_aperture_size, 1, 9 );
 -    max_block_size = cvReadInt( find_param( fs, "max_block_size" ), max_block_size );
 +    read( find_param( fs, "max_block_size" ), max_block_size, max_block_size );
      max_block_size = cvtest::clipInt( max_aperture_size, 1, 100 );
  
      return code;
@@@ -2200,4 -2200,15 +2200,15 @@@ TEST(Imgproc_Filter2D, dftFilter2d_regr
  
      EXPECT_LE(cvtest::norm(dst, expected, NORM_INF), 2);
  }
+ TEST(Imgproc_MedianBlur, hires_regression_13409)
+ {
+     Mat src(2048, 2048, CV_8UC1), dst_hires, dst_ref;
+     randu(src, 0, 256);
+     medianBlur(src, dst_hires, 9);
+     medianBlur(src(Rect(512, 512, 1024, 1024)), dst_ref, 9);
+     ASSERT_EQ(0.0, cvtest::norm(dst_hires(Rect(516, 516, 1016, 1016)), dst_ref(Rect(4, 4, 1016, 1016)), NORM_INF));
+ }
  }} // namespace
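
The test_filter.cpp hunks above migrate read_params() from the legacy CvFileStorage* / cvReadInt
API to const cv::FileStorage&. A minimal sketch of reading an optional integer with the C++
persistence API (the tests themselves go through cvtest's find_param(); the parameter name and
helper below are placeholders):

#include <opencv2/core.hpp>

// Hypothetical helper: fetch an optional integer setting, keeping the default
// when the node is missing from the storage.
static int readIntParam(const cv::FileStorage& fs, const std::string& name, int defaultValue)
{
    int value = defaultValue;
    cv::read(fs[name], value, defaultValue);   // an empty FileNode leaves the default in place
    return value;
}

// usage: int max_aperture_size = readIntParam(fs, "max_aperture_size", 5);
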
@@@ -782,6 -782,9 +782,9 @@@ bool QRCodeDetector::detect(InputArray 
      Mat inarr = in.getMat();
      CV_Assert(!inarr.empty());
      CV_Assert(inarr.depth() == CV_8U);
+     if (inarr.cols <= 20 || inarr.rows <= 20)
+         return false;  // image data is insufficient for a reliable result
      int incn = inarr.channels();
      if( incn == 3 || incn == 4 )
      {
      return true;
  }
  
 -bool detectQRCode(InputArray in, vector<Point> &points, double eps_x, double eps_y)
 -{
 -    QRCodeDetector qrdetector;
 -    qrdetector.setEpsX(eps_x);
 -    qrdetector.setEpsY(eps_y);
 -
 -    return qrdetector.detect(in, points);
 -}
 -
  class QRDecode
  {
  public:
@@@ -1048,12 -1060,21 +1051,14 @@@ bool QRDecode::fullDecodingProcess(
  #endif
  }
  
 -bool decodeQRCode(InputArray in, InputArray points, std::string &decoded_info, OutputArray straight_qrcode)
 -{
 -    QRCodeDetector qrcode;
 -    decoded_info = qrcode.decode(in, points, straight_qrcode);
 -    return !decoded_info.empty();
 -}
 -
 -cv::String QRCodeDetector::decode(InputArray in, InputArray points,
 -                                  OutputArray straight_qrcode)
 +std::string QRCodeDetector::decode(InputArray in, InputArray points,
 +                                   OutputArray straight_qrcode)
  {
      Mat inarr = in.getMat();
      CV_Assert(!inarr.empty());
      CV_Assert(inarr.depth() == CV_8U);
+     if (inarr.cols <= 20 || inarr.rows <= 20)
+         return cv::String();  // image data is insufficient for a reliable result
  
      int incn = inarr.channels();
      if( incn == 3 || incn == 4 )
      return ok ? decoded_info : std::string();
  }
  
 -cv::String QRCodeDetector::detectAndDecode(InputArray in,
 -                                           OutputArray points_,
 -                                           OutputArray straight_qrcode)
 +std::string QRCodeDetector::detectAndDecode(InputArray in,
 +                                            OutputArray points_,
 +                                            OutputArray straight_qrcode)
  {
      Mat inarr = in.getMat();
      CV_Assert(!inarr.empty());
      CV_Assert(inarr.depth() == CV_8U);
+     if (inarr.cols <= 20 || inarr.rows <= 20)
+         return cv::String();  // image data is insufficient for a reliable result
  
      int incn = inarr.channels();
      if( incn == 3 || incn == 4 )
      return decoded_info;
  }
  
 -
  }
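
The qrcode.cpp changes above reject inputs of 20 pixels or less per side and switch the decode
methods to returning std::string. A hedged usage sketch of the public QRCodeDetector API with the
same size guard; the input file name is a placeholder:

#include <opencv2/objdetect.hpp>
#include <opencv2/imgcodecs.hpp>
#include <iostream>

int main()
{
    cv::Mat img = cv::imread("qr_sample.png", cv::IMREAD_GRAYSCALE);  // placeholder file
    if (img.empty() || img.cols <= 20 || img.rows <= 20)
        return 1;                        // too little data for a reliable result

    cv::QRCodeDetector detector;
    cv::Mat corners;                     // receives the 4 vertices of the QR quadrangle
    std::string info = detector.detectAndDecode(img, corners);
    if (!info.empty())
        std::cout << "decoded: " << info << std::endl;
    return 0;
}
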
@@@ -237,6 -237,11 +237,11 @@@ make & enjoy
  #include <sys/videoio.h>
  #endif
  
+ // https://github.com/opencv/opencv/issues/13335
+ #ifndef V4L2_CID_ISO_SENSITIVITY
+ #define V4L2_CID_ISO_SENSITIVITY (V4L2_CID_CAMERA_CLASS_BASE+23)
+ #endif
  /* Defaults - If your board can do better, set it here.  Set for the most common input types. */
  #define DEFAULT_V4L_WIDTH  640
  #define DEFAULT_V4L_HEIGHT 480
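
The fallback #define above keeps cap_v4l.cpp compiling against older kernel headers that lack V4L2_CID_ISO_SENSITIVITY (issue 13335). A minimal sketch of driving that control through the generic property API, assuming the V4L2 backend maps CAP_PROP_ISO_SPEED to this control id; whether the driver actually exposes the control is hardware-dependent, so the set() call may legitimately fail:

#include <opencv2/videoio.hpp>
#include <iostream>

int main()
{
    cv::VideoCapture cap(cv::CAP_V4L2);            // camera 0 via the V4L2 backend, as in the tests below
    if (!cap.isOpened())
        return 1;
    std::cout << "ISO property: " << cap.get(cv::CAP_PROP_ISO_SPEED) << std::endl;
    if (!cap.set(cv::CAP_PROP_ISO_SPEED, 400))     // value is in driver units; may be rejected
        std::cout << "ISO control not supported by this driver" << std::endl;
    return 0;
}
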
@@@ -786,7 -791,7 +791,7 @@@ bool CvCaptureCAM_V4L::open(const char
      frame_allocated = false;
      deviceName = _deviceName;
      returnFrame = true;
 -    normalizePropRange = utils::getConfigurationParameterBool("OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED", true);
 +    normalizePropRange = utils::getConfigurationParameterBool("OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED", false);
      channelNumber = -1;
      bufferIndex = -1;
  
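
With the default above flipped to false, V4L2 property get()/set() now works in raw driver units unless OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED is set. A minimal sketch of opting back into the old normalized [0, 1] behaviour, assuming the flag is read from the environment when the capture is opened (POSIX setenv, so Linux only):

#include <opencv2/videoio.hpp>
#include <cstdlib>
#include <iostream>

int main()
{
    // Must be set before the capture is opened; the flag is read in open().
    setenv("OPENCV_VIDEOIO_V4L_RANGE_NORMALIZED", "1", 1);
    cv::VideoCapture cap(cv::CAP_V4L2);
    if (!cap.isOpened())
        return 1;
    // Expected to lie in [0, 1] with normalization enabled; otherwise it is
    // whatever raw range the driver advertises for the control.
    std::cout << "brightness: " << cap.get(cv::CAP_PROP_BRIGHTNESS) << std::endl;
    return 0;
}
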
@@@ -1757,7 -1762,7 +1762,7 @@@ bool CvCaptureCAM_V4L::icvSetFrameSize(
      if (_width > 0)
          width_set = _width;
  
-     if (height > 0)
+     if (_height > 0)
          height_set = _height;
  
      /* two subsequent calls setting WIDTH and HEIGHT will change
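
The hunk above fixes a copy-paste slip (height instead of _height) so a requested height is actually recorded; as the surrounding comment notes, the backend applies WIDTH and HEIGHT as a pair once both are known. A minimal caller-side sketch of that two-call pattern, mirroring the validate_V4L2_FrameSize test below; the helper name is illustrative:

#include <opencv2/videoio.hpp>

// Requests a new frame size and confirms the driver accepted it exactly.
static bool requestSize(cv::VideoCapture& cap, int width, int height)
{
    bool ok = cap.set(cv::CAP_PROP_FRAME_WIDTH, width);
    ok = cap.set(cv::CAP_PROP_FRAME_HEIGHT, height) && ok;
    // The driver may round to the nearest size it supports, so read the values back.
    return ok
        && (int)cap.get(cv::CAP_PROP_FRAME_WIDTH) == width
        && (int)cap.get(cv::CAP_PROP_FRAME_HEIGHT) == height;
}
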
@@@ -11,7 -11,7 +11,7 @@@
  
  namespace opencv_test { namespace {
  
- static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100)
+ static void test_readFrames(/*const*/ VideoCapture& capture, const int N = 100, Mat* lastFrame = NULL)
  {
      Mat frame;
      int64 time0 = cv::getTickCount();
@@@ -26,6 -26,7 +26,7 @@@
      }
      int64 time1 = cv::getTickCount();
      printf("Processed %d frames at %.2f FPS\n", N, (N * cv::getTickFrequency()) / (time1 - time0 + 1));
+     if (lastFrame) *lastFrame = frame.clone();
  }
  
  TEST(DISABLED_VideoIO_Camera, basic)
@@@ -55,18 -56,39 +56,53 @@@ TEST(DISABLED_VideoIO_Camera, validate_
      capture.release();
  }
  
 +// The following test is for a capture device using PhysConn_Video_SerialDigital as the crossbar input pin
 +TEST(DISABLED_VideoIO_Camera, dshow_avermedia_capture)
 +{
 +    VideoCapture capture(0);
 +    ASSERT_TRUE(capture.isOpened());
 +    capture.set(CAP_PROP_CHANNEL, 6);
 +    std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
 +    std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
 +    std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
 +    std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
 +    test_readFrames(capture);
 +    capture.release();
 +}
 +
+ TEST(DISABLED_VideoIO_Camera, validate_V4L2_FrameSize)
+ {
+     VideoCapture capture(CAP_V4L2);
+     ASSERT_TRUE(capture.isOpened());
+     std::cout << "Camera 0 via " << capture.getBackendName() << " backend" << std::endl;
+     std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+     std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+     std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+     int fourcc = (int)capture.get(CAP_PROP_FOURCC);
+     std::cout << "FOURCC code: " << cv::format("0x%08x", fourcc) << std::endl;
+     test_readFrames(capture, 30);
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_WIDTH, 640));
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_HEIGHT, 480));
+     std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+     std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+     std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+     Mat frame640x480;
+     test_readFrames(capture, 30, &frame640x480);
+     EXPECT_EQ(640, frame640x480.cols);
+     EXPECT_EQ(480, frame640x480.rows);
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_WIDTH, 1280));
+     EXPECT_TRUE(capture.set(CAP_PROP_FRAME_HEIGHT, 720));
+     std::cout << "Frame width: " << capture.get(CAP_PROP_FRAME_WIDTH) << std::endl;
+     std::cout << "     height: " << capture.get(CAP_PROP_FRAME_HEIGHT) << std::endl;
+     std::cout << "Capturing FPS: " << capture.get(CAP_PROP_FPS) << std::endl;
+     Mat frame1280x720;
+     test_readFrames(capture, 30, &frame1280x720);
+     EXPECT_EQ(1280, frame1280x720.cols);
+     EXPECT_EQ(720, frame1280x720.rows);
+     capture.release();
+ }
  }} // namespace