Merge pull request #13224 from seiko2plus:core_ppc64le_infa
author    Alexander Alekhin <alexander.a.alekhin@gmail.com>
          Tue, 20 Nov 2018 21:26:05 +0000 (21:26 +0000)
committer Alexander Alekhin <alexander.a.alekhin@gmail.com>
          Tue, 20 Nov 2018 21:26:05 +0000 (21:26 +0000)
20 files changed:
modules/core/include/opencv2/core/hal/intrin_avx.hpp
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
modules/core/include/opencv2/core/types.hpp
modules/core/test/test_operations.cpp
modules/dnn/include/opencv2/dnn/dnn.hpp
modules/dnn/perf/perf_net.cpp
modules/dnn/src/dnn.cpp
modules/dnn/src/layers/blank_layer.cpp
modules/dnn/src/layers/convolution_layer.cpp
modules/dnn/src/op_inf_engine.cpp
modules/dnn/test/test_common.hpp
modules/dnn/test/test_ie_models.cpp
modules/dnn/test/test_layers.cpp
modules/dnn/test/test_misc.cpp
modules/features2d/src/draw.cpp
modules/imgproc/src/bilateral_filter.cpp
samples/dnn/tf_text_graph_mask_rcnn.py

diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp
index f8cc7a4..3037704 100644
@@ -905,6 +905,11 @@ OPENCV_HAL_IMPL_AVX_CMP_OP_64BIT(v_int64x4)
 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float32x8, ps)
 OPENCV_HAL_IMPL_AVX_CMP_OP_FLT(v_float64x4, pd)
 
+inline v_float32x8 v_not_nan(const v_float32x8& a)
+{ return v_float32x8(_mm256_cmp_ps(a.val, a.val, _CMP_ORD_Q)); }
+inline v_float64x4 v_not_nan(const v_float64x4& a)
+{ return v_float64x4(_mm256_cmp_pd(a.val, a.val, _CMP_ORD_Q)); }
+
 /** min/max **/
 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_min, v_uint8x32,  _mm256_min_epu8)
 OPENCV_HAL_IMPL_AVX_BIN_FUNC(v_max, v_uint8x32,  _mm256_max_epu8)
diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp
index 5712f16..1cfb14a 100644
@@ -683,6 +683,25 @@ OPENCV_HAL_IMPL_CMP_OP(==)
 For all types except 64-bit integer values. */
 OPENCV_HAL_IMPL_CMP_OP(!=)
 
+template<int n>
+inline v_reg<float, n> v_not_nan(const v_reg<float, n>& a)
+{
+    typedef typename V_TypeTraits<float>::int_type itype;
+    v_reg<float, n> c;
+    for (int i = 0; i < n; i++)
+        c.s[i] = V_TypeTraits<float>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
+    return c;
+}
+template<int n>
+inline v_reg<double, n> v_not_nan(const v_reg<double, n>& a)
+{
+    typedef typename V_TypeTraits<double>::int_type itype;
+    v_reg<double, n> c;
+    for (int i = 0; i < n; i++)
+        c.s[i] = V_TypeTraits<double>::reinterpret_from_int((itype)-(int)(a.s[i] == a.s[i]));
+    return c;
+}
+
 //! @brief Helper macro
 //! @ingroup core_hal_intrin_impl
 #define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
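Note: v_not_nan(a) builds a per-lane mask that is all ones where the lane
compares equal to itself (i.e. is not NaN) and all zeros otherwise, so NaN
lanes can be stripped with a bitwise AND. A minimal sketch using only the
universal intrinsics touched above (vx_load/v_store and the scalar cvIsNaN
tail are assumed to be available as usual):

    #include <opencv2/core/hal/intrin.hpp>
    using namespace cv;

    static void zero_out_nans(float* data, int n)
    {
        int j = 0;
    #if CV_SIMD
        for (; j <= n - v_float32::nlanes; j += v_float32::nlanes)
        {
            v_float32 val = vx_load(data + j);
            // NaN lanes of the mask are 0, so (val & mask) clears them
            v_store(data + j, val & v_not_nan(val));
        }
    #endif
        for (; j < n; j++)            // scalar tail
            if (cvIsNaN(data[j]))
                data[j] = 0.f;
    }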
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 50c9b15..2de4e45 100644
@@ -764,6 +764,13 @@ OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_int64x2, vreinterpretq_s64_u64, s64, u64)
 OPENCV_HAL_IMPL_NEON_INT_CMP_OP(v_float64x2, vreinterpretq_f64_u64, f64, u64)
 #endif
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vreinterpretq_f32_u32(vceqq_f32(a.val, a.val))); }
+#if CV_SIMD128_64F
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vreinterpretq_f64_u64(vceqq_f64(a.val, a.val))); }
+#endif
+
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_add_wrap, vaddq_u8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_int8x16, v_add_wrap, vaddq_s8)
 OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint16x8, v_add_wrap, vaddq_u16)
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index c49d0de..283c515 100644
@@ -1041,6 +1041,11 @@ inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_uint64x2, v_reinterpret_as_u64)
 OPENCV_HAL_IMPL_SSE_64BIT_CMP_OP(v_int64x2, v_reinterpret_as_s64)
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(_mm_cmpord_ps(a.val, a.val)); }
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(_mm_cmpord_pd(a.val, a.val)); }
+
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp
index b23e199..fe4a5db 100644
@@ -607,6 +607,11 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_float64x2)
 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_uint64x2)
 OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
 
+inline v_float32x4 v_not_nan(const v_float32x4& a)
+{ return v_float32x4(vec_cmpeq(a.val, a.val)); }
+inline v_float64x2 v_not_nan(const v_float64x2& a)
+{ return v_float64x2(vec_cmpeq(a.val, a.val)); }
+
 /** min/max **/
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
 OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
diff --git a/modules/core/include/opencv2/core/types.hpp b/modules/core/include/opencv2/core/types.hpp
index 63232e3..ef9ab59 100644
@@ -1884,8 +1884,11 @@ Rect_<_Tp>& operator += ( Rect_<_Tp>& a, const Size_<_Tp>& b )
 template<typename _Tp> static inline
 Rect_<_Tp>& operator -= ( Rect_<_Tp>& a, const Size_<_Tp>& b )
 {
-    a.width -= b.width;
-    a.height -= b.height;
+    const _Tp width = a.width - b.width;
+    const _Tp height = a.height - b.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    a.width = width;
+    a.height = height;
     return a;
 }
 
@@ -1951,6 +1954,15 @@ Rect_<_Tp> operator + (const Rect_<_Tp>& a, const Size_<_Tp>& b)
 }
 
 template<typename _Tp> static inline
+Rect_<_Tp> operator - (const Rect_<_Tp>& a, const Size_<_Tp>& b)
+{
+    const _Tp width = a.width - b.width;
+    const _Tp height = a.height - b.height;
+    CV_DbgAssert(width >= 0 && height >= 0);
+    return Rect_<_Tp>( a.x, a.y, width, height );
+}
+
+template<typename _Tp> static inline
 Rect_<_Tp> operator & (const Rect_<_Tp>& a, const Rect_<_Tp>& b)
 {
     Rect_<_Tp> c = a;
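The new operator- mirrors the existing operator+ for Rect_/Size_: the origin
is kept and the size shrinks, with a debug-only assertion that the result
stays non-negative. A minimal usage sketch:

    #include <opencv2/core.hpp>

    cv::Rect r(10, 10, 100, 50);
    r -= cv::Size(20, 10);               // r becomes (10, 10, 80, 40)
    cv::Rect r2 = r - cv::Size(30, 15);  // r2 is (10, 10, 50, 25); r is unchanged
    // Debug builds trip CV_DbgAssert(width >= 0 && height >= 0) if the
    // subtracted size exceeds the current one.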
diff --git a/modules/core/test/test_operations.cpp b/modules/core/test/test_operations.cpp
index e72400c..e0a2c99 100644
@@ -972,6 +972,13 @@ bool CV_OperationsTest::operations1()
         if (sz.width != 10 || sz.height != 20) throw test_excep();
         if (cvSize(sz).width != 10 || cvSize(sz).height != 20) throw test_excep();
 
+        Rect r1(0, 0, 10, 20);
+        Size sz1(5, 10);
+        r1 -= sz1;
+        if (r1.size().width != 5 || r1.size().height != 10) throw test_excep();
+        Rect r2 = r1 - sz1;
+        if (r2.size().width != 0 || r2.size().height != 0) throw test_excep();
+
         Vec<double, 5> v5d(1, 1, 1, 1, 1);
         Vec<double, 6> v6d(1, 1, 1, 1, 1, 1);
         Vec<double, 7> v7d(1, 1, 1, 1, 1, 1, 1);
diff --git a/modules/dnn/include/opencv2/dnn/dnn.hpp b/modules/dnn/include/opencv2/dnn/dnn.hpp
index 20a953b..3b26bb2 100644
@@ -88,7 +88,9 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
         DNN_TARGET_CPU,
         DNN_TARGET_OPENCL,
         DNN_TARGET_OPENCL_FP16,
-        DNN_TARGET_MYRIAD
+        DNN_TARGET_MYRIAD,
+        //! FPGA device with CPU fallbacks using Inference Engine's Heterogeneous plugin.
+        DNN_TARGET_FPGA
     };
 
     /** @brief This class provides all data needed to initialize layer.
@@ -501,6 +503,7 @@ CV__DNN_EXPERIMENTAL_NS_BEGIN
          * | DNN_TARGET_OPENCL      |                  + |                            + |                  + |
          * | DNN_TARGET_OPENCL_FP16 |                  + |                            + |                    |
          * | DNN_TARGET_MYRIAD      |                    |                            + |                    |
+         * | DNN_TARGET_FPGA        |                    |                            + |                    |
          */
         CV_WRAP void setPreferableTarget(int targetId);
 
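Selecting the new target follows the usual pattern; as the enum comment says,
it is served by Inference Engine's HETERO plugin with CPU fallback. A sketch
(the model file names are placeholders):

    #include <opencv2/dnn.hpp>

    cv::dnn::Net net = cv::dnn::readNet("model.xml", "model.bin");
    net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
    net.setPreferableTarget(cv::dnn::DNN_TARGET_FPGA);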
diff --git a/modules/dnn/perf/perf_net.cpp b/modules/dnn/perf/perf_net.cpp
index 1647db3..03d7a23 100644
@@ -42,7 +42,7 @@ public:
         }
         if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
         {
-            if (!checkMyriadTarget())
+            if (!checkIETarget(DNN_TARGET_MYRIAD))
             {
                 throw SkipTestException("Myriad is not available/disabled in OpenCV");
             }
diff --git a/modules/dnn/src/dnn.cpp b/modules/dnn/src/dnn.cpp
index 087769a..036726d 100644
@@ -1077,7 +1077,8 @@ struct Net::Impl
                   preferableTarget == DNN_TARGET_CPU ||
                   preferableTarget == DNN_TARGET_OPENCL ||
                   preferableTarget == DNN_TARGET_OPENCL_FP16 ||
-                  preferableTarget == DNN_TARGET_MYRIAD);
+                  preferableTarget == DNN_TARGET_MYRIAD ||
+                  preferableTarget == DNN_TARGET_FPGA);
         if (!netWasAllocated || this->blobsToKeep != blobsToKeep_)
         {
             if (preferableBackend == DNN_BACKEND_OPENCV && IS_DNN_OPENCL_TARGET(preferableTarget))
@@ -1512,7 +1513,9 @@ struct Net::Impl
             ieNode->net = net;
 
             auto weightableLayer = std::dynamic_pointer_cast<InferenceEngine::WeightableLayer>(ieNode->layer);
-            if ((preferableTarget == DNN_TARGET_OPENCL_FP16 || preferableTarget == DNN_TARGET_MYRIAD) && !fused)
+            if ((preferableTarget == DNN_TARGET_OPENCL_FP16 ||
+                 preferableTarget == DNN_TARGET_MYRIAD ||
+                 preferableTarget == DNN_TARGET_FPGA) && !fused)
             {
                 ieNode->layer->precision = InferenceEngine::Precision::FP16;
                 if (weightableLayer)
diff --git a/modules/dnn/src/layers/blank_layer.cpp b/modules/dnn/src/layers/blank_layer.cpp
index 178a2a4..1eb149b 100644
@@ -119,8 +119,8 @@ public:
         lp.precision = InferenceEngine::Precision::FP32;
         std::shared_ptr<InferenceEngine::SplitLayer> ieLayer(new InferenceEngine::SplitLayer(lp));
 #if INF_ENGINE_VER_MAJOR_GT(INF_ENGINE_RELEASE_2018R3)
-        ieLayer->params["axis"] = format("%d", input->dims.size() - 1);
-        ieLayer->params["out_sizes"] = format("%d", input->dims[0]);
+        ieLayer->params["axis"] = format("%d", (int)input->dims.size() - 1);
+        ieLayer->params["out_sizes"] = format("%d", (int)input->dims[0]);
 #endif
         return Ptr<BackendNode>(new InfEngineBackendNode(ieLayer));
 #endif  // HAVE_INF_ENGINE
diff --git a/modules/dnn/src/layers/convolution_layer.cpp b/modules/dnn/src/layers/convolution_layer.cpp
index bc61e39..9daceb5 100644
@@ -219,9 +219,14 @@ public:
 
     virtual bool supportBackend(int backendId) CV_OVERRIDE
     {
+#ifdef HAVE_INF_ENGINE
         if (backendId == DNN_BACKEND_INFERENCE_ENGINE)
-            return preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height;
+        {
+            return INF_ENGINE_VER_MAJOR_GE(INF_ENGINE_RELEASE_2018R4) ||
+                   (preferableTarget != DNN_TARGET_MYRIAD || dilation.width == dilation.height);
+        }
         else
+#endif
             return backendId == DNN_BACKEND_OPENCV || backendId == DNN_BACKEND_HALIDE;
     }
 
diff --git a/modules/dnn/src/op_inf_engine.cpp b/modules/dnn/src/op_inf_engine.cpp
index 4d75f8f..1443690 100644
@@ -302,7 +302,8 @@ void InfEngineBackendNet::setTargetDevice(InferenceEngine::TargetDevice device)
 {
     if (device != InferenceEngine::TargetDevice::eCPU &&
         device != InferenceEngine::TargetDevice::eGPU &&
-        device != InferenceEngine::TargetDevice::eMYRIAD)
+        device != InferenceEngine::TargetDevice::eMYRIAD &&
+        device != InferenceEngine::TargetDevice::eFPGA)
         CV_Error(Error::StsNotImplemented, "");
     targetDevice = device;
 }
@@ -314,7 +315,8 @@ InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() noexcept
 
 InferenceEngine::TargetDevice InfEngineBackendNet::getTargetDevice() const noexcept
 {
-    return targetDevice;
+    return targetDevice == InferenceEngine::TargetDevice::eFPGA ?
+           InferenceEngine::TargetDevice::eHETERO : targetDevice;
 }
 
 InferenceEngine::StatusCode InfEngineBackendNet::setBatchSize(const size_t) noexcept
@@ -466,6 +468,11 @@ void InfEngineBackendNet::init(int targetId)
         setPrecision(InferenceEngine::Precision::FP16);
         setTargetDevice(InferenceEngine::TargetDevice::eMYRIAD); break;
     }
+    case DNN_TARGET_FPGA:
+    {
+        setPrecision(InferenceEngine::Precision::FP16);
+        setTargetDevice(InferenceEngine::TargetDevice::eFPGA); break;
+    }
     default:
         CV_Error(Error::StsError, format("Unknown target identifier: %d", targetId));
     }
@@ -489,10 +496,15 @@ void InfEngineBackendNet::initPlugin(InferenceEngine::ICNNNetwork& net)
         }
         else
         {
-            enginePtr = InferenceEngine::PluginDispatcher({""}).getSuitablePlugin(targetDevice);
+            auto dispatcher = InferenceEngine::PluginDispatcher({""});
+            if (targetDevice == InferenceEngine::TargetDevice::eFPGA)
+                enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
+            else
+                enginePtr = dispatcher.getSuitablePlugin(targetDevice);
             sharedPlugins[targetDevice] = enginePtr;
 
-            if (targetDevice == InferenceEngine::TargetDevice::eCPU)
+            if (targetDevice == InferenceEngine::TargetDevice::eCPU ||
+                targetDevice == InferenceEngine::TargetDevice::eFPGA)
             {
                 std::string suffixes[] = {"_avx2", "_sse4", ""};
                 bool haveFeature[] = {
diff --git a/modules/dnn/test/test_common.hpp b/modules/dnn/test/test_common.hpp
index bee1f22..9e9b25e 100644
@@ -66,6 +66,7 @@ static inline void PrintTo(const cv::dnn::Target& v, std::ostream* os)
     case DNN_TARGET_OPENCL: *os << "OCL"; return;
     case DNN_TARGET_OPENCL_FP16: *os << "OCL_FP16"; return;
     case DNN_TARGET_MYRIAD: *os << "MYRIAD"; return;
+    case DNN_TARGET_FPGA: *os << "FPGA"; return;
     } // don't use "default:" to emit compiler warnings
     *os << "DNN_TARGET_UNKNOWN(" << (int)v << ")";
 }
@@ -188,7 +189,7 @@ static inline void normAssertDetections(cv::Mat ref, cv::Mat out, const char *co
                          testBoxes, comment, confThreshold, scores_diff, boxes_iou_diff);
 }
 
-static inline bool checkMyriadTarget()
+static inline bool checkIETarget(int target)
 {
 #ifndef HAVE_INF_ENGINE
     return false;
@@ -197,7 +198,7 @@ static inline bool checkMyriadTarget()
     cv::dnn::LayerParams lp;
     net.addLayerToPrev("testLayer", "Identity", lp);
     net.setPreferableBackend(cv::dnn::DNN_BACKEND_INFERENCE_ENGINE);
-    net.setPreferableTarget(cv::dnn::DNN_TARGET_MYRIAD);
+    net.setPreferableTarget(target);
     static int inpDims[] = {1, 2, 3, 4};
     net.setInput(cv::Mat(4, &inpDims[0], CV_32FC1, cv::Scalar(0)));
     try
@@ -264,7 +265,7 @@ testing::internal::ParamGenerator<tuple<Backend, Target> > dnnBackendsAndTargets
             targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_OPENCL_FP16));
         }
 #endif
-        if (checkMyriadTarget())
+        if (checkIETarget(DNN_TARGET_MYRIAD))
             targets.push_back(make_tuple(DNN_BACKEND_INFERENCE_ENGINE, DNN_TARGET_MYRIAD));
     }
 #endif
@@ -344,7 +345,7 @@ public:
        }
        if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
        {
-           if (!checkMyriadTarget())
+           if (!checkIETarget(DNN_TARGET_MYRIAD))
            {
                throw SkipTestException("Myriad is not available/disabled in OpenCV");
            }
diff --git a/modules/dnn/test/test_ie_models.cpp b/modules/dnn/test/test_ie_models.cpp
index a50fed8..a8404e0 100644
@@ -57,28 +57,29 @@ void runIE(Target target, const std::string& xmlPath, const std::string& binPath
     InferencePlugin plugin;
     ExecutableNetwork netExec;
     InferRequest infRequest;
-    TargetDevice targetDevice;
-    switch (target)
-    {
-        case DNN_TARGET_CPU:
-            targetDevice = TargetDevice::eCPU;
-            break;
-        case DNN_TARGET_OPENCL:
-        case DNN_TARGET_OPENCL_FP16:
-            targetDevice = TargetDevice::eGPU;
-            break;
-        case DNN_TARGET_MYRIAD:
-            targetDevice = TargetDevice::eMYRIAD;
-            break;
-        default:
-            CV_Error(Error::StsNotImplemented, "Unknown target");
-    };
-
     try
     {
-        enginePtr = PluginDispatcher({""}).getSuitablePlugin(targetDevice);
-
-        if (targetDevice == TargetDevice::eCPU)
+        auto dispatcher = InferenceEngine::PluginDispatcher({""});
+        switch (target)
+        {
+            case DNN_TARGET_CPU:
+                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eCPU);
+                break;
+            case DNN_TARGET_OPENCL:
+            case DNN_TARGET_OPENCL_FP16:
+                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eGPU);
+                break;
+            case DNN_TARGET_MYRIAD:
+                enginePtr = dispatcher.getSuitablePlugin(TargetDevice::eMYRIAD);
+                break;
+            case DNN_TARGET_FPGA:
+                enginePtr = dispatcher.getPluginByDevice("HETERO:FPGA,CPU");
+                break;
+            default:
+                CV_Error(Error::StsNotImplemented, "Unknown target");
+        };
+
+        if (target == DNN_TARGET_CPU || target == DNN_TARGET_FPGA)
         {
             std::string suffixes[] = {"_avx2", "_sse4", ""};
             bool haveFeature[] = {
@@ -255,8 +256,10 @@ static testing::internal::ParamGenerator<Target> dnnDLIETargets()
         targets.push_back(DNN_TARGET_OPENCL_FP16);
     }
 #endif
-    if (checkMyriadTarget())
+    if (checkIETarget(DNN_TARGET_MYRIAD))
         targets.push_back(DNN_TARGET_MYRIAD);
+    if (checkIETarget(DNN_TARGET_FPGA))
+        targets.push_back(DNN_TARGET_FPGA);
     return testing::ValuesIn(targets);
 }
 
diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp
index cf94fad..2b21485 100644
@@ -351,7 +351,7 @@ TEST_P(Test_Caffe_layers, Conv_Elu)
 {
     if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD)
     {
-        if (!checkMyriadTarget())
+        if (!checkIETarget(DNN_TARGET_MYRIAD))
             throw SkipTestException("Myriad is not available/disabled in OpenCV");
     }
 
diff --git a/modules/dnn/test/test_misc.cpp b/modules/dnn/test/test_misc.cpp
index 73163f7..327de6f 100644
@@ -157,7 +157,7 @@ TEST_P(setInput, normalization)
     const int target   = get<1>(get<3>(GetParam()));
     const bool kSwapRB = true;
 
-    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkMyriadTarget())
+    if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && !checkIETarget(DNN_TARGET_MYRIAD))
         throw SkipTestException("Myriad is not available/disabled in OpenCV");
     if (backend == DNN_BACKEND_OPENCV && target == DNN_TARGET_OPENCL_FP16 && dtype != CV_32F)
         throw SkipTestException("");
diff --git a/modules/features2d/src/draw.cpp b/modules/features2d/src/draw.cpp
index 21d2f35..ee0c482 100644
@@ -117,7 +117,7 @@ void drawKeypoints( InputArray image, const std::vector<KeyPoint>& keypoints, In
                                      end = keypoints.end();
     for( ; it != end; ++it )
     {
-        Scalar color = isRandColor ? Scalar(rng(256), rng(256), rng(256)) : _color;
+        Scalar color = isRandColor ? Scalar( rng(256), rng(256), rng(256), 255 ) : _color;
         _drawKeypoint( outImage, *it, color, flags );
     }
 }
@@ -173,7 +173,7 @@ static inline void _drawMatch( InputOutputArray outImg, InputOutputArray outImg1
 {
     RNG& rng = theRNG();
     bool isRandMatchColor = matchColor == Scalar::all(-1);
-    Scalar color = isRandMatchColor ? Scalar( rng(256), rng(256), rng(256) ) : matchColor;
+    Scalar color = isRandMatchColor ? Scalar( rng(256), rng(256), rng(256), 255 ) : matchColor;
 
     _drawKeypoint( outImg1, kp1, color, flags );
     _drawKeypoint( outImg2, kp2, color, flags );
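The unspecified channels of a cv::Scalar default to 0, so the previous random
colors carried zero alpha and were drawn fully transparent whenever the
output image had an alpha channel; the explicit 255 makes them opaque:

    cv::Scalar c1(255, 0, 0);        // alpha defaults to 0
    cv::Scalar c2(255, 0, 0, 255);   // fully opaque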
diff --git a/modules/imgproc/src/bilateral_filter.cpp b/modules/imgproc/src/bilateral_filter.cpp
index 5e39fa4..e9181f2 100644
@@ -82,7 +82,84 @@ public:
                 memset(buf.data(), 0, buf.size() * sizeof(float));
                 float *sum = alignPtr(buf.data(), CV_SIMD_WIDTH);
                 float *wsum = sum + alignSize(size.width, CV_SIMD_WIDTH);
-                for( k = 0; k < maxk; k++ )
+                k = 0;
+                for(; k <= maxk-4; k+=4)
+                {
+                    const uchar* ksptr0 = sptr + space_ofs[k];
+                    const uchar* ksptr1 = sptr + space_ofs[k+1];
+                    const uchar* ksptr2 = sptr + space_ofs[k+2];
+                    const uchar* ksptr3 = sptr + space_ofs[k+3];
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
+                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
+                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    {
+                        v_uint32 rval = vx_load_expand_q(sptr + j);
+
+                        v_uint32 val = vx_load_expand_q(ksptr0 + j);
+                        v_float32 w = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                        v_float32 v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, vx_load_aligned(sum + j));
+
+                        val = vx_load_expand_q(ksptr1 + j);
+                        w = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                        v_wsum += w;
+                        v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                        val = vx_load_expand_q(ksptr2 + j);
+                        w = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                        v_wsum += w;
+                        v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                        val = vx_load_expand_q(ksptr3 + j);
+                        w = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                        v_wsum += w;
+                        v_sum = v_muladd(v_cvt_f32(v_reinterpret_as_s32(val)), w, v_sum);
+
+                        v_store_aligned(wsum + j, v_wsum);
+                        v_store_aligned(sum + j, v_sum);
+                    }
+#endif
+#if CV_SIMD128
+                    v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+                    for (; j < size.width; j++)
+                    {
+#if CV_SIMD128
+                        v_uint32x4 rval = v_setall_u32(sptr[j]);
+                        v_uint32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
+                        v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(val, rval)));
+                        wsum[j] += v_reduce_sum(w);
+                        sum[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(val)) * w);
+#else
+                        int rval = sptr[j];
+
+                        int val = ksptr0[j];
+                        float w = space_weight[k] * color_weight[std::abs(val - rval)];
+                        wsum[j] += w;
+                        sum[j] += val * w;
+
+                        val = ksptr1[j];
+                        w = space_weight[k+1] * color_weight[std::abs(val - rval)];
+                        wsum[j] += w;
+                        sum[j] += val * w;
+
+                        val = ksptr2[j];
+                        w = space_weight[k+2] * color_weight[std::abs(val - rval)];
+                        wsum[j] += w;
+                        sum[j] += val * w;
+
+                        val = ksptr3[j];
+                        w = space_weight[k+3] * color_weight[std::abs(val - rval)];
+                        wsum[j] += w;
+                        sum[j] += val * w;
+#endif
+                    }
+                }
+                for(; k < maxk; k++)
                 {
                     const uchar* ksptr = sptr + space_ofs[k];
                     j = 0;
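The hunks in this file all follow the same shape: the per-pixel loop over the
kernel taps is unrolled four at a time (with wide SIMD bodies under CV_SIMD
and a 4-lane CV_SIMD128 path for the column tail), and a plain remainder loop
picks up the leftover taps. Schematically (not the filter itself):

    int k = 0;
    for (; k <= maxk - 4; k += 4)
    {
        // process taps k, k+1, k+2, k+3 together, vectorized where possible
    }
    for (; k < maxk; k++)
    {
        // remainder: one tap at a time
    }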
@@ -126,7 +203,232 @@ public:
                 float *sum_g = sum_b + alignSize(size.width, CV_SIMD_WIDTH);
                 float *sum_r = sum_g + alignSize(size.width, CV_SIMD_WIDTH);
                 float *wsum = sum_r + alignSize(size.width, CV_SIMD_WIDTH);
-                for(k = 0; k < maxk; k++ )
+                k = 0;
+                for(; k <= maxk-4; k+=4)
+                {
+                    const uchar* ksptr0 = sptr + space_ofs[k];
+                    const uchar* ksptr1 = sptr + space_ofs[k+1];
+                    const uchar* ksptr2 = sptr + space_ofs[k+2];
+                    const uchar* ksptr3 = sptr + space_ofs[k+3];
+                    const uchar* rsptr = sptr;
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
+                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
+                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
+                    for (; j <= size.width - v_uint8::nlanes; j += v_uint8::nlanes, rsptr += 3*v_uint8::nlanes,
+                                                              ksptr0 += 3*v_uint8::nlanes, ksptr1 += 3*v_uint8::nlanes, ksptr2 += 3*v_uint8::nlanes, ksptr3 += 3*v_uint8::nlanes)
+                    {
+                        v_uint8 kb, kg, kr, rb, rg, rr;
+                        v_load_deinterleave(rsptr, rb, rg, rr);
+
+                        v_load_deinterleave(ksptr0, kb, kg, kr);
+                        v_uint16 val0, val1, val2, val3, val4;
+                        v_expand(v_absdiff(kb, rb), val0, val1);
+                        v_expand(v_absdiff(kg, rg), val2, val3);
+                        val0 += val2; val1 += val3;
+                        v_expand(v_absdiff(kr, rr), val2, val3);
+                        val0 += val2; val1 += val3;
+
+                        v_uint32 vall, valh;
+                        v_expand(val0, vall, valh);
+                        v_float32 w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        v_float32 w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        v_expand(kb, val0, val2);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_b + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_b + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_expand(kg, val0, val3);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_g + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_g + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_expand(kr, val0, val4);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_r + j                      , v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_r + j +   v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+
+                        v_expand(val1, vall, valh);
+                        w0 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        w1 = kweight0 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        v_expand(val2, vall, valh);
+                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_expand(val3, vall, valh);
+                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_expand(val4, vall, valh);
+                        v_store_aligned(sum_r + j + 2*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2*v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 3*v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3*v_float32::nlanes)));
+
+                        v_load_deinterleave(ksptr1, kb, kg, kr);
+                        v_expand(v_absdiff(kb, rb), val0, val1);
+                        v_expand(v_absdiff(kg, rg), val2, val3);
+                        val0 += val2; val1 += val3;
+                        v_expand(v_absdiff(kr, rr), val2, val3);
+                        val0 += val2; val1 += val3;
+
+                        v_expand(val0, vall, valh);
+                        w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        v_expand(kb, val0, val2);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_expand(kg, val0, val3);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_expand(kr, val0, val4);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+
+                        v_expand(val1, vall, valh);
+                        w0 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        w1 = kweight1 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        v_expand(val2, vall, valh);
+                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_expand(val3, vall, valh);
+                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_expand(val4, vall, valh);
+                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                        v_load_deinterleave(ksptr2, kb, kg, kr);
+                        v_expand(v_absdiff(kb, rb), val0, val1);
+                        v_expand(v_absdiff(kg, rg), val2, val3);
+                        val0 += val2; val1 += val3;
+                        v_expand(v_absdiff(kr, rr), val2, val3);
+                        val0 += val2; val1 += val3;
+
+                        v_expand(val0, vall, valh);
+                        w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        v_expand(kb, val0, val2);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_expand(kg, val0, val3);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_expand(kr, val0, val4);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+
+                        v_expand(val1, vall, valh);
+                        w0 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        w1 = kweight2 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        v_expand(val2, vall, valh);
+                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_expand(val3, vall, valh);
+                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_expand(val4, vall, valh);
+                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+
+                        v_load_deinterleave(ksptr3, kb, kg, kr);
+                        v_expand(v_absdiff(kb, rb), val0, val1);
+                        v_expand(v_absdiff(kg, rg), val2, val3);
+                        val0 += val2; val1 += val3;
+                        v_expand(v_absdiff(kr, rr), val2, val3);
+                        val0 += val2; val1 += val3;
+
+                        v_expand(val0, vall, valh);
+                        w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j, w0 + vx_load_aligned(wsum + j));
+                        v_store_aligned(wsum + j + v_float32::nlanes, w1 + vx_load_aligned(wsum + j + v_float32::nlanes));
+                        v_expand(kb, val0, val2);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_b + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_b + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + v_float32::nlanes)));
+                        v_expand(kg, val0, val3);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_g + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_g + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + v_float32::nlanes)));
+                        v_expand(kr, val0, val4);
+                        v_expand(val0, vall, valh);
+                        v_store_aligned(sum_r + j, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_r + j + v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + v_float32::nlanes)));
+
+                        v_expand(val1, vall, valh);
+                        w0 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(vall));
+                        w1 = kweight3 * v_lut(color_weight, v_reinterpret_as_s32(valh));
+                        v_store_aligned(wsum + j + 2 * v_float32::nlanes, w0 + vx_load_aligned(wsum + j + 2 * v_float32::nlanes));
+                        v_store_aligned(wsum + j + 3 * v_float32::nlanes, w1 + vx_load_aligned(wsum + j + 3 * v_float32::nlanes));
+                        v_expand(val2, vall, valh);
+                        v_store_aligned(sum_b + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_b + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_b + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_b + j + 3 * v_float32::nlanes)));
+                        v_expand(val3, vall, valh);
+                        v_store_aligned(sum_g + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_g + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_g + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_g + j + 3 * v_float32::nlanes)));
+                        v_expand(val4, vall, valh);
+                        v_store_aligned(sum_r + j + 2 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(vall)), w0, vx_load_aligned(sum_r + j + 2 * v_float32::nlanes)));
+                        v_store_aligned(sum_r + j + 3 * v_float32::nlanes, v_muladd(v_cvt_f32(v_reinterpret_as_s32(valh)), w1, vx_load_aligned(sum_r + j + 3 * v_float32::nlanes)));
+                    }
+#endif
+#if CV_SIMD128
+                    v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+                    for(; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
+                    {
+#if CV_SIMD128
+                        v_uint32x4 rb = v_setall_u32(rsptr[0]);
+                        v_uint32x4 rg = v_setall_u32(rsptr[1]);
+                        v_uint32x4 rr = v_setall_u32(rsptr[2]);
+                        v_uint32x4 b(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
+                        v_uint32x4 g(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
+                        v_uint32x4 r(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
+                        v_float32x4 w = kweight4 * v_lut(color_weight, v_reinterpret_as_s32(v_absdiff(b, rb) + v_absdiff(g, rg) + v_absdiff(r, rr)));
+                        wsum[j] += v_reduce_sum(w);
+                        sum_b[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(b)) * w);
+                        sum_g[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(g)) * w);
+                        sum_r[j] += v_reduce_sum(v_cvt_f32(v_reinterpret_as_s32(r)) * w);
+#else
+                        int rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+
+                        int b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
+                        float w = space_weight[k]*color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                        wsum[j] += w;
+                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
+
+                        b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
+                        w = space_weight[k+1] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                        wsum[j] += w;
+                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
+
+                        b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
+                        w = space_weight[k+2] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                        wsum[j] += w;
+                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
+
+                        b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
+                        w = space_weight[k+3] * color_weight[std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)];
+                        wsum[j] += w;
+                        sum_b[j] += b*w; sum_g[j] += g*w; sum_r[j] += r*w;
+#endif
+                    }
+                }
+                for(; k < maxk; k++)
                 {
                     const uchar* ksptr = sptr + space_ofs[k];
                     const uchar* rsptr = sptr;
@@ -421,7 +723,130 @@ public:
                 v_float32 v_one = vx_setall_f32(1.f);
                 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
-                for( k = 0; k < maxk; k++ )
+                k = 0;
+                for(; k <= maxk - 4; k+=4)
+                {
+                    const float* ksptr0 = sptr + space_ofs[k];
+                    const float* ksptr1 = sptr + space_ofs[k + 1];
+                    const float* ksptr2 = sptr + space_ofs[k + 2];
+                    const float* ksptr3 = sptr + space_ofs[k + 3];
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
+                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
+                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
+                    {
+                        v_float32 rval = vx_load(sptr + j);
+
+                        v_float32 val = vx_load(ksptr0 + j);
+                        v_float32 knan = v_not_nan(val);
+                        v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        v_int32 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
+                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                        v_float32 v_sum = v_muladd(val & knan, w, vx_load_aligned(sum + j));
+
+                        val = vx_load(ksptr1 + j);
+                        knan = v_not_nan(val);
+                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                        v_wsum += w;
+                        v_sum = v_muladd(val & knan, w, v_sum);
+
+                        val = vx_load(ksptr2 + j);
+                        knan = v_not_nan(val);
+                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                        v_wsum += w;
+                        v_sum = v_muladd(val & knan, w, v_sum);
+
+                        val = vx_load(ksptr3 + j);
+                        knan = v_not_nan(val);
+                        alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
+                        idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                        v_wsum += w;
+                        v_sum = v_muladd(val & knan, w, v_sum);
+
+                        v_store_aligned(wsum + j, v_wsum);
+                        v_store_aligned(sum + j, v_sum);
+                    }
+#endif
+#if CV_SIMD128
+                    v_float32x4 v_one4 = v_setall_f32(1.f);
+                    v_float32x4 sindex4 = v_setall_f32(scale_index);
+                    v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+                    for (; j < size.width; j++)
+                    {
+#if CV_SIMD128
+                        v_float32x4 rval = v_setall_f32(sptr[j]);
+                        v_float32x4 val(ksptr0[j], ksptr1[j], ksptr2[j], ksptr3[j]);
+                        v_float32x4 knan = v_not_nan(val);
+                        v_float32x4 alpha = (v_absdiff(val, rval) * sindex4) & v_not_nan(rval) & knan;
+                        v_int32x4 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                        wsum[j] += v_reduce_sum(w);
+                        sum[j] += v_reduce_sum((val & knan) * w);
+#else
+                        float rval = sptr[j];
+
+                        float val = ksptr0[j];
+                        float alpha = std::abs(val - rval) * scale_index;
+                        int idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!cvIsNaN(val))
+                        {
+                            float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum[j] += val * w;
+                        }
+
+                        val = ksptr1[j];
+                        alpha = std::abs(val - rval) * scale_index;
+                        idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!cvIsNaN(val))
+                        {
+                            float w = space_weight[k+1] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum[j] += val * w;
+                        }
+
+                        val = ksptr2[j];
+                        alpha = std::abs(val - rval) * scale_index;
+                        idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!cvIsNaN(val))
+                        {
+                            float w = space_weight[k+2] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum[j] += val * w;
+                        }
+
+                        val = ksptr3[j];
+                        alpha = std::abs(val - rval) * scale_index;
+                        idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!cvIsNaN(val))
+                        {
+                            float w = space_weight[k+3] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum[j] += val * w;
+                        }
+#endif
+                    }
+                }
+                for(; k < maxk; k++)
                 {
                     const float* ksptr = sptr + space_ofs[k];
                     j = 0;
@@ -430,36 +855,44 @@ public:
                     for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
                     {
                         v_float32 val = vx_load(ksptr + j);
-
-                        v_float32 alpha = v_absdiff(val, vx_load(sptr + j)) * sindex;
+                        v_float32 rval = vx_load(sptr + j);
+                        v_float32 knan = v_not_nan(val);
+                        v_float32 alpha = (v_absdiff(val, rval) * sindex) & v_not_nan(rval) & knan;
                         v_int32 idx = v_trunc(alpha);
                         alpha -= v_cvt_f32(idx);
 
-                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha));
+                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one-alpha))) & knan;
                         v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum + j, v_muladd(val, w, vx_load_aligned(sum + j)));
+                        v_store_aligned(sum + j, v_muladd(val & knan, w, vx_load_aligned(sum + j)));
                     }
 #endif
                     for (; j < size.width; j++)
                     {
                         float val = ksptr[j];
-                        float alpha = std::abs(val - sptr[j]) * scale_index;
+                        float rval = sptr[j];
+                        float alpha = std::abs(val - rval) * scale_index;
                         int idx = cvFloor(alpha);
                         alpha -= idx;
-                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx+1] - expLUT[idx]));
-                        wsum[j] += w;
-                        sum[j] += val * w;
+                        if (!cvIsNaN(val))
+                        {
+                            float w = space_weight[k] * (cvIsNaN(rval) ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum[j] += val * w;
+                        }
                     }
                 }
                 j = 0;
 #if CV_SIMD
                 for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes)
-                    v_store(dptr + j, vx_load_aligned(sum + j) / vx_load_aligned(wsum + j));
+                {
+                    v_float32 v_val = vx_load(sptr + j);
+                    v_store(dptr + j, (vx_load_aligned(sum + j) + (v_val & v_not_nan(v_val))) / (vx_load_aligned(wsum + j) + (v_one & v_not_nan(v_val))));
+                }
 #endif
                 for (; j < size.width; j++)
                 {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    dptr[j] = sum[j] / wsum[j];
+                    CV_DbgAssert(fabs(wsum[j]) >= 0);
+                    dptr[j] = cvIsNaN(sptr[j]) ? sum[j] / wsum[j] : (sum[j] + sptr[j]) / (wsum[j] + 1.f);
                 }
             }
             else
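In the float paths the masks guard against NaNs: knan = v_not_nan(val) zeroes
both the weight and the value of NaN taps, and a NaN reference pixel forces
the interpolated color weight to 1 so only the spatial weight survives. A
scalar restatement of the per-tap logic used above (idx/alpha computed from
the absolute difference as in the surrounding code):

    float val = ksptr[j], rval = sptr[j];
    if (!cvIsNaN(val))   // NaN neighbours contribute nothing
    {
        float cw = cvIsNaN(rval) ? 1.f
                 : expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx]);
        float w = space_weight[k] * cw;
        wsum[j] += w;
        sum[j]  += val * w;
    }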
@@ -475,7 +908,162 @@ public:
                 v_float32 v_one = vx_setall_f32(1.f);
                 v_float32 sindex = vx_setall_f32(scale_index);
 #endif
-                for (k = 0; k < maxk; k++)
+                k = 0;
+                for (; k <= maxk-4; k+=4)
+                {
+                    const float* ksptr0 = sptr + space_ofs[k];
+                    const float* ksptr1 = sptr + space_ofs[k+1];
+                    const float* ksptr2 = sptr + space_ofs[k+2];
+                    const float* ksptr3 = sptr + space_ofs[k+3];
+                    const float* rsptr = sptr;
+                    j = 0;
+#if CV_SIMD
+                    v_float32 kweight0 = vx_setall_f32(space_weight[k]);
+                    v_float32 kweight1 = vx_setall_f32(space_weight[k+1]);
+                    v_float32 kweight2 = vx_setall_f32(space_weight[k+2]);
+                    v_float32 kweight3 = vx_setall_f32(space_weight[k+3]);
+                    for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, rsptr += 3 * v_float32::nlanes,
+                                                                ksptr0 += 3 * v_float32::nlanes, ksptr1 += 3 * v_float32::nlanes, ksptr2 += 3 * v_float32::nlanes, ksptr3 += 3 * v_float32::nlanes)
+                    {
+                        v_float32 kb, kg, kr, rb, rg, rr;
+                        v_load_deinterleave(rsptr, rb, rg, rr);
+
+                        v_load_deinterleave(ksptr0, kb, kg, kr);
+                        v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                        v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        v_int32 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        v_float32 w = (kweight0 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                        v_float32 v_wsum = vx_load_aligned(wsum + j) + w;
+                        v_float32 v_sum_b = v_muladd(kb & knan, w, vx_load_aligned(sum_b + j));
+                        v_float32 v_sum_g = v_muladd(kg & knan, w, vx_load_aligned(sum_g + j));
+                        v_float32 v_sum_r = v_muladd(kr & knan, w, vx_load_aligned(sum_r + j));
+
+                        v_load_deinterleave(ksptr1, kb, kg, kr);
+                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        w = (kweight1 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                        v_wsum += w;
+                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                        v_load_deinterleave(ksptr2, kb, kg, kr);
+                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        w = (kweight2 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                        v_wsum += w;
+                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                        v_load_deinterleave(ksptr3, kb, kg, kr);
+                        knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                        alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        w = (kweight3 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
+                        v_wsum += w;
+                        v_sum_b = v_muladd(kb & knan, w, v_sum_b);
+                        v_sum_g = v_muladd(kg & knan, w, v_sum_g);
+                        v_sum_r = v_muladd(kr & knan, w, v_sum_r);
+
+                        v_store_aligned(wsum + j, v_wsum);
+                        v_store_aligned(sum_b + j, v_sum_b);
+                        v_store_aligned(sum_g + j, v_sum_g);
+                        v_store_aligned(sum_r + j, v_sum_r);
+                    }
+#endif
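
The unrolled block above processes four kernel taps per pass (ksptr0..ksptr3 with
weights kweight0..kweight3) and hinges on one trick: v_not_nan(x) compares every
lane with itself, which holds exactly for non-NaN lanes and yields an all-ones bit
mask, so ANDing a value or a weight with it zeroes the NaN lanes. When only the
reference pixel is NaN, alpha is masked to 0, the interpolated colour weight
collapses to expLUT[0] (exp(0) == 1), and the tap falls back to its spatial weight
alone. A minimal numpy model of one tap, assuming expLUT keeps a guard entry past
its last bin; np.where stands in for the bitwise masking:

    import numpy as np

    def accumulate_tap(kb, kg, kr, rb, rg, rr, kweight, expLUT, scale_index,
                       wsum, sum_b, sum_g, sum_r):
        # A lane is valid only when all three channels are finite.
        knan = ~(np.isnan(kb) | np.isnan(kg) | np.isnan(kr))
        rnan = ~(np.isnan(rb) | np.isnan(rg) | np.isnan(rr))
        diff = np.abs(kb - rb) + np.abs(kg - rg) + np.abs(kr - rr)
        alpha = np.where(knan & rnan, diff * scale_index, 0.0)  # masked lanes hit bin 0
        idx = alpha.astype(np.int32)            # alpha >= 0, so truncation == floor
        alpha -= idx
        w = kweight * (expLUT[idx] * (1.0 - alpha) + expLUT[idx + 1] * alpha)
        w = np.where(knan, w, 0.0)              # NaN kernel pixels weigh nothing
        wsum += w
        sum_b += np.where(knan, kb, 0.0) * w    # masked channels add +0, never NaN
        sum_g += np.where(knan, kg, 0.0) * w
        sum_r += np.where(knan, kr, 0.0) * w
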
+#if CV_SIMD128
+                    v_float32x4 v_one4 = v_setall_f32(1.f);
+                    v_float32x4 sindex4 = v_setall_f32(scale_index);
+                    v_float32x4 kweight4 = v_load(space_weight + k);
+#endif
+                    for (; j < size.width; j++, rsptr += 3, ksptr0 += 3, ksptr1 += 3, ksptr2 += 3, ksptr3 += 3)
+                    {
+#if CV_SIMD128
+                        v_float32x4 rb = v_setall_f32(rsptr[0]);
+                        v_float32x4 rg = v_setall_f32(rsptr[1]);
+                        v_float32x4 rr = v_setall_f32(rsptr[2]);
+                        v_float32x4 kb(ksptr0[0], ksptr1[0], ksptr2[0], ksptr3[0]);
+                        v_float32x4 kg(ksptr0[1], ksptr1[1], ksptr2[1], ksptr3[1]);
+                        v_float32x4 kr(ksptr0[2], ksptr1[2], ksptr2[2], ksptr3[2]);
+                        v_float32x4 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                        v_float32x4 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex4) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
+                        v_int32x4 idx = v_trunc(alpha);
+                        alpha -= v_cvt_f32(idx);
+                        v_float32x4 w = (kweight4 * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one4 - alpha))) & knan;
+                        wsum[j] += v_reduce_sum(w);
+                        sum_b[j] += v_reduce_sum((kb & knan) * w);
+                        sum_g[j] += v_reduce_sum((kg & knan) * w);
+                        sum_r[j] += v_reduce_sum((kr & knan) * w);
+#else
+                        float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+                        bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
+
+                        float b = ksptr0[0], g = ksptr0[1], r = ksptr0[2];
+                        bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                        float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                        int idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!v_NAN)
+                        {
+                            float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum_b[j] += b*w;
+                            sum_g[j] += g*w;
+                            sum_r[j] += r*w;
+                        }
+
+                        b = ksptr1[0]; g = ksptr1[1]; r = ksptr1[2];
+                        v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                        alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                        idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!v_NAN)
+                        {
+                            float w = space_weight[k+1] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum_b[j] += b*w;
+                            sum_g[j] += g*w;
+                            sum_r[j] += r*w;
+                        }
+
+                        b = ksptr2[0]; g = ksptr2[1]; r = ksptr2[2];
+                        v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                        alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                        idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!v_NAN)
+                        {
+                            float w = space_weight[k+2] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum_b[j] += b*w;
+                            sum_g[j] += g*w;
+                            sum_r[j] += r*w;
+                        }
+
+                        b = ksptr3[0]; g = ksptr3[1]; r = ksptr3[2];
+                        v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                        alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
+                        idx = cvFloor(alpha);
+                        alpha -= idx;
+                        if (!v_NAN)
+                        {
+                            float w = space_weight[k+3] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum_b[j] += b*w;
+                            sum_g[j] += g*w;
+                            sum_r[j] += r*w;
+                        }
+#endif
+                    }
+                }
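
Past the last full vector of columns, the tail loop flips the vectorization axis:
instead of spanning output pixels, it packs the same channel from the four kernel
taps of a single pixel into one v_float32x4 and finishes with horizontal
v_reduce_sum calls, so the 4-way unrolling still pays off at the row edge. A sketch
of that per-pixel reduction, with the same conventions as the model above; the
scalar #else branch above mirrors the same arithmetic one tap at a time for builds
without CV_SIMD128:

    import numpy as np

    def tail_pixel(k4, ref, kweight4, expLUT, scale_index):
        # k4: (4, 3) BGR values of the four taps; ref: (3,) reference pixel.
        kb, kg, kr = k4[:, 0], k4[:, 1], k4[:, 2]
        knan = ~np.isnan(k4).any(axis=1)
        rnan = not np.isnan(ref).any()
        diff = np.abs(kb - ref[0]) + np.abs(kg - ref[1]) + np.abs(kr - ref[2])
        alpha = np.where(knan & rnan, diff * scale_index, 0.0)
        idx = alpha.astype(np.int32)
        alpha -= idx
        w = np.where(knan, kweight4 * (expLUT[idx] * (1 - alpha)
                                       + expLUT[idx + 1] * alpha), 0.0)
        return (w.sum(),                             # stands in for v_reduce_sum
                (np.where(knan, kb, 0.0) * w).sum(),
                (np.where(knan, kg, 0.0) * w).sum(),
                (np.where(knan, kr, 0.0) * w).sum())
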
+                for (; k < maxk; k++)
                 {
                     const float* ksptr = sptr + space_ofs[k];
                     const float* rsptr = sptr;
@@ -488,45 +1076,68 @@ public:
                         v_load_deinterleave(ksptr, kb, kg, kr);
                         v_load_deinterleave(rsptr, rb, rg, rr);
 
-                        v_float32 alpha = (v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex;
+                        v_float32 knan = v_not_nan(kb) & v_not_nan(kg) & v_not_nan(kr);
+                        v_float32 alpha = ((v_absdiff(kb, rb) + v_absdiff(kg, rg) + v_absdiff(kr, rr)) * sindex) & v_not_nan(rb) & v_not_nan(rg) & v_not_nan(rr) & knan;
                         v_int32 idx = v_trunc(alpha);
                         alpha -= v_cvt_f32(idx);
 
-                        v_float32 w = kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha));
+                        v_float32 w = (kweight * v_muladd(v_lut(expLUT + 1, idx), alpha, v_lut(expLUT, idx) * (v_one - alpha))) & knan;
                         v_store_aligned(wsum + j, vx_load_aligned(wsum + j) + w);
-                        v_store_aligned(sum_b + j, v_muladd(kb, w, vx_load_aligned(sum_b + j)));
-                        v_store_aligned(sum_g + j, v_muladd(kg, w, vx_load_aligned(sum_g + j)));
-                        v_store_aligned(sum_r + j, v_muladd(kr, w, vx_load_aligned(sum_r + j)));
+                        v_store_aligned(sum_b + j, v_muladd(kb & knan, w, vx_load_aligned(sum_b + j)));
+                        v_store_aligned(sum_g + j, v_muladd(kg & knan, w, vx_load_aligned(sum_g + j)));
+                        v_store_aligned(sum_r + j, v_muladd(kr & knan, w, vx_load_aligned(sum_r + j)));
                     }
 #endif
                     for (; j < size.width; j++, ksptr += 3, rsptr += 3)
                     {
                         float b = ksptr[0], g = ksptr[1], r = ksptr[2];
-                        float alpha = (std::abs(b - rsptr[0]) + std::abs(g - rsptr[1]) + std::abs(r - rsptr[2])) * scale_index;
+                        bool v_NAN = cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r);
+                        float rb = rsptr[0], rg = rsptr[1], rr = rsptr[2];
+                        bool r_NAN = cvIsNaN(rb) || cvIsNaN(rg) || cvIsNaN(rr);
+                        float alpha = (std::abs(b - rb) + std::abs(g - rg) + std::abs(r - rr)) * scale_index;
                         int idx = cvFloor(alpha);
                         alpha -= idx;
-                        float w = space_weight[k] * (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx]));
-                        wsum[j] += w;
-                        sum_b[j] += b*w;
-                        sum_g[j] += g*w;
-                        sum_r[j] += r*w;
+                        if (!v_NAN)
+                        {
+                            float w = space_weight[k] * (r_NAN ? 1.f : (expLUT[idx] + alpha*(expLUT[idx + 1] - expLUT[idx])));
+                            wsum[j] += w;
+                            sum_b[j] += b*w;
+                            sum_g[j] += g*w;
+                            sum_r[j] += r*w;
+                        }
                     }
                 }
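
In this remainder hunk the scalar path makes the vector masking explicit: a NaN
reference pixel degrades the colour term to 1.f, which is exactly what the masked
vector lanes compute, since forcing alpha to 0 indexes the first LUT bin and the
LUT is built so that expLUT[0] == exp(0) == 1. A two-line check of that
equivalence (the Gaussian stand-in for expLUT is hypothetical; only bin 0 matters):

    import numpy as np

    expLUT = np.exp(-0.5 * np.arange(8, dtype=np.float32) ** 2)   # stand-in LUT
    alpha, idx = 0.0, 0                                           # a masked lane
    w_vec = expLUT[idx] + alpha * (expLUT[idx + 1] - expLUT[idx])
    assert w_vec == expLUT[0] == 1.0       # identical to the scalar r_NAN branch
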
                 j = 0;
 #if CV_SIMD
-                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, dptr += 3*v_float32::nlanes)
+                for (; j <= size.width - v_float32::nlanes; j += v_float32::nlanes, sptr += 3*v_float32::nlanes, dptr += 3*v_float32::nlanes)
                 {
-                    v_float32 w = v_one / vx_load_aligned(wsum + j);
-                    v_store_interleave(dptr, vx_load_aligned(sum_b + j) * w, vx_load_aligned(sum_g + j) * w, vx_load_aligned(sum_r + j) * w);
+                    v_float32 b, g, r;
+                    v_load_deinterleave(sptr, b, g, r);
+                    v_float32 mask = v_not_nan(b) & v_not_nan(g) & v_not_nan(r);
+                    v_float32 w = v_one / (vx_load_aligned(wsum + j) + (v_one & mask));
+                    v_store_interleave(dptr, (vx_load_aligned(sum_b + j) + (b & mask)) * w, (vx_load_aligned(sum_g + j) + (g & mask)) * w, (vx_load_aligned(sum_r + j) + (r & mask)) * w);
                 }
 #endif
                 for (; j < size.width; j++)
                 {
-                    CV_DbgAssert(fabs(wsum[j]) > 0);
-                    wsum[j] = 1.f / wsum[j];
-                    *(dptr++) = sum_b[j] * wsum[j];
-                    *(dptr++) = sum_g[j] * wsum[j];
-                    *(dptr++) = sum_r[j] * wsum[j];
+                    CV_DbgAssert(fabs(wsum[j]) >= 0);
+                    float b = *(sptr++);
+                    float g = *(sptr++);
+                    float r = *(sptr++);
+                    if (cvIsNaN(b) || cvIsNaN(g) || cvIsNaN(r))
+                    {
+                        wsum[j] = 1.f / wsum[j];
+                        *(dptr++) = sum_b[j] * wsum[j];
+                        *(dptr++) = sum_g[j] * wsum[j];
+                        *(dptr++) = sum_r[j] * wsum[j];
+                    }
+                    else
+                    {
+                        wsum[j] = 1.f / (wsum[j] + 1.f);
+                        *(dptr++) = (sum_b[j] + b) * wsum[j];
+                        *(dptr++) = (sum_g[j] + g) * wsum[j];
+                        *(dptr++) = (sum_r[j] + r) * wsum[j];
+                    }
                 }
             }
         }
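
The finalization above pairs with the kernel change in the next hunk: the center
tap is no longer part of space_ofs, so a finite center pixel re-enters here with
unit weight (exp(0) both spatially and in colour), while a NaN center is left out
entirely and is effectively in-painted with the weighted average of its finite
neighbours. That is also why the patchNaNs() pre-pass is deleted below. A scalar
sketch of the per-pixel step:

    import math

    def finalize(b, g, r, wsum, sum_b, sum_g, sum_r):
        if math.isnan(b) or math.isnan(g) or math.isnan(r):
            inv = 1.0 / wsum          # meaningful once any neighbour was finite
            return sum_b * inv, sum_g * inv, sum_r * inv
        inv = 1.0 / (wsum + 1.0)      # "+ 1" is the center tap's own weight
        return (sum_b + b) * inv, (sum_g + g) * inv, (sum_r + r) * inv
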
@@ -585,9 +1196,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
     // temporary copy of the image with borders for easy processing
     Mat temp;
     copyMakeBorder( src, temp, radius, radius, radius, radius, borderType );
-    minValSrc -= 5. * sigma_color;
-    patchNaNs( temp, minValSrc ); // this replacement of NaNs makes the assumption that depth values are nonnegative
-                                  // TODO: make replacement parameter avalible in the outside function interface
+
     // allocate lookup tables
     std::vector<float> _space_weight(d*d);
     std::vector<int> _space_ofs(d*d);
@@ -620,7 +1229,7 @@ bilateralFilter_32f( const Mat& src, Mat& dst, int d,
         for( j = -radius; j <= radius; j++ )
         {
             double r = std::sqrt((double)i*i + (double)j*j);
-            if( r > radius )
+            if( r > radius || ( i == 0 && j == 0 ) )
                 continue;
             space_weight[maxk] = (float)std::exp(r*r*gauss_space_coeff);
             space_ofs[maxk++] = (int)(i*(temp.step/sizeof(float)) + j*cn);
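
Skipping i == 0 && j == 0 is what makes the "+ 1.f" in the finalization exact:
the center tap sits at distance r == 0 with zero colour difference, so its weight
is always 1 regardless of the sigmas, and precomputing it keeps NaN centers out of
the accumulation loops entirely. A quick check with a hypothetical sigma:

    import math

    sigma_space = 2.0                                   # hypothetical value
    gauss_space_coeff = -0.5 / (sigma_space * sigma_space)
    r = 0.0                                             # the center tap
    assert math.exp(r * r * gauss_space_coeff) == 1.0   # spatial weight == 1
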
index b92d462..aaefe45 100644 (file)
@@ -38,6 +38,8 @@ aspect_ratios = [float(ar) for ar in grid_anchor_generator['aspect_ratios']]
 width_stride = float(grid_anchor_generator['width_stride'][0])
 height_stride = float(grid_anchor_generator['height_stride'][0])
 features_stride = float(config['feature_extractor'][0]['first_stage_features_stride'][0])
+first_stage_nms_iou_threshold = float(config['first_stage_nms_iou_threshold'][0])
+first_stage_max_proposals = int(config['first_stage_max_proposals'][0])
 
 print('Number of classes: %d' % num_classes)
 print('Scales:            %s' % str(scales))
@@ -53,7 +55,8 @@ graph_def = parseTextGraph(args.output)
 removeIdentity(graph_def)
 
 def to_remove(name, op):
-    return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep)
+    return name.startswith(scopesToIgnore) or not name.startswith(scopesToKeep) or \
+           (name.startswith('CropAndResize') and op != 'CropAndResize')
 
 removeUnusedNodesAndAttrs(to_remove, graph_def)
 
@@ -123,20 +126,22 @@ detectionOut.input.append('proposals')
 detectionOut.addAttr('num_classes', 2)
 detectionOut.addAttr('share_location', True)
 detectionOut.addAttr('background_label_id', 0)
-detectionOut.addAttr('nms_threshold', 0.7)
+detectionOut.addAttr('nms_threshold', first_stage_nms_iou_threshold)
 detectionOut.addAttr('top_k', 6000)
 detectionOut.addAttr('code_type', "CENTER_SIZE")
-detectionOut.addAttr('keep_top_k', 100)
+detectionOut.addAttr('keep_top_k', first_stage_max_proposals)
 detectionOut.addAttr('clip', True)
 
 graph_def.node.extend([detectionOut])
 
 # Save as text.
+cropAndResizeNodesNames = []
 for node in reversed(topNodes):
     if node.op != 'CropAndResize':
         graph_def.node.extend([node])
         topNodes.pop()
     else:
+        cropAndResizeNodesNames.append(node.name)
         if numCropAndResize == 1:
             break
         else:
@@ -166,11 +171,15 @@ for i in reversed(range(len(graph_def.node))):
 
     if graph_def.node[i].name in ['SecondStageBoxPredictor/Flatten/flatten/Shape',
                                   'SecondStageBoxPredictor/Flatten/flatten/strided_slice',
-                                  'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape']:
+                                  'SecondStageBoxPredictor/Flatten/flatten/Reshape/shape',
+                                  'SecondStageBoxPredictor/Flatten_1/flatten/Shape',
+                                  'SecondStageBoxPredictor/Flatten_1/flatten/strided_slice',
+                                  'SecondStageBoxPredictor/Flatten_1/flatten/Reshape/shape']:
         del graph_def.node[i]
 
 for node in graph_def.node:
-    if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape':
+    if node.name == 'SecondStageBoxPredictor/Flatten/flatten/Reshape' or \
+       node.name == 'SecondStageBoxPredictor/Flatten_1/flatten/Reshape':
         node.op = 'Flatten'
         node.input.pop()
 
@@ -178,6 +187,12 @@ for node in graph_def.node:
                      'SecondStageBoxPredictor/BoxEncodingPredictor/MatMul']:
         node.addAttr('loc_pred_transposed', True)
 
+    if node.name.startswith('MaxPool2D'):
+        assert(node.op == 'MaxPool')
+        assert(len(cropAndResizeNodesNames) == 2)
+        node.input = [cropAndResizeNodesNames[0]]
+        del cropAndResizeNodesNames[0]
+
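
Mask R-CNN's graph has two CropAndResize ops, one ahead of the box refinement head
and one ahead of the mask head; once the intermediate TensorFlow nodes are removed,
each MaxPool2D that used to consume a crop through them must be wired to the crop
directly, which is what the hunk above (and the matching one in the postprocessing
pass below) does by consuming cropAndResizeNodesNames in order. A self-contained
model of that rewiring, with stand-in node objects and hypothetical names:

    class Node:
        def __init__(self, name, op, inputs):
            self.name, self.op, self.input = name, op, inputs

    nodes = [Node('CropAndResize', 'CropAndResize', []),
             Node('MaxPool2D/MaxPool', 'MaxPool', ['<deleted intermediate>']),
             Node('CropAndResize_1', 'CropAndResize', []),
             Node('MaxPool2D_1/MaxPool', 'MaxPool', ['<deleted intermediate>'])]

    crop_names = [n.name for n in nodes if n.op == 'CropAndResize']
    for n in nodes:
        if n.name.startswith('MaxPool2D'):
            assert n.op == 'MaxPool'
            n.input = [crop_names.pop(0)]   # each pool gets its own crop

    assert nodes[1].input == ['CropAndResize']
    assert nodes[3].input == ['CropAndResize_1']
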
 ################################################################################
 ### Postprocessing
 ################################################################################
@@ -223,6 +238,11 @@ graph_def.node.extend([detectionOut])
 for node in reversed(topNodes):
     graph_def.node.extend([node])
 
+    if node.name.startswith('MaxPool2D'):
+        assert(node.op == 'MaxPool')
+        assert(len(cropAndResizeNodesNames) == 1)
+        node.input = [cropAndResizeNodesNames[0]]
+
 for i in reversed(range(len(graph_def.node))):
     if graph_def.node[i].op == 'CropAndResize':
         graph_def.node[i].input.insert(1, 'detection_out_final')
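
This last pass hands the surviving CropAndResize its boxes: in TensorFlow the op's
second input is the box tensor, so inserting 'detection_out_final' at position 1
makes the mask branch crop features at the final detections instead of at the raw
proposals. For reference, the underlying TensorFlow op (shapes are illustrative):

    import tensorflow as tf

    crops = tf.image.crop_and_resize(
        tf.zeros([1, 32, 32, 256]),            # feature map
        tf.constant([[0.1, 0.1, 0.6, 0.6]]),   # normalized boxes: the 2nd input
        tf.constant([0]),                      # batch index per box
        [14, 14])                              # output crop size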