SSE2 optimization of cv::patchNaNs
authorIlya Lavrenov <ilya.lavrenov@itseez.com>
Thu, 3 Jul 2014 17:28:54 +0000 (21:28 +0400)
committerIlya Lavrenov <ilya.lavrenov@itseez.com>
Fri, 4 Jul 2014 06:27:44 +0000 (10:27 +0400)
modules/core/src/mathfuncs.cpp

index 7a02bd6..f6dd5b6 100644 (file)
@@ -2543,12 +2543,33 @@ void patchNaNs( InputOutputArray _a, double _val )
     NAryMatIterator it(arrays, (uchar**)ptrs);
     size_t len = it.size*a.channels();
     Cv32suf val;
-    val.f = (float)_val;
+    float fval = (float)_val;
+    val.f = fval;
+
+#if CV_SSE2
+    __m128i v_mask1 = _mm_set1_epi32(0x7fffffff), v_mask2 = _mm_set1_epi32(0x7f800000);
+    __m128i v_val = _mm_set1_epi32(val.i);
+#endif
 
     for( size_t i = 0; i < it.nplanes; i++, ++it )
     {
         int* tptr = ptrs[0];
-        for( size_t j = 0; j < len; j++ )
+        size_t j = 0;
+
+#if CV_SSE2
+        if (USE_SSE2)
+        {
+            for ( ; j < len; j += 4)
+            {
+                __m128i v_src = _mm_loadu_si128((__m128i const *)(tptr + j));
+                __m128i v_cmp_mask = _mm_cmplt_epi32(v_mask2, _mm_and_si128(v_src, v_mask1));
+                __m128i v_res = _mm_or_si128(_mm_andnot_si128(v_cmp_mask, v_src), _mm_and_si128(v_cmp_mask, v_val));
+                _mm_storeu_si128((__m128i *)(tptr + j), v_res);
+            }
+        }
+#endif
+
+        for( ; j < len; j++ )
             if( (tptr[j] & 0x7fffffff) > 0x7f800000 )
                 tptr[j] = val.i;
     }