SSE2 optimization of cv::preCornerDetect
author Ilya Lavrenov <ilya.lavrenov@itseez.com>
Sun, 29 Jun 2014 21:47:51 +0000 (01:47 +0400)
committer Ilya Lavrenov <ilya.lavrenov@itseez.com>
Sun, 29 Jun 2014 21:51:53 +0000 (01:51 +0400)
modules/imgproc/src/corner.cpp

index eeb20fb..923d78b 100644 (file)
@@ -608,6 +608,11 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
         factor *= 255;
     factor = 1./(factor * factor * factor);
 
+#if CV_SSE2
+    volatile bool haveSSE2 = cv::checkHardwareSupport(CV_CPU_SSE2);
+    __m128 v_factor = _mm_set1_ps((float)factor), v_m2 = _mm_set1_ps(-2.0f);
+#endif
+
     Size size = src.size();
     int i, j;
     for( i = 0; i < size.height; i++ )
@@ -619,7 +624,26 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
         const float* d2ydata = (const float*)(D2y.data + i*D2y.step);
         const float* dxydata = (const float*)(Dxy.data + i*Dxy.step);
 
-        for( j = 0; j < size.width; j++ )
+        j = 0;
+
+#if CV_SSE2
+        if (haveSSE2)
+        {
+            for( ; j <= size.width - 4; j += 4 )
+            {
+                __m128 v_dx = _mm_loadu_ps((const float *)(dxdata + j));
+                __m128 v_dy = _mm_loadu_ps((const float *)(dydata + j));
+
+                __m128 v_s1 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dx), _mm_loadu_ps((const float *)(d2ydata + j)));
+                __m128 v_s2 = _mm_mul_ps(_mm_mul_ps(v_dy, v_dy), _mm_loadu_ps((const float *)(d2xdata + j)));
+                __m128 v_s3 = _mm_mul_ps(_mm_mul_ps(v_dx, v_dy), _mm_loadu_ps((const float *)(dxydata + j)));
+                v_s1 = _mm_mul_ps(v_factor, _mm_add_ps(v_s1, _mm_add_ps(v_s2, _mm_mul_ps(v_s3, v_m2))));
+                _mm_storeu_ps(dstdata + j, v_s1);
+            }
+        }
+#endif
+
+        for( ; j < size.width; j++ )
         {
             float dx = dxdata[j];
             float dy = dydata[j];