Reproducing the C++ version of the fast area resize
author: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Fri, 14 Dec 2012 10:32:00 +0000 (14:32 +0400)
committer: Ilya Lavrenov <ilya.lavrenov@itseez.com>
Fri, 14 Dec 2012 10:32:00 +0000 (14:32 +0400)
modules/imgproc/src/imgwarp.cpp

index 0de9f59..7c174f2 100644 (file)
@@ -1265,47 +1265,72 @@ public:
         int dx = 0;
         const uchar* S0 = S;
         const uchar* S1 = S0 + step;
-        __m128i masklow = _mm_set1_epi16(0x00ff);
         __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi16(2);
 
         if (cn == 1)
         {
-            for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
+            __m128i masklow = _mm_set1_epi16(0x00ff);
+            for ( ; dx < w; dx += 8, S0 += 16, S1 += 16, D += 8)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
 
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 1));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 1)));
+                __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
+                __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
 
-                _mm_storel_epi64((__m128i*)D, _mm_packus_epi16(_mm_and_si128(s, masklow), zero));
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
         else if (cn == 3)
-            for ( ; dx < w - 6; dx += 6, S0 += 12, S1 += 12, D += 6)
+            for ( ; dx < w; dx += 6, S0 += 12, S1 += 12, D += 6)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 3));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 3)));
-
-                _mm_storel_epi64((__m128i*)D, s);
-                _mm_storel_epi64((__m128i*)(D+3), _mm_srli_si128(s, 6));
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
+                __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)(D+3), s0);
             }
         else
         {
             CV_Assert(cn == 4);
-            for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
+            for ( ; dx < w; dx += 8, S0 += 16, S1 += 16, D += 8)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 4));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 4)));
-
-                _mm_storel_epi64((__m128i*)D, s);
-                _mm_storel_epi64((__m128i*)(D+4), _mm_srli_si128(s, 8));
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
+                __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)(D+4), s0);
             }
         }
 
@@ -1314,8 +1339,8 @@ public:
 
 private:
     int cn;
-    int step;
     bool use_simd;
+    int step;
 };
 
 class ResizeAreaFastVec_SIMD_16u
@@ -1337,45 +1362,58 @@ public:
         const ushort* S1 = (const ushort*)(S0 + step);
         __m128i masklow = _mm_set1_epi32(0x0000ffff);
         __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi32(2);
 
         if (cn == 1)
         {
             for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
 
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 2));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 2)));
+                __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
+                __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
 
-                s = _mm_and_si128(s, masklow);
-                s = _mm_packs_epi32(s, zero);
-                _mm_storel_epi64((__m128i*)D, s);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
         else if (cn == 3)
             for ( ; dx < w - 3; dx += 3, S0 += 6, S1 += 6, D += 3)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 6));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 6)));
-
-                _mm_storel_epi64((__m128i*)D, s);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
+                __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
+                __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, r0_16h);
+                __m128i s1 = _mm_add_epi16(r1_16l, r1_16h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         else
         {
             CV_Assert(cn == 4);
             for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 8));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 8)));
-
-                _mm_storel_epi64((__m128i*)(D), s);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
+                __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
+                __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
+
+                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
+                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
 
@@ -1404,7 +1442,7 @@ struct ResizeAreaFastVec
 
     int operator() (const T* S, T* D, int w) const
     {
-        if( !fast_mode )
+        if (!fast_mode)
             return 0;
 
         const T* nextS = (const T*)((const uchar*)S + step);