From 16f9b6f5e419cd84e53494838a08c7cd54f8f5d1 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Fri, 14 Dec 2012 14:32:00 +0400
Subject: [PATCH] reproducing C++ version of resize area fast

---
 modules/imgproc/src/imgwarp.cpp | 134 ++++++++++++++++++++++++++--------------
 1 file changed, 86 insertions(+), 48 deletions(-)

diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index 0de9f59..7c174f2 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -1265,47 +1265,72 @@ public:
         int dx = 0;
         const uchar* S0 = S;
         const uchar* S1 = S0 + step;
-        __m128i masklow = _mm_set1_epi16(0x00ff);
         __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi16(2);
 
         if (cn == 1)
         {
-            for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
+            __m128i masklow = _mm_set1_epi16(0x00ff);
+            for ( ; dx < w; dx += 8, S0 += 16, S1 += 16, D += 8)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
 
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 1));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 1)));
+                __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
+                __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
 
-                _mm_storel_epi64((__m128i*)D, _mm_packus_epi16(_mm_and_si128(s, masklow), zero));
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
         else if (cn == 3)
-            for ( ; dx < w - 6; dx += 6, S0 += 12, S1 += 12, D += 6)
+            for ( ; dx < w; dx += 6, S0 += 12, S1 += 12, D += 6)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 3));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 3)));
-
-                _mm_storel_epi64((__m128i*)D, s);
-                _mm_storel_epi64((__m128i*)(D+3), _mm_srli_si128(s, 6));
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
+                __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)(D+3), s0);
             }
         else
         {
            CV_Assert(cn == 4);
-            for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
+            for ( ; dx < w; dx += 8, S0 += 16, S1 += 16, D += 8)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 4));
-                s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 4)));
-
-                _mm_storel_epi64((__m128i*)D, s);
-                _mm_storel_epi64((__m128i*)(D+4), _mm_srli_si128(s, 8));
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
+                __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
+                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
+                __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
+                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+
+                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
+                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
+                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
+                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)(D+4), s0);
             }
         }
 
@@ -1314,8 +1339,8 @@ public:
 
 private:
     int cn;
-    int step;
     bool use_simd;
+    int step;
 };
 
 class ResizeAreaFastVec_SIMD_16u
@@ -1337,45 +1362,58 @@ public:
         const ushort* S1 = (const ushort*)(S0 + step);
         __m128i masklow = _mm_set1_epi32(0x0000ffff);
         __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi32(2);
 
         if (cn == 1)
         {
             for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
 
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 2));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 2)));
+                __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
+                __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
+                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
 
-                s = _mm_and_si128(s, masklow);
-                s = _mm_packs_epi32(s, zero);
-                _mm_storel_epi64((__m128i*)D, s);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
         else if (cn == 3)
             for ( ; dx < w - 3; dx += 3, S0 += 6, S1 += 6, D += 3)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 6));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 6)));
-
-                _mm_storel_epi64((__m128i*)D, s);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
+                __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
+                __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);
+
+                __m128i s0 = _mm_add_epi16(r0_16l, r0_16h);
+                __m128i s1 = _mm_add_epi16(r1_16l, r1_16h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         else
         {
             CV_Assert(cn == 4);
             for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
             {
-                __m128i s0 = _mm_loadu_si128((const __m128i*)S0);
-                __m128i s1 = _mm_loadu_si128((const __m128i*)S1);
-
-                __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 8));
-                s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 8)));
-
-                _mm_storel_epi64((__m128i*)(D), s);
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
+                __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
+                __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
+                __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);
+
+                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
+                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srli_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
             }
         }
 
@@ -1404,7 +1442,7 @@ struct ResizeAreaFastVec
 
     int operator() (const T* S, T* D, int w) const
    {
-        if( !fast_mode )
+        if (!fast_mode)
            return 0;
 
         const T* nextS = (const T*)((const uchar*)S + step);
-- 
2.7.4
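
Note (illustrative sketch, not part of the patch): the new intrinsics are meant to
reproduce, bit for bit, the plain C++ area-fast kernel for a 2x downscale, where each
output element is the rounded average of a 2x2 source block, (a + b + c + d + 2) >> 2.
The earlier _mm_avg_epu8/_mm_avg_epu16 code rounds at every pairwise average, which can
differ from that single-rounded sum by one; the widening adds, the +2 (delta2), and the
final shift by 2 avoid that bias. A minimal scalar reference is sketched below; the
helper name and signature are illustrative only and do not appear in imgwarp.cpp.

    // Scalar reference for the 2x2 area-fast kernel (illustrative helper).
    // S0 and S1 are two consecutive source rows, D is one destination row,
    // w is the number of output elements (output width * cn), cn is the
    // channel count; T is uchar for the 8u path and ushort for the 16u path.
    template <typename T>
    static void areaFast2x2Row(const T* S0, const T* S1, T* D, int w, int cn)
    {
        for (int dx = 0; dx < w; dx += cn)        // one output pixel at a time
            for (int c = 0; c < cn; c++)          // each channel separately
            {
                int index = dx * 2 + c;           // left column of the 2x2 source block
                D[dx + c] = (T)((S0[index] + S0[index + cn] +
                                 S1[index] + S1[index + cn] + 2) >> 2);
            }
    }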