From 1ca35b74248f457122a1cdcf3d2a0dbc8d4de299 Mon Sep 17 00:00:00 2001
From: Ilya Lavrenov
Date: Mon, 12 Jan 2015 10:59:29 +0300
Subject: [PATCH] resize are fast

---
 modules/imgproc/src/imgwarp.cpp       | 157 +++++++++++++++++++++++++++++++++-
 modules/imgproc/test/test_imgwarp.cpp |   5 +-
 2 files changed, 159 insertions(+), 3 deletions(-)

diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp
index c4bb3ba..1fa4557 100644
--- a/modules/imgproc/src/imgwarp.cpp
+++ b/modules/imgproc/src/imgwarp.cpp
@@ -2199,8 +2199,161 @@ private:
     bool use_simd;
 };
 
-typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16s;
-typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_32f;
+// SSE2 kernel that averages 2x2 blocks of 16-bit signed samples: the row at S
+// and the row `step` bytes after it are reduced to one output row at D.
+// operator() returns the number of destination elements it has written, or 0
+// when checkHardwareSupport(CV_CPU_SSE2) reports no SSE2.
+class ResizeAreaFastVec_SIMD_16s
+{
+public:
+    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
+        cn(_cn), step(_step)
+    {
+        use_simd = checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator() (const short* S, short* D, int w) const
+    {
+        if (!use_simd)
+            return 0;
+
+        int dx = 0;
+        const short* S0 = (const short*)S;
+        const short* S1 = (const short*)((const uchar*)(S) + step);
+        __m128i masklow = _mm_set1_epi32(0x0000ffff);
+        __m128i zero = _mm_setzero_si128();
+        __m128i delta2 = _mm_set1_epi32(2); // rounding term for the >> 2 below
+
+        if (cn == 1)
+        {
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                // every 32-bit lane holds two neighbouring samples; add the
+                // sign-extended upper sample to the sign-extended lower one
+                __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
+                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
+                __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
+                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
+                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
+                s0 = _mm_srai_epi32(s0, 2);
+                s0 = _mm_packs_epi32(s0, zero);
+
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        }
+        else if (cn == 3)
+            // D advances by one pixel (3 samples) but the store writes 4 samples;
+            // the w - 4 bound keeps that extra sample inside the row
+            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
+                __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
+                __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
+                __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);
+
+                __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
+                __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
+                s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
+                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        else
+        {
+            CV_Assert(cn == 4);
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
+                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);
+
+                __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
+                __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
+                __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
+                __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);
+
+                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
+                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
+                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
+                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
+                _mm_storel_epi64((__m128i*)D, s0);
+            }
+        }
+
+        return dx;
+    }
+
+private:
+    int cn;
+    int step;
+    bool use_simd;
+};
+
+// SIMD kernel that averages 2x2 blocks of float samples for 1- and 4-channel rows.
+// operator() returns the number of destination elements it has written, or 0
+// when the fast path is disabled (scale is not 2x2, other channel count, no SSE2).
+struct ResizeAreaFastVec_SIMD_32f
+{
+    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
+        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)
+    {
+        // only cn == 1 and cn == 4 have a vector loop below; also honour the
+        // run-time SSE2 check, as ResizeAreaFastVec_SIMD_16s does
+        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 4) &&
+            checkHardwareSupport(CV_CPU_SSE2);
+    }
+
+    int operator() (const float * S, float * D, int w) const
+    {
+        if (!fast_mode)
+            return 0;
+
+        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
+        int dx = 0;
+
+        __m128 v_025 = _mm_set1_ps(0.25f);
+
+        if (cn == 1)
+        {
+            // _mm_shuffle_ps needs a compile-time immediate, so keep these const
+            const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
+                    v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);
+
+                __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
+                    _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
+                __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
+                    _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));
+
+                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
+            }
+        }
+        else if (cn == 4)
+        {
+            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
+            {
+                __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
+                __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
+
+                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
+            }
+        }
+
+        return dx;
+    }
+
+private:
+    int scale_x, scale_y;
+    int cn;
+    bool fast_mode;
+    int step;
+};
 
 #else
 
diff --git a/modules/imgproc/test/test_imgwarp.cpp b/modules/imgproc/test/test_imgwarp.cpp
index 34505c4..176c990 100644
--- a/modules/imgproc/test/test_imgwarp.cpp
+++ b/modules/imgproc/test/test_imgwarp.cpp
@@ -1595,7 +1595,10 @@ void resizeArea(const cv::Mat & src, cv::Mat & dst)
 TEST(Resize, Area_half)
 {
     const int size = 1000;
-    int types[] = { CV_8UC1, CV_8UC4, CV_16UC1, CV_16UC4, CV_16SC1, CV_16SC4, CV_32FC1, CV_32FC4 };
+    int types[] = { CV_8UC1, CV_8UC4,
+                    CV_16UC1, CV_16UC4,
+                    CV_16SC1, CV_16SC3, CV_16SC4,
+                    CV_32FC1, CV_32FC4 };
 
     cv::RNG rng(17);
 
-- 
2.7.4