From: Ilya Lavrenov Date: Tue, 11 Dec 2012 11:14:50 +0000 (+0400) Subject: sse2 version of resize area fast for types CV_(8, 16)UC(1, 3, 4) X-Git-Tag: accepted/tizen/6.0/unified/20201030.111113~4062^2~8 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d1ca9341151fe63e052809bef6f6552df1c3d1c5;p=platform%2Fupstream%2Fopencv.git sse2 version of resize area fast for types CV_(8, 16)UC(1, 3, 4) --- diff --git a/modules/imgproc/perf/perf_resize.cpp b/modules/imgproc/perf/perf_resize.cpp index 82bf0d3..98e4bc2 100644 --- a/modules/imgproc/perf/perf_resize.cpp +++ b/modules/imgproc/perf/perf_resize.cpp @@ -71,7 +71,7 @@ typedef TestBaseWithParam MatInfo_Size_Scale; PERF_TEST_P(MatInfo_Size_Scale, ResizeAreaFast, testing::Combine( - testing::Values(CV_8UC1, CV_8UC4), + testing::Values(CV_8UC1, CV_8UC3, CV_8UC4, CV_16UC1, CV_16UC3, CV_16UC4), testing::Values(szVGA, szqHD, sz720p, sz1080p), testing::Values(2) ) diff --git a/modules/imgproc/src/imgwarp.cpp b/modules/imgproc/src/imgwarp.cpp index c250659..0de9f59 100644 --- a/modules/imgproc/src/imgwarp.cpp +++ b/modules/imgproc/src/imgwarp.cpp @@ -1241,16 +1241,163 @@ static void resizeGeneric_( const Mat& src, Mat& dst, template struct ResizeAreaFastNoVec { - ResizeAreaFastNoVec(int /*_scale_x*/, int /*_scale_y*/, - int /*_cn*/, int /*_step*//*, const int**/ /*_ofs*/) { } - int operator() (const T* /*S*/, T* /*D*/, int /*w*/) const { return 0; } + ResizeAreaFastNoVec(int, int) { } + ResizeAreaFastNoVec(int, int, int, int) { } + int operator() (const T*, T*, int) const + { return 0; } }; -template +#if CV_SSE2 +class ResizeAreaFastVec_SIMD_8u +{ +public: + ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const uchar* S, uchar* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const uchar* S0 = S; + const uchar* S1 = S0 + step; + __m128i masklow = _mm_set1_epi16(0x00ff); + __m128i zero = _mm_setzero_si128(); + + if (cn == 1) + { + for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)S0); + __m128i s1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 1)); + s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 1))); + + _mm_storel_epi64((__m128i*)D, _mm_packus_epi16(_mm_and_si128(s, masklow), zero)); + } + } + else if (cn == 3) + for ( ; dx < w - 6; dx += 6, S0 += 12, S1 += 12, D += 6) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)S0); + __m128i s1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 3)); + s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 3))); + + _mm_storel_epi64((__m128i*)D, s); + _mm_storel_epi64((__m128i*)(D+3), _mm_srli_si128(s, 6)); + } + else + { + CV_Assert(cn == 4); + for ( ; dx < w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)S0); + __m128i s1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s = _mm_avg_epu8(s0, _mm_srli_si128(s0, 4)); + s = _mm_avg_epu8(s, _mm_avg_epu8(s1, _mm_srli_si128(s1, 4))); + + _mm_storel_epi64((__m128i*)D, s); + _mm_storel_epi64((__m128i*)(D+4), _mm_srli_si128(s, 8)); + } + } + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +class ResizeAreaFastVec_SIMD_16u +{ +public: + ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : + cn(_cn), step(_step) + { + use_simd = checkHardwareSupport(CV_CPU_SSE2); + } + + int operator() (const ushort* S, ushort* D, int w) const + { + if (!use_simd) + return 0; + + int dx = 0; + const ushort* S0 = (const ushort*)S; + const ushort* S1 = (const ushort*)(S0 + step); + __m128i masklow = _mm_set1_epi32(0x0000ffff); + __m128i zero = _mm_setzero_si128(); + + if (cn == 1) + { + for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)S0); + __m128i s1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 2)); + s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 2))); + + s = _mm_and_si128(s, masklow); + s = _mm_packs_epi32(s, zero); + _mm_storel_epi64((__m128i*)D, s); + } + } + else if (cn == 3) + for ( ; dx < w - 3; dx += 3, S0 += 6, S1 += 6, D += 3) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)S0); + __m128i s1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 6)); + s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 6))); + + _mm_storel_epi64((__m128i*)D, s); + } + else + { + CV_Assert(cn == 4); + for ( ; dx < w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) + { + __m128i s0 = _mm_loadu_si128((const __m128i*)S0); + __m128i s1 = _mm_loadu_si128((const __m128i*)S1); + + __m128i s = _mm_avg_epu16(s0, _mm_srli_si128(s0, 8)); + s = _mm_avg_epu16(s, _mm_avg_epu16(s1, _mm_srli_si128(s1, 8))); + + _mm_storel_epi64((__m128i*)(D), s); + } + } + + return dx; + } + +private: + int cn; + int step; + bool use_simd; +}; + +#else +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_8u; +typedef ResizeAreaFastNoVec ResizeAreaFastVec_SIMD_16u; +#endif + +template struct ResizeAreaFastVec { - ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step/*, const int* _ofs*/) : - scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step)/*, ofs(_ofs)*/ + ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) : + scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step) { fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); } @@ -1261,7 +1408,7 @@ struct ResizeAreaFastVec return 0; const T* nextS = (const T*)((const uchar*)S + step); - int dx = 0; + int dx = vecOp(S, D, w); if (cn == 1) for( ; dx < w; ++dx ) @@ -1279,7 +1426,7 @@ struct ResizeAreaFastVec } else { - assert(cn == 4); + CV_Assert(cn == 4); for( ; dx < w; dx += 4 ) { int index = dx*2; @@ -1298,6 +1445,7 @@ private: int cn; bool fast_mode; int step; + SIMDVecOp vecOp; }; template @@ -1702,10 +1850,10 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, static ResizeAreaFastFunc areafast_tab[] = { - resizeAreaFast_ >, + resizeAreaFast_ >, 0, - resizeAreaFast_ >, - resizeAreaFast_ >, + resizeAreaFast_ >, + resizeAreaFast_ > >, 0, resizeAreaFast_ >, resizeAreaFast_ >, @@ -1764,9 +1912,7 @@ void cv::resize( InputArray _src, OutputArray _dst, Size dsize, // in case of scale_x && scale_y is equal to 2 // INTER_AREA (fast) also is equal to INTER_LINEAR if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) - { interpolation = INTER_AREA; - } // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1). // In other cases it is emulated using some variant of bilinear interpolation