From: tomhudson@google.com
Date: Wed, 22 Feb 2012 18:30:43 +0000 (+0000)
Subject: SSE2 version of ClampX_ClampY_{no}filter_scale; yields 10-20% speedup in
X-Git-Tag: accepted/tizen/5.0/unified/20181102.025319~16807
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=06a7313430728b18f2ed92f14b189f3320fb8d13;p=platform%2Fupstream%2FlibSkiaSharp.git

SSE2 version of ClampX_ClampY_{no}filter_scale; yields 10-20% speedup in
bitmap_8888 benchmarks on top of last week's SSSE3 patch. Thanks to Jin Yang.

http://codereview.appspot.com/5685055/

git-svn-id: http://skia.googlecode.com/svn/trunk@3227 2bbb7eff-a529-9590-31e7-b0007b416f81
---

diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index 98c8782..fb4957e 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -136,5 +136,9 @@ void S32_opaque_D32_filter_DX(const SkBitmapProcState& s,
                               const uint32_t xy[], int count, SkPMColor colors[]);
 void S32_alpha_D32_filter_DX(const SkBitmapProcState& s,
                              const uint32_t xy[], int count, SkPMColor colors[]);
+void ClampX_ClampY_filter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                int count, int x, int y);
+void ClampX_ClampY_nofilter_scale(const SkBitmapProcState& s, uint32_t xy[],
+                                  int count, int x, int y);
 
 #endif
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 9a0a013..10abd17 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -232,3 +232,269 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
         *colors++ = _mm_cvtsi128_si32(sum);
     } while (--count > 0);
 }
+
+/* Packs one filtered coordinate for the clamp tile mode into 32 bits:
+ *   bits 31..18  SkClampMax(f >> 16, max)
+ *   bits 17..14  (f >> 12) & 0xF, the top four fractional bits of f
+ *   bits 13..0   SkClampMax((f + one) >> 16, max)
+ * NOTE(review): the layout only holds if max fits in 14 bits - confirm.
+ */
+static inline uint32_t ClampX_ClampY_pack_filter(SkFixed f, unsigned max,
+                                                 SkFixed one) {
+    unsigned i = SkClampMax(f >> 16, max);
+    i = (i << 4) | ((f >> 12) & 0xF);
+    return (i << 14) | SkClampMax((f + one) >> 16, max);
+}
+
+/* SSE version of ClampX_ClampY_filter_scale()
+ * portable version is in core/SkBitmapProcState_matrix.h
+ *
+ * Stores one packed Y entry followed by count packed X entries in xy, all
+ * in the ClampX_ClampY_pack_filter() layout. (x, y) is offset by half a
+ * pixel and mapped through s.fInvProc to find the starting source position.
+ */
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+    SkASSERT(s.fInvKy == 0);
+
+    const unsigned maxX = s.fBitmap->width() - 1;
+    const SkFixed one = s.fFilterOneX;
+    const SkFixed dx = s.fInvSx;
+    SkFixed fx;
+
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                              SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    // compute our two Y values up front
+    *xy++ = ClampX_ClampY_pack_filter(fy, maxY, s.fFilterOneY);
+    // now initialize fx
+    fx = SkScalarToFixed(pt.fX) - (one >> 1);
+
+    // test if we don't need to apply the tile proc
+    if (dx > 0 && (unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) < maxX) {
+        if (count >= 4) {
+            // SSE version of decal_filter_scale
+            while ((size_t(xy) & 0x0F) != 0) {
+                SkASSERT((fx >> (16 + 14)) == 0);
+                *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+                fx += dx;
+                count--;
+            }
+
+            __m128i wide_1 = _mm_set1_epi32(1);
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                            fx + dx, fx);
+
+            while (count >= 4) {
+                __m128i wide_out;
+
+                wide_out = _mm_slli_epi32(_mm_srai_epi32(wide_fx, 12), 14);
+                wide_out = _mm_or_si128(wide_out, _mm_add_epi32(
+                                        _mm_srai_epi32(wide_fx, 16), wide_1));
+
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_out);
+
+                xy += 4;
+                fx += dx * 4;
+                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        while (count-- > 0) {
+            SkASSERT((fx >> (16 + 14)) == 0);
+            *xy++ = (fx >> 12 << 14) | ((fx >> 16) + 1);
+            fx += dx;
+        }
+    } else {
+        // SSE2 only has signed 16-bit integer min & max, so the SIMD clamp
+        // below is only valid while maxX (bitmap width - 1) fits in a signed
+        // 16-bit lane, i.e. maxX <= 0x7FFF. Wider bitmaps are rare in
+        // practice and fall through to the scalar loop at the end.
+        if ((count >= 4) && (maxX <= 0x7FFF)) {
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+                fx += dx;
+                count--;
+            }
+
+            __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                            fx + dx, fx);
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_one = _mm_set1_epi32(one);
+            __m128i wide_maxX = _mm_set1_epi32(maxX);
+            __m128i wide_mask = _mm_set1_epi32(0xF);
+
+            while (count >= 4) {
+                __m128i wide_i;
+                __m128i wide_lo;
+                __m128i wide_fx1;
+
+                // i = SkClampMax(f>>16,maxX)
+                wide_i = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
+                                       _mm_setzero_si128());
+                wide_i = _mm_min_epi16(wide_i, wide_maxX);
+
+                // i<<4 | TILEX_LOW_BITS(fx)
+                wide_lo = _mm_srli_epi32(wide_fx, 12);
+                wide_lo = _mm_and_si128(wide_lo, wide_mask);
+                wide_i = _mm_slli_epi32(wide_i, 4);
+                wide_i = _mm_or_si128(wide_i, wide_lo);
+
+                // i<<14
+                wide_i = _mm_slli_epi32(wide_i, 14);
+
+                // SkClampMax(((f+one))>>16,max)
+                wide_fx1 = _mm_add_epi32(wide_fx, wide_one);
+                wide_fx1 = _mm_max_epi16(_mm_srli_epi32(wide_fx1, 16),
+                                         _mm_setzero_si128());
+                wide_fx1 = _mm_min_epi16(wide_fx1, wide_maxX);
+
+                // final combination
+                wide_i = _mm_or_si128(wide_i, wide_fx1);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
+
+                wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
+                fx += dx * 4;
+                xy += 4;
+                count -= 4;
+            } // while count >= 4
+        } // if count >= 4
+
+        while (count-- > 0) {
+            *xy++ = ClampX_ClampY_pack_filter(fx, maxX, one);
+            fx += dx;
+        }
+    }
+}
+
+/* SSE version of ClampX_ClampY_nofilter_scale()
+ * portable version is in core/SkBitmapProcState_matrix.h
+ *
+ * Stores the clamped Y index as one 32-bit entry, then count clamped X
+ * indices as consecutive 16-bit values, two per 32-bit slot of xy.
+ */
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                    uint32_t xy[], int count, int x, int y) {
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask)) == 0);
+
+    // we store y, x, x, x, x, x
+    const unsigned maxX = s.fBitmap->width() - 1;
+    SkFixed fx;
+    SkPoint pt;
+    s.fInvProc(*s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
+                              SkIntToScalar(y) + SK_ScalarHalf, &pt);
+    fx = SkScalarToFixed(pt.fY);
+    const unsigned maxY = s.fBitmap->height() - 1;
+    *xy++ = SkClampMax(fx >> 16, maxY);
+    fx = SkScalarToFixed(pt.fX);
+
+    if (0 == maxX) {
+        // all of the following X values must be 0
+        memset(xy, 0, count * sizeof(uint16_t));
+        return;
+    }
+
+    const SkFixed dx = s.fInvSx;
+
+    // test if we don't need to apply the tile proc
+    if ((unsigned)(fx >> 16) <= maxX &&
+        (unsigned)((fx + dx * (count - 1)) >> 16) <= maxX) {
+        // SSE version of decal_nofilter_scale
+        if (count >= 8) {
+            while (((size_t)xy & 0x0F) != 0) {
+                *xy++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16);
+                fx += 2 * dx;
+                count -= 2;
+            }
+
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkToU16(fx >> 16);
+            fx += dx;
+        }
+    } else {
+        // SSE2 only has signed 16-bit integer min & max, so the SIMD clamp
+        // below is only valid while maxX (bitmap width - 1) fits in a signed
+        // 16-bit lane, i.e. maxX <= 0x7FFF. Wider bitmaps are rare in
+        // practice and fall through to the scalar loop at the end.
+        if ((count >= 8) && (maxX <= 0x7FFF)) {
+            while (((size_t)xy & 0x0F) != 0) {
+                // Pack two clamped indices per 32-bit slot, in the same
+                // order as the decal path above and the SIMD loop below.
+                *xy++ = pack_two_shorts(SkClampMax(fx >> 16, maxX),
+                                        SkClampMax((fx + dx) >> 16, maxX));
+                fx += 2 * dx;
+                count -= 2;
+            }
+
+            __m128i wide_dx4 = _mm_set1_epi32(dx * 4);
+            __m128i wide_dx8 = _mm_add_epi32(wide_dx4, wide_dx4);
+
+            __m128i wide_low = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                             fx + dx, fx);
+            __m128i wide_high = _mm_add_epi32(wide_low, wide_dx4);
+            __m128i wide_maxX = _mm_set1_epi32(maxX);
+
+            while (count >= 8) {
+                __m128i wide_out_low = _mm_srli_epi32(wide_low, 16);
+                __m128i wide_out_high = _mm_srli_epi32(wide_high, 16);
+
+                wide_out_low = _mm_max_epi16(wide_out_low,
+                                          _mm_setzero_si128());
+                wide_out_low = _mm_min_epi16(wide_out_low, wide_maxX);
+                wide_out_high = _mm_max_epi16(wide_out_high,
+                                           _mm_setzero_si128());
+                wide_out_high = _mm_min_epi16(wide_out_high, wide_maxX);
+
+                __m128i wide_result = _mm_packs_epi32(wide_out_low,
+                                                      wide_out_high);
+                _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_result);
+
+                wide_low = _mm_add_epi32(wide_low, wide_dx8);
+                wide_high = _mm_add_epi32(wide_high, wide_dx8);
+
+                xy += 4;
+                fx += dx * 8;
+                count -= 8;
+            }
+        } // if count >= 8
+
+        uint16_t* xx = reinterpret_cast<uint16_t*>(xy);
+        while (count-- > 0) {
+            *xx++ = SkClampMax(fx >> 16, maxX);
+            fx += dx;
+        }
+    }
+}
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 9e56642..0f276b8 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -17,3 +17,7 @@ void S32_alpha_D32_filter_DX_SSE2(const SkBitmapProcState& s,
                                   int count, uint32_t* colors);
 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
                   SkPMColor color);
+void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s, uint32_t xy[],
+                                     int count, int x, int y);
+void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
+                                       uint32_t xy[], int count, int x, int y);
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index 3003d78..db5e4e8 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -85,19 +85,27 @@ static bool cachedHasSSSE3() {
 }
 
 void SkBitmapProcState::platformProcs() {
-    if (cachedHasSSSE3()) {
-        if (fSampleProc32 == S32_opaque_D32_filter_DX) {
-            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
-        } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
-            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
-        }
-    } else if (cachedHasSSE2()) {
+    if (cachedHasSSSE3()) {
+        if (fSampleProc32 == S32_opaque_D32_filter_DX) {
+            fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3;
+        } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
+            fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3;
+        }
+    } else if (cachedHasSSE2()) {
         if (fSampleProc32 == S32_opaque_D32_filter_DX) {
             fSampleProc32 = S32_opaque_D32_filter_DX_SSE2;
         } else if (fSampleProc32 == S32_alpha_D32_filter_DX) {
             fSampleProc32 = S32_alpha_D32_filter_DX_SSE2;
         }
     }
+
+    if (cachedHasSSSE3() || cachedHasSSE2()) {
+        if (fMatrixProc == ClampX_ClampY_filter_scale) {
+            fMatrixProc = ClampX_ClampY_filter_scale_SSE2;
+        } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
+            fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
+        }
+    }
 }
 
 static SkBlitRow::Proc32 platform_32_procs[] = {