From: tomhudson@google.com
Date: Tue, 28 Feb 2012 15:41:49 +0000 (+0000)
Subject: SSE2 version of ClampX_ClampY_{no}filter_affine, courtesy of Jin Yang.
X-Git-Tag: accepted/tizen/5.0/unified/20181102.025319~16765
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=5efaf268931d01498f4f1af63c556d811e5d5797;p=platform%2Fupstream%2FlibSkiaSharp.git

SSE2 version of ClampX_ClampY_{no}filter_affine, courtesy of Jin Yang.
Speeds up drawing rotated bitmaps by 20-30%.
http://codereview.appspot.com/5700076/

git-svn-id: http://skia.googlecode.com/svn/trunk@3272 2bbb7eff-a529-9590-31e7-b0007b416f81
---

diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h
index fb4957e..c04992b 100644
--- a/src/core/SkBitmapProcState.h
+++ b/src/core/SkBitmapProcState.h
@@ -140,5 +140,9 @@ void ClampX_ClampY_filter_scale(const SkBitmapProcState& s,
                                 uint32_t xy[], int count, int x, int y);
 void ClampX_ClampY_nofilter_scale(const SkBitmapProcState& s,
                                   uint32_t xy[], int count, int x, int y);
+void ClampX_ClampY_filter_affine(const SkBitmapProcState& s,
+                                 uint32_t xy[], int count, int x, int y);
+void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s,
+                                   uint32_t xy[], int count, int x, int y);
 
 #endif
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp
index 10abd17..1852c66 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.cpp
+++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp
@@ -483,3 +483,177 @@ void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
         }
     }
 }
+
+/* SSE2 version of ClampX_ClampY_filter_affine();
+ * portable version is in core/SkBitmapProcState_matrix.h
+ *
+ * Maps device pixel (x, y) through s.fInvProc, then advances by
+ * (s.fInvSx, s.fInvKy) per pixel. Every pixel produces two uint32_t entries
+ * in xy[], the Y entry first and then the X entry. The SIMD loop builds each
+ * entry as
+ *   ((clamp(f >> 16) << 4 | ((f >> 12) & 0xF)) << 14) | clamp((f + one) >> 16)
+ * with clamp() limiting to [0, max]; leftover pixels are packed by
+ * ClampX_ClampY_pack_filter().
+ */
+void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
+                                      uint32_t xy[], int count, int x, int y) {
+    SkPoint srcPt;
+    s.fInvProc(*s.fInvMatrix,
+               SkIntToScalar(x) + SK_ScalarHalf,
+               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+
+    SkFixed oneX = s.fFilterOneX;
+    SkFixed oneY = s.fFilterOneY;
+    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
+    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
+    SkFixed dx = s.fInvSx;
+    SkFixed dy = s.fInvKy;
+    unsigned maxX = s.fBitmap->width() - 1;
+    unsigned maxY = s.fBitmap->height() - 1;
+
+    // _mm_max_epi16/_mm_min_epi16 compare *signed* 16-bit lanes and wide_max
+    // holds both maxX and maxY, so each limit has to fit in 15 bits. A larger
+    // limit would be misread and the clamp would produce wrong indices; such
+    // bitmaps are left to the scalar loop at the bottom.
+    if (count >= 2 && maxX <= 0x7FFF && maxY <= 0x7FFF) {
+        SkFixed dx2 = dx + dx;
+        SkFixed dy2 = dy + dy;
+
+        // Memory order of the lanes: fy, fx, fy + dy, fx + dx.
+        __m128i wide_f = _mm_set_epi32(fx + dx, fy + dy, fx, fy);
+        __m128i wide_d2 = _mm_set_epi32(dx2, dy2, dx2, dy2);
+        __m128i wide_one = _mm_set_epi32(oneX, oneY, oneX, oneY);
+        __m128i wide_max = _mm_set_epi32(maxX, maxY, maxX, maxY);
+        __m128i wide_mask = _mm_set1_epi32(0xF);
+
+        while (count >= 2) {
+            // i = SkClampMax(f>>16,max)
+            __m128i wide_i = _mm_max_epi16(_mm_srli_epi32(wide_f, 16),
+                                           _mm_setzero_si128());
+            wide_i = _mm_min_epi16(wide_i, wide_max);
+
+            // i<<4 | TILEX_LOW_BITS(f)
+            __m128i wide_lo = _mm_srli_epi32(wide_f, 12);
+            wide_lo = _mm_and_si128(wide_lo, wide_mask);
+            wide_i = _mm_slli_epi32(wide_i, 4);
+            wide_i = _mm_or_si128(wide_i, wide_lo);
+
+            // i<<14
+            wide_i = _mm_slli_epi32(wide_i, 14);
+
+            // SkClampMax(((f+one))>>16,max)
+            __m128i wide_f1 = _mm_add_epi32(wide_f, wide_one);
+            wide_f1 = _mm_max_epi16(_mm_srli_epi32(wide_f1, 16),
+                                    _mm_setzero_si128());
+            wide_f1 = _mm_min_epi16(wide_f1, wide_max);
+
+            // final combination
+            wide_i = _mm_or_si128(wide_i, wide_f1);
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(xy), wide_i);
+
+            wide_f = _mm_add_epi32(wide_f, wide_d2);
+
+            // keep the scalar counters in step for the tail loop
+            fx += dx2;
+            fy += dy2;
+            xy += 4;
+            count -= 2;
+        } // while count >= 2
+    } // if count >= 2
+
+    // scalar tail (and fallback): one pixel per iteration, Y entry first
+    while (count-- > 0) {
+        *xy++ = ClampX_ClampY_pack_filter(fy, maxY, oneY);
+        fy += dy;
+        *xy++ = ClampX_ClampY_pack_filter(fx, maxX, oneX);
+        fx += dx;
+    }
+}
+
+/* SSE2 version of ClampX_ClampY_nofilter_affine();
+ * portable version is in core/SkBitmapProcState_matrix.h
+ *
+ * Maps device pixel (x, y) through s.fInvProc, then advances by
+ * (s.fInvSx, s.fInvKy) per pixel. Every pixel produces one uint32_t in xy[]:
+ *   (clamp(fy >> 16, maxY) << 16) | clamp(fx >> 16, maxX)
+ */
+void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
+                                        uint32_t xy[], int count, int x, int y) {
+    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
+    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
+                             SkMatrix::kScale_Mask |
+                             SkMatrix::kAffine_Mask)) == 0);
+
+    SkPoint srcPt;
+    s.fInvProc(*s.fInvMatrix,
+               SkIntToScalar(x) + SK_ScalarHalf,
+               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
+
+    SkFixed fx = SkScalarToFixed(srcPt.fX);
+    SkFixed fy = SkScalarToFixed(srcPt.fY);
+    SkFixed dx = s.fInvSx;
+    SkFixed dy = s.fInvKy;
+    int maxX = s.fBitmap->width() - 1;
+    int maxY = s.fBitmap->height() - 1;
+
+    // _mm_max_epi16/_mm_min_epi16 compare *signed* 16-bit lanes, so both
+    // limits have to fit in 15 bits for the vector clamp to be valid; larger
+    // bitmaps are left to the scalar loop at the bottom.
+    if (count >= 4 && maxX <= 0x7FFF && maxY <= 0x7FFF) {
+        // peel pixels until xy is 16-byte aligned for _mm_store_si128
+        while (((size_t)xy & 0x0F) != 0) {
+            *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
+                    SkClampMax(fx >> 16, maxX);
+            fx += dx;
+            fy += dy;
+            count--;
+        }
+
+        SkFixed dx4 = dx * 4;
+        SkFixed dy4 = dy * 4;
+
+        __m128i wide_fx = _mm_set_epi32(fx + dx * 3, fx + dx * 2,
+                                        fx + dx, fx);
+        __m128i wide_fy = _mm_set_epi32(fy + dy * 3, fy + dy * 2,
+                                        fy + dy, fy);
+        __m128i wide_dx4 = _mm_set1_epi32(dx4);
+        __m128i wide_dy4 = _mm_set1_epi32(dy4);
+
+        __m128i wide_maxX = _mm_set1_epi32(maxX);
+        __m128i wide_maxY = _mm_set1_epi32(maxY);
+
+        while (count >= 4) {
+            // SkClampMax(fx>>16,maxX)
+            __m128i wide_lo = _mm_max_epi16(_mm_srli_epi32(wide_fx, 16),
+                                            _mm_setzero_si128());
+            wide_lo = _mm_min_epi16(wide_lo, wide_maxX);
+
+            // SkClampMax(fy>>16,maxY)
+            __m128i wide_hi = _mm_max_epi16(_mm_srli_epi32(wide_fy, 16),
+                                            _mm_setzero_si128());
+            wide_hi = _mm_min_epi16(wide_hi, wide_maxY);
+
+            // final combination
+            __m128i wide_i = _mm_or_si128(_mm_slli_epi32(wide_hi, 16),
+                                          wide_lo);
+            _mm_store_si128(reinterpret_cast<__m128i*>(xy), wide_i);
+
+            wide_fx = _mm_add_epi32(wide_fx, wide_dx4);
+            wide_fy = _mm_add_epi32(wide_fy, wide_dy4);
+
+            // keep the scalar counters in step for the tail loop
+            fx += dx4;
+            fy += dy4;
+            xy += 4;
+            count -= 4;
+        } // while count >= 4
+    } // if count >= 4
+
+    // scalar tail (and fallback): one pixel per iteration
+    while (count-- > 0) {
+        *xy++ = (SkClampMax(fy >> 16, maxY) << 16) |
+                SkClampMax(fx >> 16, maxX);
+        fx += dx;
+        fy += dy;
+    }
+}
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
index 0f276b8..3fdf696 100644
--- a/src/opts/SkBitmapProcState_opts_SSE2.h
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -21,3 +21,7 @@ void ClampX_ClampY_filter_scale_SSE2(const SkBitmapProcState& s,
                                      uint32_t xy[], int count, int x, int y);
 void ClampX_ClampY_nofilter_scale_SSE2(const SkBitmapProcState& s,
                                        uint32_t xy[], int count, int x, int y);
+void ClampX_ClampY_filter_affine_SSE2(const SkBitmapProcState& s,
+                                      uint32_t xy[], int count, int x, int y);
+void ClampX_ClampY_nofilter_affine_SSE2(const SkBitmapProcState& s,
+                                        uint32_t xy[], int count, int x, int y);
diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp
index db5e4e8..be1b4a1 100644
--- a/src/opts/opts_check_SSE2.cpp
+++ b/src/opts/opts_check_SSE2.cpp
@@ -105,6 +105,12 @@ void SkBitmapProcState::platformProcs() {
         } else if (fMatrixProc == ClampX_ClampY_nofilter_scale) {
             fMatrixProc = ClampX_ClampY_nofilter_scale_SSE2;
         }
+
+        if (fMatrixProc == ClampX_ClampY_filter_affine) {
+            fMatrixProc = ClampX_ClampY_filter_affine_SSE2;
+        } else if (fMatrixProc == ClampX_ClampY_nofilter_affine) {
+            fMatrixProc = ClampX_ClampY_nofilter_affine_SSE2;
+        }
     }
 }
 