From 60f3c657cc0235650b630be78105fc47d37385e7 Mon Sep 17 00:00:00 2001 From: "qiankun.miao" Date: Thu, 4 Dec 2014 06:27:03 -0800 Subject: [PATCH] Add SSSE3 acceleration for S32_D16_filter_DXDY With this CL, related nanobench can be improved for 565 config. bitmap_BGRA_8888_scale_rotate_bilerp 115us -> 70.5us 0.61x bitmap_BGRA_8888_update_volatile_scale_rotate_bilerp 115us -> 70.5us 0.61x bitmap_BGRA_8888_update_scale_rotate_bilerp 112us -> 68us 0.6x BUG=skia: Committed: https://skia.googlesource.com/skia/+/45a05780867a06b9f8a8d5240cf6c5d5a2c15a35 Review URL: https://codereview.chromium.org/773753002 --- src/core/SkBitmapProcState.h | 2 ++ src/opts/SkBitmapProcState_opts_SSSE3.cpp | 34 +++++++++++++++++++++++-------- src/opts/SkBitmapProcState_opts_SSSE3.h | 4 ++++ src/opts/opts_check_x86.cpp | 11 ++++++---- 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h index dd1f0bf..add5bf4 100644 --- a/src/core/SkBitmapProcState.h +++ b/src/core/SkBitmapProcState.h @@ -204,6 +204,8 @@ void ClampX_ClampY_nofilter_affine(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y); void S32_D16_filter_DX(const SkBitmapProcState& s, const uint32_t* xy, int count, uint16_t* colors); +void S32_D16_filter_DXDY(const SkBitmapProcState& s, + const uint32_t* xy, int count, uint16_t* colors); void highQualityFilter32(const SkBitmapProcState &s, int x, int y, SkPMColor *SK_RESTRICT colors, int count); diff --git a/src/opts/SkBitmapProcState_opts_SSSE3.cpp b/src/opts/SkBitmapProcState_opts_SSSE3.cpp index 165f1f5..99bc192 100644 --- a/src/opts/SkBitmapProcState_opts_SSSE3.cpp +++ b/src/opts/SkBitmapProcState_opts_SSSE3.cpp @@ -6,6 +6,7 @@ */ #include "SkBitmapProcState_opts_SSSE3.h" +#include "SkColorPriv.h" #include "SkPaint.h" #include "SkUtils.h" @@ -720,17 +721,28 @@ void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s, } void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, - const uint32_t* xy, - int count, uint32_t* colors) { + const uint32_t* xy, + int count, uint32_t* colors) { S32_generic_D32_filter_DXDY_SSSE3(s, xy, count, colors); } void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, - const uint32_t* xy, - int count, uint32_t* colors) { + const uint32_t* xy, + int count, uint32_t* colors) { S32_generic_D32_filter_DXDY_SSSE3(s, xy, count, colors); } +void S32_D16_filter_DXDY_SSSE3(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint16_t* colors) { + SkASSERT(64 >= count); + SkAutoSTMalloc<64, uint32_t> colors32(count); + S32_generic_D32_filter_DXDY_SSSE3(s, xy, count, colors32); + for(int i = 0; i < count; i++) { + *colors++ = SkPixel32ToPixel16(colors32[i]); + } +} + #else // SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSSE3 void S32_opaque_D32_filter_DX_SSSE3(const SkBitmapProcState& s, @@ -746,14 +758,20 @@ void S32_alpha_D32_filter_DX_SSSE3(const SkBitmapProcState& s, } void S32_opaque_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, - const uint32_t* xy, - int count, uint32_t* colors) { + const uint32_t* xy, + int count, uint32_t* colors) { sk_throw(); } void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, - const uint32_t* xy, - int count, uint32_t* colors) { + const uint32_t* xy, + int count, uint32_t* colors) { + sk_throw(); +} + +void S32_D16_filter_DXDY_SSSE3(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint16_t* colors) { sk_throw(); } diff --git a/src/opts/SkBitmapProcState_opts_SSSE3.h b/src/opts/SkBitmapProcState_opts_SSSE3.h index 9fd074a..74504d8 100644 --- a/src/opts/SkBitmapProcState_opts_SSSE3.h +++ b/src/opts/SkBitmapProcState_opts_SSSE3.h @@ -23,4 +23,8 @@ void S32_alpha_D32_filter_DXDY_SSSE3(const SkBitmapProcState& s, const uint32_t* xy, int count, uint32_t* colors); +void S32_D16_filter_DXDY_SSSE3(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint16_t* colors); + #endif diff --git a/src/opts/opts_check_x86.cpp b/src/opts/opts_check_x86.cpp index 8fec2ba..34aae92 100644 --- a/src/opts/opts_check_x86.cpp +++ b/src/opts/opts_check_x86.cpp @@ -149,26 +149,27 @@ void SkBitmapProcState::platformProcs() { if (!supports_simd(SK_CPU_SSE_LEVEL_SSE2)) { return; } + const bool ssse3 = supports_simd(SK_CPU_SSE_LEVEL_SSSE3); /* Check fSampleProc32 */ if (fSampleProc32 == S32_opaque_D32_filter_DX) { - if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + if (ssse3) { fSampleProc32 = S32_opaque_D32_filter_DX_SSSE3; } else { fSampleProc32 = S32_opaque_D32_filter_DX_SSE2; } } else if (fSampleProc32 == S32_opaque_D32_filter_DXDY) { - if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + if (ssse3) { fSampleProc32 = S32_opaque_D32_filter_DXDY_SSSE3; } } else if (fSampleProc32 == S32_alpha_D32_filter_DX) { - if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + if (ssse3) { fSampleProc32 = S32_alpha_D32_filter_DX_SSSE3; } else { fSampleProc32 = S32_alpha_D32_filter_DX_SSE2; } } else if (fSampleProc32 == S32_alpha_D32_filter_DXDY) { - if (supports_simd(SK_CPU_SSE_LEVEL_SSSE3)) { + if (ssse3) { fSampleProc32 = S32_alpha_D32_filter_DXDY_SSSE3; } } @@ -176,6 +177,8 @@ void SkBitmapProcState::platformProcs() { /* Check fSampleProc16 */ if (fSampleProc16 == S32_D16_filter_DX) { fSampleProc16 = S32_D16_filter_DX_SSE2; + } else if (ssse3 && fSampleProc16 == S32_D16_filter_DXDY) { + fSampleProc16 = S32_D16_filter_DXDY_SSSE3; } /* Check fMatrixProc */ -- 2.7.4