From 9272761b22746d2d22439c26f5555028f8e824da Mon Sep 17 00:00:00 2001 From: "senorblanco@chromium.org" Date: Wed, 4 Nov 2009 20:51:06 +0000 Subject: [PATCH] SSE2 optimizations for 32bit blending blitters. This CL implements SSE2 optimizations for 3 of the 32bit blending blitters. It uses CPUID to detect for SSE2 at runtime. In order to accomodate runtime detection, it changes the platform procs from static arrays to static functions. It also includes an implementation of SkTime for Win32. http://codereview.appspot.com/144072 git-svn-id: http://skia.googlecode.com/svn/trunk@418 2bbb7eff-a529-9590-31e7-b0007b416f81 --- include/core/SkBlitRow.h | 17 +- src/core/SkBlitRow_D16.cpp | 4 +- src/core/SkBlitRow_D32.cpp | 2 +- src/opts/SkBlitRow_opts_SSE2.cpp | 336 +++++++++++++++++++++++++++++++++++++++ src/opts/SkBlitRow_opts_arm.cpp | 17 +- src/opts/SkBlitRow_opts_none.cpp | 42 ++--- src/ports/SkTime_win.cpp | 46 ++++++ 7 files changed, 418 insertions(+), 46 deletions(-) create mode 100644 src/opts/SkBlitRow_opts_SSE2.cpp create mode 100644 src/ports/SkTime_win.cpp diff --git a/include/core/SkBlitRow.h b/include/core/SkBlitRow.h index dbbd84d..6560d48 100644 --- a/include/core/SkBlitRow.h +++ b/include/core/SkBlitRow.h @@ -65,18 +65,21 @@ public: Color32(row, row, count, color); } + /** These static functions are called by the Factory and Factory32 + functions, and should return either NULL, or a + platform-specific function-ptr to be used in place of the + system default. + */ + + static const Proc32 PlatformProcs32(unsigned flags); + static const Proc PlatformProcs565(unsigned flags); + static const Proc PlatformProcs4444(unsigned flags); + private: enum { kFlags16_Mask = 7, kFlags32_Mask = 3 }; - /** These global arrays are indexed using the flags parameter to Factory, - and contain either NULL, or a platform-specific function-ptr to be used - in place of the system default. - */ - static const Proc gPlatform_565_Procs[]; - static const Proc gPlatform_4444_Procs[]; - static const Proc32 gPlatform_Procs32[]; }; #endif diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp index 66ac90e..07c42ce 100644 --- a/src/core/SkBlitRow_D16.cpp +++ b/src/core/SkBlitRow_D16.cpp @@ -242,13 +242,13 @@ SkBlitRow::Proc SkBlitRow::Factory(unsigned flags, SkBitmap::Config config) { switch (config) { case SkBitmap::kRGB_565_Config: - proc = gPlatform_565_Procs[flags]; + proc = PlatformProcs565(flags); if (NULL == proc) { proc = gDefault_565_Procs[flags]; } break; case SkBitmap::kARGB_4444_Config: - proc = gPlatform_4444_Procs[flags]; + proc = PlatformProcs4444(flags); if (NULL == proc) { proc = SkBlitRow_Factory_4444(flags); } diff --git a/src/core/SkBlitRow_D32.cpp b/src/core/SkBlitRow_D32.cpp index f67bb9a..0036025 100644 --- a/src/core/SkBlitRow_D32.cpp +++ b/src/core/SkBlitRow_D32.cpp @@ -78,7 +78,7 @@ SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) { // just so we don't crash flags &= kFlags32_Mask; - SkBlitRow::Proc32 proc = gPlatform_Procs32[flags]; + SkBlitRow::Proc32 proc = PlatformProcs32(flags); if (NULL == proc) { proc = gDefault_Procs32[flags]; } diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp new file mode 100644 index 0000000..68c7519 --- /dev/null +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -0,0 +1,336 @@ +/* + ** + ** Copyright 2009, The Android Open Source Project + ** + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** + ** http://www.apache.org/licenses/LICENSE-2.0 + ** + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + */ + +#include "SkBlitRow.h" +#include "SkColorPriv.h" +#include "SkDither.h" + +#include + +#ifdef _MSC_VER +static void getcpuid(int info_type, int info[4]) +{ + __asm { + mov eax, [info_type] + cpuid + mov edi, [info] + mov [edi], eax + mov [edi+4], ebx + mov [edi+8], ecx + mov [edi+12], edx + } +} +#else +static void getcpuid(int info_type, int info[4]) +{ + asm("cpuid": "=a" (info[0]), "=b" (info[1]), "=c" (info[2]), "=d" (info[3]) + : "a"(info_type) + : + ); +} +#endif + +/* SSE2 version of S32_Blend_BlitRow32() + * portable version is in core/SkBlitRow_D32.cpp + */ +static void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + if (count <= 0) { + return; + } + + uint32_t src_scale = SkAlpha255To256(alpha); + uint32_t dst_scale = 256 - src_scale; + + const __m128i *s = reinterpret_cast(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i src_scale_wide = _mm_set1_epi16(src_scale); + __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); + while (count >= 4) { + // Load 4 pixels each of src and dest. + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_loadu_si128(d); + + // Get red and blue pixels into lower byte of each word. + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); + + // Get alpha and green into lower byte of each word. + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); + + // Multiply by scale. + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); + dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); + dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); + + // Divide by 256. + src_rb = _mm_srli_epi16(src_rb, 8); + dst_rb = _mm_srli_epi16(dst_rb, 8); + src_ag = _mm_andnot_si128(rb_mask, src_ag); + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. + src_pixel = _mm_or_si128(src_rb, src_ag); + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_storeu_si128(d, result); + s++; + d++; + count -= 4; + } + + src = reinterpret_cast(s); + dst = reinterpret_cast(d); + while (count > 0) { + *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); + src++; + dst++; + count--; + } +} + +static void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha) { + SkASSERT(alpha == 255); + if (count <= 0) { + return; + } + const __m128i *s = reinterpret_cast(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); +#ifdef SK_USE_ACCURATE_BLENDING + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) + __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) + while (count >= 4) { + // Load 4 pixels + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_loadu_si128(d); + + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); + dst_ag = _mm_srli_epi16(dst_ag, 8); + // Shift alphas down to lower 8 bits of each quad. + __m128i alpha = _mm_srli_epi32(src_pixel, 24); + + // Copy alpha to upper 3rd byte of each quad + alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); + + // Subtract alphas from 255, to get 0..255 + alpha = _mm_sub_epi16(c_255, alpha); + + // Multiply by red and blue by src alpha. + dst_rb = _mm_mullo_epi16(dst_rb, alpha); + // Multiply by alpha and green by src alpha. + dst_ag = _mm_mullo_epi16(dst_ag, alpha); + + // dst_rb_low = (dst_rb >> 8) + __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); + __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); + + // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 + dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); + dst_rb = _mm_add_epi16(dst_rb, c_128); + dst_rb = _mm_srli_epi16(dst_rb, 8); + + // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask + dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); + dst_ag = _mm_add_epi16(dst_ag, c_128); + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_storeu_si128(d, result); + s++; + d++; + count -= 4; + } +#else + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) + while (count >= 4) { + // Load 4 pixels + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_loadu_si128(d); + + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); + dst_ag = _mm_srli_epi16(dst_ag, 8); + // Shift alphas down to lower 8 bits of each quad. + __m128i alpha = _mm_srli_epi32(src_pixel, 24); + + // Copy alpha to upper 3rd byte of each quad + alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); + + // Subtract alphas from 256, to get 1..256 + alpha = _mm_sub_epi16(c_256, alpha); + + // Multiply by red and blue by src alpha. + dst_rb = _mm_mullo_epi16(dst_rb, alpha); + // Multiply by alpha and green by src alpha. + dst_ag = _mm_mullo_epi16(dst_ag, alpha); + + // Divide by 256. + dst_rb = _mm_srli_epi16(dst_rb, 8); + + // Mask out high bits (already in the right place) + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_storeu_si128(d, result); + s++; + d++; + count -= 4; + } +#endif + + src = reinterpret_cast(s); + dst = reinterpret_cast(d); + while (count > 0) { + *dst = SkPMSrcOver(*src, *dst); + src++; + dst++; + count--; + } +} + +static void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha) { + SkASSERT(alpha <= 255); + if (count <= 0) { + return; + } + + uint32_t src_scale = SkAlpha255To256(alpha); + + const __m128i *s = reinterpret_cast(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i src_scale_wide = _mm_set1_epi16(src_scale); + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) + while (count >= 4) { + // Load 4 pixels each of src and dest. + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_loadu_si128(d); + + // Get red and blue pixels into lower byte of each word. + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); + + // Get alpha and green into lower byte of each word. + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); + + // Put per-pixel alpha in low byte of each word. + __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); + dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); + + // dst_alpha = dst_alpha * src_scale + dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); + + // Divide by 256. + dst_alpha = _mm_srli_epi16(dst_alpha, 8); + + // Subtract alphas from 256, to get 1..256 + dst_alpha = _mm_sub_epi16(c_256, dst_alpha); + + // Multiply red and blue by dst pixel alpha. + dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); + // Multiply alpha and green by dst pixel alpha. + dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); + + // Multiply red and blue by global alpha. + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); + // Multiply alpha and green by global alpha. + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); + + // Divide by 256. + dst_rb = _mm_srli_epi16(dst_rb, 8); + src_rb = _mm_srli_epi16(src_rb, 8); + + // Mask out low bits (goodies already in the right place; no need to divide) + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + src_ag = _mm_andnot_si128(rb_mask, src_ag); + + // Combine back into RGBA. + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + src_pixel = _mm_or_si128(src_rb, src_ag); + + // Add two pixels into result. + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_storeu_si128(d, result); + s++; + d++; + count -= 4; + } + src = reinterpret_cast(s); + dst = reinterpret_cast(d); + while (count > 0) { + *dst = SkBlendARGB32(*src, *dst, alpha); + src++; + dst++; + count--; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +static const SkBlitRow::Proc32 platform_32_procs[] = { + NULL, // S32_Opaque, + S32_Blend_BlitRow32_SSE2, // S32_Blend, + S32A_Opaque_BlitRow32_SSE2, // S32A_Opaque + S32A_Blend_BlitRow32_SSE2, // S32A_Blend, +}; + +const SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) { + return NULL; +} + +const SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { + return NULL; +} + +const SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { + static bool once; + static bool hasSSE2; + if (!once) { + int cpu_info[4] = { 0 }; + getcpuid(1, cpu_info); + hasSSE2 = (cpu_info[3] & (1<<26)) != 0; + once = true; + } + if (hasSSE2) { + return platform_32_procs[flags]; + } else { + return NULL; + } +} diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index be1cbdf..44550da 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -976,7 +976,7 @@ static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, /////////////////////////////////////////////////////////////////////////////// -const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { +static const SkBlitRow::Proc platform_565_procs[] = { // no dither S32_D565_Opaque_PROC, S32_D565_Blend_PROC, @@ -990,7 +990,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { NULL, // S32A_D565_Blend_Dither }; -const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = { +static const SkBlitRow::Proc platform_4444_procs[] = { // no dither NULL, // S32_D4444_Opaque, NULL, // S32_D4444_Blend, @@ -1004,10 +1004,21 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = { NULL, // S32A_D4444_Blend_Dither }; -const SkBlitRow::Proc32 SkBlitRow::gPlatform_Procs32[] = { +static const SkBlitRow::Proc32 platform_32_procs[] = { NULL, // S32_Opaque, S32_Blend_BlitRow32_PROC, // S32_Blend, S32A_Opaque_BlitRow32_PROC, // S32A_Opaque, NULL, // S32A_Blend, }; +const SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) { + return platform_4444_procs[flags]; +} + +const SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { + return platform_565_procs[flags]; +} + +const SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { + return platform_32_procs[flags]; +} diff --git a/src/opts/SkBlitRow_opts_none.cpp b/src/opts/SkBlitRow_opts_none.cpp index 7a77759..15b999b 100644 --- a/src/opts/SkBlitRow_opts_none.cpp +++ b/src/opts/SkBlitRow_opts_none.cpp @@ -2,38 +2,14 @@ // Platform impl of Platform_procs with no overrides -const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { - // no dither - NULL, // S32_D565_Opaque, - NULL, // S32_D565_Blend, - NULL, // S32A_D565_Opaque, - NULL, // S32A_D565_Blend, - - // dither - NULL, // S32_D565_Opaque_Dither, - NULL, // S32_D565_Blend_Dither, - NULL, // S32A_D565_Opaque_Dither, - NULL, // S32A_D565_Blend_Dither -}; +const SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) { + return NULL; +} -const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = { - // no dither - NULL, // S32_D4444_Opaque, - NULL, // S32_D4444_Blend, - NULL, // S32A_D4444_Opaque, - NULL, // S32A_D4444_Blend, - - // dither - NULL, // S32_D4444_Opaque_Dither, - NULL, // S32_D4444_Blend_Dither, - NULL, // S32A_D4444_Opaque_Dither, - NULL, // S32A_D4444_Blend_Dither -}; - -const SkBlitRow::Proc32 SkBlitRow::gPlatform_Procs32[] = { - NULL, // S32_Opaque, - NULL, // S32_Blend, - NULL, // S32A_Opaque, - NULL, // S32A_Blend, -}; +const SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { + return NULL; +} +const SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { + return NULL; +} diff --git a/src/ports/SkTime_win.cpp b/src/ports/SkTime_win.cpp new file mode 100644 index 0000000..5194b02 --- /dev/null +++ b/src/ports/SkTime_win.cpp @@ -0,0 +1,46 @@ +/* libs/graphics/ports/SkTime_Unix.cpp +** +** Copyright 2009, The Android Open Source Project +** +** Licensed under the Apache License, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** http://www.apache.org/licenses/LICENSE-2.0 +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +*/ + +#include "SkTime.h" + +void SkTime::GetDateTime(DateTime* dt) +{ + if (dt) + { + SYSTEMTIME st; + GetSystemTime(&st); + + dt->fYear = st.wYear; + dt->fMonth = SkToU8(st.wMonth + 1); + dt->fDayOfWeek = SkToU8(st.wDayOfWeek); + dt->fDay = SkToU8(st.wDay); + dt->fHour = SkToU8(st.wHour); + dt->fMinute = SkToU8(st.wMinute); + dt->fSecond = SkToU8(st.wSecond); + } +} + +SkMSec SkTime::GetMSecs() +{ + FILETIME ft; + LARGE_INTEGER li; + GetSystemTimeAsFileTime(&ft); + li.LowPart = ft.dwLowDateTime; + li.HighPart = ft.dwHighDateTime; + __int64 t = li.QuadPart; /* In 100-nanosecond intervals */ + return t / 10000; /* In milliseconds */ +} -- 2.7.4