From dc7de745dd142cdc00ffed7963ebb030a0506f72 Mon Sep 17 00:00:00 2001 From: "senorblanco@chromium.org" Date: Mon, 30 Nov 2009 20:00:29 +0000 Subject: [PATCH] More SSE2 optimizations. This CL implements an SSE2 version of S32_bitmap_D32_filter_DX, and uses aligned loads and stores for dst, in all blending. Review URL: http://codereview.appspot.com/157141 git-svn-id: http://skia.googlecode.com/svn/trunk@448 2bbb7eff-a529-9590-31e7-b0007b416f81 --- src/core/SkBitmapProcState.h | 5 + src/core/SkBitmapProcState_matrix.h | 12 +- src/core/SkBitmapProcState_sample.h | 24 +- src/opts/SkBitmapProcState_opts_SSE2.cpp | 126 +++++++++ src/opts/SkBitmapProcState_opts_SSE2.h | 22 ++ src/opts/SkBlitRow_opts_SSE2.cpp | 429 +++++++++++++++++-------------- src/opts/opts_check_SSE2.cpp | 9 + 7 files changed, 410 insertions(+), 217 deletions(-) create mode 100644 src/opts/SkBitmapProcState_opts_SSE2.cpp create mode 100644 src/opts/SkBitmapProcState_opts_SSE2.h diff --git a/src/core/SkBitmapProcState.h b/src/core/SkBitmapProcState.h index 9db62f0..9a5674b 100644 --- a/src/core/SkBitmapProcState.h +++ b/src/core/SkBitmapProcState.h @@ -136,4 +136,9 @@ private: #define pack_two_shorts(pri, sec) PACK_TWO_SHORTS(pri, sec) #endif +// These functions are generated via macros, but are exposed here so that +// platformProcs may test for them by name. 
+void S32_opaque_D32_filter_DX(const SkBitmapProcState& s, const uint32_t xy[], + int count, SkPMColor colors[]); + #endif diff --git a/src/core/SkBitmapProcState_matrix.h b/src/core/SkBitmapProcState_matrix.h index 049d6d4..9ae8b17 100644 --- a/src/core/SkBitmapProcState_matrix.h +++ b/src/core/SkBitmapProcState_matrix.h @@ -17,7 +17,7 @@ #define PREAMBLE_ARG_Y #endif -static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, +void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) == 0); @@ -82,7 +82,7 @@ static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, // this would require a more general setup thatn SCALE does, but could use // SCALE's inner loop that only looks at dx -static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, +void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | @@ -108,7 +108,7 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, } } -static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, +void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, uint32_t* SK_RESTRICT xy, int count, int x, int y) { SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); @@ -147,7 +147,7 @@ static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max, return (i << 14) | (TILEX_PROCF((f + one), max)); } -static void SCALE_FILTER_NAME(const SkBitmapProcState& s, +void SCALE_FILTER_NAME(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)) == 0); @@ -188,7 +188,7 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, } } -static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, +void AFFINE_FILTER_NAME(const SkBitmapProcState& s, uint32_t xy[], int count, int x, int y) { 
SkASSERT(s.fInvType & SkMatrix::kAffine_Mask); SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask | @@ -218,7 +218,7 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, } while (--count != 0); } -static void PERSP_FILTER_NAME(const SkBitmapProcState& s, +void PERSP_FILTER_NAME(const SkBitmapProcState& s, uint32_t* SK_RESTRICT xy, int count, int x, int y) { SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask); diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h index 4e1f139..978d144 100644 --- a/src/core/SkBitmapProcState_sample.h +++ b/src/core/SkBitmapProcState_sample.h @@ -16,9 +16,9 @@ #error "unsupported DSTSIZE" #endif -static void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, DSTTYPE* SK_RESTRICT colors) { +void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s, + const uint32_t* SK_RESTRICT xy, + int count, DSTTYPE* SK_RESTRICT colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fDoFilter == false); SkDEBUGCODE(CHECKSTATE(s);) @@ -58,9 +58,9 @@ static void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s, #endif } -static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, DSTTYPE* SK_RESTRICT colors) { +void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, + const uint32_t* SK_RESTRICT xy, + int count, DSTTYPE* SK_RESTRICT colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fInvType <= (SkMatrix::kTranslate_Mask | SkMatrix::kScale_Mask)); SkASSERT(s.fDoFilter == false); @@ -113,9 +113,9 @@ static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, /////////////////////////////////////////////////////////////////////////////// -static void MAKENAME(_filter_DX)(const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, DSTTYPE* SK_RESTRICT colors) { +void MAKENAME(_filter_DX)(const SkBitmapProcState& s, + const uint32_t* SK_RESTRICT xy, + int count, DSTTYPE* SK_RESTRICT 
colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fDoFilter); SkDEBUGCODE(CHECKSTATE(s);) @@ -159,9 +159,9 @@ static void MAKENAME(_filter_DX)(const SkBitmapProcState& s, POSTAMBLE(s); #endif } -static void MAKENAME(_filter_DXDY)(const SkBitmapProcState& s, - const uint32_t* SK_RESTRICT xy, - int count, DSTTYPE* SK_RESTRICT colors) { +void MAKENAME(_filter_DXDY)(const SkBitmapProcState& s, + const uint32_t* SK_RESTRICT xy, + int count, DSTTYPE* SK_RESTRICT colors) { SkASSERT(count > 0 && colors != NULL); SkASSERT(s.fDoFilter); SkDEBUGCODE(CHECKSTATE(s);) diff --git a/src/opts/SkBitmapProcState_opts_SSE2.cpp b/src/opts/SkBitmapProcState_opts_SSE2.cpp new file mode 100644 index 0000000..dd92c2b --- /dev/null +++ b/src/opts/SkBitmapProcState_opts_SSE2.cpp @@ -0,0 +1,126 @@ +/* + ** + ** Copyright 2009, The Android Open Source Project + ** + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** + ** http://www.apache.org/licenses/LICENSE-2.0 + ** + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ */
+
+#include <emmintrin.h>
+#include "SkBitmapProcState_opts_SSE2.h"
+#include "SkUtils.h"
+
+void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s,
+                                   const uint32_t* xy,
+                                   int count, uint32_t* colors) {
+    SkASSERT(count > 0 && colors != NULL);
+    SkASSERT(s.fDoFilter);
+    SkASSERT(s.fBitmap->config() == SkBitmap::kARGB_8888_Config);
+    SkASSERT(s.fAlphaScale == 256);
+
+    const char* srcAddr = static_cast<const char*>(s.fBitmap->getPixels());
+    unsigned rb = s.fBitmap->rowBytes();
+    uint32_t XY = *xy++;  // y0:14 | 4 | y1:14 (one Y entry, shared by every X entry below)
+    unsigned y0 = XY >> 14;
+    const uint32_t* row0 = reinterpret_cast<const uint32_t*>(srcAddr + (y0 >> 4) * rb);
+    const uint32_t* row1 = reinterpret_cast<const uint32_t*>(srcAddr + (XY & 0x3FFF) * rb);
+    unsigned subY = y0 & 0xF;
+
+    // ( 0, 0, 0, 0, 0, 0, 0, 16)
+    __m128i sixteen = _mm_cvtsi32_si128(16);
+
+    // ( 0, 0, 0, 0, 16, 16, 16, 16)
+    sixteen = _mm_shufflelo_epi16(sixteen, 0);
+
+    // ( 0, 0, 0, 0, 0, 0, 0, y)
+    __m128i allY = _mm_cvtsi32_si128(subY);
+
+    // ( 0, 0, 0, 0, y, y, y, y)
+    allY = _mm_shufflelo_epi16(allY, 0);
+
+    // ( 0, 0, 0, 0, 16-y, 16-y, 16-y, 16-y)
+    __m128i negY = _mm_sub_epi16(sixteen, allY);
+
+    // (16-y, 16-y, 16-y, 16-y, y, y, y, y)
+    allY = _mm_unpacklo_epi64(allY, negY);
+
+    // (16, 16, 16, 16, 16, 16, 16, 16 )
+    sixteen = _mm_shuffle_epi32(sixteen, 0);
+
+    // ( 0, 0, 0, 0, 0, 0, 0, 0)
+    __m128i zero = _mm_setzero_si128();
+    do {
+        uint32_t XX = *xy++; // x0:14 | 4 | x1:14
+        unsigned x0 = XX >> 18;
+        unsigned x1 = XX & 0x3FFF;
+
+        // (0, 0, 0, 0, 0, 0, 0, x)
+        __m128i allX = _mm_cvtsi32_si128((XX >> 14) & 0x0F);
+
+        // (0, 0, 0, 0, x, x, x, x)
+        allX = _mm_shufflelo_epi16(allX, 0);
+
+        // (x, x, x, x, x, x, x, x)
+        allX = _mm_shuffle_epi32(allX, 0);
+
+        // (16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x, 16-x)
+        __m128i negX = _mm_sub_epi16(sixteen, allX);
+
+        // Load 4 samples (pixels).
+        __m128i a00 = _mm_cvtsi32_si128(row0[x0]);
+        __m128i a01 = _mm_cvtsi32_si128(row0[x1]);
+        __m128i a10 = _mm_cvtsi32_si128(row1[x0]);
+        __m128i a11 = _mm_cvtsi32_si128(row1[x1]);
+
+        // (0, 0, a00, a10)
+        __m128i a00a10 = _mm_unpacklo_epi32(a10, a00);
+
+        // Expand to 16 bits per component.
+        a00a10 = _mm_unpacklo_epi8(a00a10, zero);
+
+        // ((a00 * (16-y)), (a10 * y)).
+        a00a10 = _mm_mullo_epi16(a00a10, allY);
+
+        // (a00 * (16-y) * (16-x), a10 * y * (16-x)).
+        a00a10 = _mm_mullo_epi16(a00a10, negX);
+
+        // (0, 0, a01, a11)
+        __m128i a01a11 = _mm_unpacklo_epi32(a11, a01);
+
+        // Expand to 16 bits per component.
+        a01a11 = _mm_unpacklo_epi8(a01a11, zero);
+
+        // (a01 * (16-y)), (a11 * y)
+        a01a11 = _mm_mullo_epi16(a01a11, allY);
+
+        // (a01 * (16-y) * x), (a11 * y * x)
+        a01a11 = _mm_mullo_epi16(a01a11, allX);
+
+        // (a00*w00 + a01*w01, a10*w10 + a11*w11)
+        __m128i sum = _mm_add_epi16(a00a10, a01a11);
+
+        // (DC, a00*w00 + a01*w01)
+        __m128i shifted = _mm_shuffle_epi32(sum, 0xEE);
+
+        // (DC, a00*w00 + a01*w01 + a10*w10 + a11*w11)
+        sum = _mm_add_epi16(sum, shifted);
+
+        // Divide each 16 bit component by 256 (the four weights sum to 16*16).
+        sum = _mm_srli_epi16(sum, 8);
+
+        // Pack lower 4 16 bit values of sum into lower 4 bytes.
+        sum = _mm_packus_epi16(sum, zero);
+
+        // Extract low int and store.
+        *colors++ = _mm_cvtsi128_si32(sum);
+    } while (--count > 0);
+}
diff --git a/src/opts/SkBitmapProcState_opts_SSE2.h b/src/opts/SkBitmapProcState_opts_SSE2.h
new file mode 100644
index 0000000..57342ff
--- /dev/null
+++ b/src/opts/SkBitmapProcState_opts_SSE2.h
@@ -0,0 +1,22 @@
+/*
+ **
+ ** Copyright 2009, The Android Open Source Project
+ **
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at + ** + ** http://www.apache.org/licenses/LICENSE-2.0 + ** + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + */ + +#include "SkBitmapProcState.h" + +void S32_opaque_D32_filter_DX_SSE2(const SkBitmapProcState& s, + const uint32_t* xy, + int count, uint32_t* colors); diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index 8983093..bf2db26 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -34,50 +34,60 @@ void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, uint32_t src_scale = SkAlpha255To256(alpha); uint32_t dst_scale = 256 - src_scale; - const __m128i *s = reinterpret_cast(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i src_scale_wide = _mm_set1_epi16(src_scale); - __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); - while (count >= 4) { - // Load 4 pixels each of src and dest. - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Multiply by scale. - src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); - src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); - dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); - dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); - - // Divide by 256. 
- src_rb = _mm_srli_epi16(src_rb, 8); - dst_rb = _mm_srli_epi16(dst_rb, 8); - src_ag = _mm_andnot_si128(rb_mask, src_ag); - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - - // Combine back into RGBA. - src_pixel = _mm_or_si128(src_rb, src_ag); - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - - // Add result - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; + if (count >= 4) { + SkASSERT(((size_t)dst & 0x03) == 0); + while (((size_t)dst & 0x0F) != 0) { + *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); + src++; + dst++; + count--; + } + + const __m128i *s = reinterpret_cast(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i src_scale_wide = _mm_set1_epi16(src_scale); + __m128i dst_scale_wide = _mm_set1_epi16(dst_scale); + while (count >= 4) { + // Load 4 pixels each of src and dest. + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + // Get red and blue pixels into lower byte of each word. + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); + + // Get alpha and green into lower byte of each word. + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); + + // Multiply by scale. + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); + dst_rb = _mm_mullo_epi16(dst_rb, dst_scale_wide); + dst_ag = _mm_mullo_epi16(dst_ag, dst_scale_wide); + + // Divide by 256. + src_rb = _mm_srli_epi16(src_rb, 8); + dst_rb = _mm_srli_epi16(dst_rb, 8); + src_ag = _mm_andnot_si128(rb_mask, src_ag); + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. 
+ src_pixel = _mm_or_si128(src_rb, src_ag); + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } + src = reinterpret_cast(s); + dst = reinterpret_cast(d); } - src = reinterpret_cast(s); - dst = reinterpret_cast(d); while (count > 0) { *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale); src++; @@ -93,103 +103,114 @@ void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, if (count <= 0) { return; } - const __m128i *s = reinterpret_cast(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); + + if (count >= 4) { + SkASSERT(((size_t)dst & 0x03) == 0); + while (((size_t)dst & 0x0F) != 0) { + *dst = SkPMSrcOver(*src, *dst); + src++; + dst++; + count--; + } + + const __m128i *s = reinterpret_cast(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); #ifdef SK_USE_ACCURATE_BLENDING - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) - __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) - while (count >= 4) { - // Load 4 pixels - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); - dst_ag = _mm_srli_epi16(dst_ag, 8); - // Shift alphas down to lower 8 bits of each quad. - __m128i alpha = _mm_srli_epi32(src_pixel, 24); - - // Copy alpha to upper 3rd byte of each quad - alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); - - // Subtract alphas from 255, to get 0..255 - alpha = _mm_sub_epi16(c_255, alpha); - - // Multiply by red and blue by src alpha. - dst_rb = _mm_mullo_epi16(dst_rb, alpha); - // Multiply by alpha and green by src alpha. 
- dst_ag = _mm_mullo_epi16(dst_ag, alpha); - - // dst_rb_low = (dst_rb >> 8) - __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); - __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); - - // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 - dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); - dst_rb = _mm_add_epi16(dst_rb, c_128); - dst_rb = _mm_srli_epi16(dst_rb, 8); - - // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask - dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); - dst_ag = _mm_add_epi16(dst_ag, c_128); - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - - // Add result - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; - } -#else - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) - while (count >= 4) { - // Load 4 pixels - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); - dst_ag = _mm_srli_epi16(dst_ag, 8); - // Shift alphas down to lower 8 bits of each quad. - __m128i alpha = _mm_srli_epi32(src_pixel, 24); - - // Copy alpha to upper 3rd byte of each quad - alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); - - // Subtract alphas from 256, to get 1..256 - alpha = _mm_sub_epi16(c_256, alpha); - - // Multiply by red and blue by src alpha. - dst_rb = _mm_mullo_epi16(dst_rb, alpha); - // Multiply by alpha and green by src alpha. - dst_ag = _mm_mullo_epi16(dst_ag, alpha); - - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - - // Mask out high bits (already in the right place) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - - // Combine back into RGBA. 
- dst_pixel = _mm_or_si128(dst_rb, dst_ag); - - // Add result - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; - } + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_128 = _mm_set1_epi16(128); // 8 copies of 128 (16-bit) + __m128i c_255 = _mm_set1_epi16(255); // 8 copies of 255 (16-bit) + while (count >= 4) { + // Load 4 pixels + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); + dst_ag = _mm_srli_epi16(dst_ag, 8); + // Shift alphas down to lower 8 bits of each quad. + __m128i alpha = _mm_srli_epi32(src_pixel, 24); + + // Copy alpha to upper 3rd byte of each quad + alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); + + // Subtract alphas from 255, to get 0..255 + alpha = _mm_sub_epi16(c_255, alpha); + + // Multiply by red and blue by src alpha. + dst_rb = _mm_mullo_epi16(dst_rb, alpha); + // Multiply by alpha and green by src alpha. + dst_ag = _mm_mullo_epi16(dst_ag, alpha); + + // dst_rb_low = (dst_rb >> 8) + __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8); + __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8); + + // dst_rb = (dst_rb + dst_rb_low + 128) >> 8 + dst_rb = _mm_add_epi16(dst_rb, dst_rb_low); + dst_rb = _mm_add_epi16(dst_rb, c_128); + dst_rb = _mm_srli_epi16(dst_rb, 8); + + // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask + dst_ag = _mm_add_epi16(dst_ag, dst_ag_low); + dst_ag = _mm_add_epi16(dst_ag, c_128); + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. 
+ dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } + #else + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_256 = _mm_set1_epi16(0x0100); // 8 copies of 256 (16-bit) + while (count >= 4) { + // Load 4 pixels + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i dst_ag = _mm_andnot_si128(rb_mask, dst_pixel); + dst_ag = _mm_srli_epi16(dst_ag, 8); + // Shift alphas down to lower 8 bits of each quad. + __m128i alpha = _mm_srli_epi32(src_pixel, 24); + + // Copy alpha to upper 3rd byte of each quad + alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16)); + + // Subtract alphas from 256, to get 1..256 + alpha = _mm_sub_epi16(c_256, alpha); + + // Multiply by red and blue by src alpha. + dst_rb = _mm_mullo_epi16(dst_rb, alpha); + // Multiply by alpha and green by src alpha. + dst_ag = _mm_mullo_epi16(dst_ag, alpha); + + // Divide by 256. + dst_rb = _mm_srli_epi16(dst_rb, 8); + + // Mask out high bits (already in the right place) + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + + // Combine back into RGBA. 
+ dst_pixel = _mm_or_si128(dst_rb, dst_ag); + + // Add result + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } #endif + src = reinterpret_cast(s); + dst = reinterpret_cast(d); + } - src = reinterpret_cast(s); - dst = reinterpret_cast(d); while (count > 0) { *dst = SkPMSrcOver(*src, *dst); src++; @@ -206,70 +227,80 @@ void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst, return; } - uint32_t src_scale = SkAlpha255To256(alpha); - - const __m128i *s = reinterpret_cast(src); - __m128i *d = reinterpret_cast<__m128i*>(dst); - __m128i src_scale_wide = _mm_set1_epi16(src_scale); - __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); - __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) - while (count >= 4) { - // Load 4 pixels each of src and dest. - __m128i src_pixel = _mm_loadu_si128(s); - __m128i dst_pixel = _mm_loadu_si128(d); - - // Get red and blue pixels into lower byte of each word. - __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); - __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); - - // Get alpha and green into lower byte of each word. - __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); - __m128i src_ag = _mm_srli_epi16(src_pixel, 8); - - // Put per-pixel alpha in low byte of each word. - __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); - dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); - - // dst_alpha = dst_alpha * src_scale - dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); - - // Divide by 256. - dst_alpha = _mm_srli_epi16(dst_alpha, 8); - - // Subtract alphas from 256, to get 1..256 - dst_alpha = _mm_sub_epi16(c_256, dst_alpha); - - // Multiply red and blue by dst pixel alpha. - dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); - // Multiply alpha and green by dst pixel alpha. - dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); - - // Multiply red and blue by global alpha. - src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); - // Multiply alpha and green by global alpha. 
- src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); - - // Divide by 256. - dst_rb = _mm_srli_epi16(dst_rb, 8); - src_rb = _mm_srli_epi16(src_rb, 8); - - // Mask out low bits (goodies already in the right place; no need to divide) - dst_ag = _mm_andnot_si128(rb_mask, dst_ag); - src_ag = _mm_andnot_si128(rb_mask, src_ag); - - // Combine back into RGBA. - dst_pixel = _mm_or_si128(dst_rb, dst_ag); - src_pixel = _mm_or_si128(src_rb, src_ag); - - // Add two pixels into result. - __m128i result = _mm_add_epi8(src_pixel, dst_pixel); - _mm_storeu_si128(d, result); - s++; - d++; - count -= 4; + if (count >= 4) { + while (((size_t)dst & 0x0F) != 0) { + *dst = SkBlendARGB32(*src, *dst, alpha); + src++; + dst++; + count--; + } + + uint32_t src_scale = SkAlpha255To256(alpha); + + const __m128i *s = reinterpret_cast(src); + __m128i *d = reinterpret_cast<__m128i*>(dst); + __m128i src_scale_wide = _mm_set1_epi16(src_scale); + __m128i rb_mask = _mm_set1_epi32(0x00FF00FF); + __m128i c_256 = _mm_set1_epi16(256); // 8 copies of 256 (16-bit) + while (count >= 4) { + // Load 4 pixels each of src and dest. + __m128i src_pixel = _mm_loadu_si128(s); + __m128i dst_pixel = _mm_load_si128(d); + + // Get red and blue pixels into lower byte of each word. + __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel); + __m128i src_rb = _mm_and_si128(rb_mask, src_pixel); + + // Get alpha and green into lower byte of each word. + __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8); + __m128i src_ag = _mm_srli_epi16(src_pixel, 8); + + // Put per-pixel alpha in low byte of each word. + __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5); + dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5); + + // dst_alpha = dst_alpha * src_scale + dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide); + + // Divide by 256. + dst_alpha = _mm_srli_epi16(dst_alpha, 8); + + // Subtract alphas from 256, to get 1..256 + dst_alpha = _mm_sub_epi16(c_256, dst_alpha); + + // Multiply red and blue by dst pixel alpha. 
+ dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha); + // Multiply alpha and green by dst pixel alpha. + dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha); + + // Multiply red and blue by global alpha. + src_rb = _mm_mullo_epi16(src_rb, src_scale_wide); + // Multiply alpha and green by global alpha. + src_ag = _mm_mullo_epi16(src_ag, src_scale_wide); + + // Divide by 256. + dst_rb = _mm_srli_epi16(dst_rb, 8); + src_rb = _mm_srli_epi16(src_rb, 8); + + // Mask out low bits (goodies already in the right place; no need to divide) + dst_ag = _mm_andnot_si128(rb_mask, dst_ag); + src_ag = _mm_andnot_si128(rb_mask, src_ag); + + // Combine back into RGBA. + dst_pixel = _mm_or_si128(dst_rb, dst_ag); + src_pixel = _mm_or_si128(src_rb, src_ag); + + // Add two pixels into result. + __m128i result = _mm_add_epi8(src_pixel, dst_pixel); + _mm_store_si128(d, result); + s++; + d++; + count -= 4; + } + src = reinterpret_cast(s); + dst = reinterpret_cast(d); } - src = reinterpret_cast(s); - dst = reinterpret_cast(d); + while (count > 0) { *dst = SkBlendARGB32(*src, *dst, alpha); src++; diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp index bd4807a..3b84596 100644 --- a/src/opts/opts_check_SSE2.cpp +++ b/src/opts/opts_check_SSE2.cpp @@ -15,6 +15,7 @@ ** limitations under the License. */ +#include "SkBitmapProcState_opts_SSE2.h" #include "SkBlitRow_opts_SSE2.h" #include "SkUtils_opts_SSE2.h" #include "SkUtils.h" @@ -64,6 +65,14 @@ static inline bool hasSSE2() { } #endif +void SkBitmapProcState::platformProcs() { + if (hasSSE2()) { + if (fSampleProc32 == S32_opaque_D32_filter_DX) { + fSampleProc32 = S32_opaque_D32_filter_DX_SSE2; + } + } +} + static SkBlitRow::Proc32 platform_32_procs[] = { NULL, // S32_Opaque, S32_Blend_BlitRow32_SSE2, // S32_Blend, -- 2.7.4