From: reed@google.com Date: Fri, 27 Jul 2012 13:29:59 +0000 (+0000) Subject: revert 4799-4801 -- red and blue are reversed on windows and linux X-Git-Tag: accepted/tizen/5.0/unified/20181102.025319~15414 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7329dc9976e5a61e2382974884d2e075f4f856f1;p=platform%2Fupstream%2FlibSkiaSharp.git revert 4799-4801 -- red and blue are reversed on windows and linux git-svn-id: http://skia.googlecode.com/svn/trunk@4803 2bbb7eff-a529-9590-31e7-b0007b416f81 --- diff --git a/src/opts/SkBlitRow_opts_SSE2.cpp b/src/opts/SkBlitRow_opts_SSE2.cpp index f61617a..74e44af 100644 --- a/src/opts/SkBlitRow_opts_SSE2.cpp +++ b/src/opts/SkBlitRow_opts_SSE2.cpp @@ -9,7 +9,6 @@ #include "SkBlitRow_opts_SSE2.h" #include "SkBitmapProcState_opts_SSE2.h" #include "SkColorPriv.h" -#include "SkDither.h" #include "SkUtils.h" #include @@ -762,793 +761,3 @@ void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], width--; } } - -/* SSE2 version of S32A_D565_Blend() - * portable version is in core/SkBlitRow_D16.cpp - */ - -void S32A_D565_Blend_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - SkASSERT(255 > alpha); - - while (((size_t)src & 0xF ) && count-- > 0 ) - { - SkPMColor sc = *src++; - if (sc) { - uint16_t dc = *dst; - unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); - unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); - unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); - unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); - *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); - } - dst++; - } - - __m128i v_alpha = _mm_set1_epi16(alpha); - __m128i v_128 = _mm_set1_epi16(128); - - while (count >= 8) { - - __m128i src_pixel_l = _mm_load_si128((const __m128i*)(src)); - __m128i src_pixel_h = _mm_load_si128((const __m128i*)(src+4)); - - // compute dst_scale - __m128i dst_scale = _mm_srli_epi32(src_pixel_l, 24); - dst_scale = _mm_packs_epi32(dst_scale, _mm_srli_epi32(src_pixel_h, 24)); - - dst_scale = _mm_mullo_epi16(dst_scale, v_alpha); - dst_scale = _mm_add_epi16(dst_scale, v_128); - dst_scale = _mm_add_epi16(dst_scale, _mm_srli_epi16(dst_scale, 8)); - dst_scale = _mm_srli_epi16(dst_scale, 8); - dst_scale = _mm_sub_epi16(_mm_set1_epi16(255), dst_scale); - - // unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); - // unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); - // unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); - - __m128i dst_pixel = _mm_loadu_si128((__m128i *)dst); - - // SkMulS16(SkPacked32ToR16(sc), alpha) - __m128i src_r_l = _mm_srli_epi32(src_pixel_l, 3); - __m128i src_r_h = _mm_srli_epi32(src_pixel_h, 3); - src_r_l = _mm_and_si128(src_r_l, _mm_set1_epi32(0x1F)); - src_r_h = _mm_and_si128(src_r_h, _mm_set1_epi32(0x1F)); - src_r_l = _mm_packs_epi32(src_r_l, src_r_h); - src_r_l = _mm_mullo_epi16(src_r_l, v_alpha); - - // SkMulS16(SkGetPackedR16(dc), dst_scale) - __m128i dst_r = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); - dst_r = _mm_mullo_epi16(dst_r, dst_scale); - dst_r = _mm_add_epi16(dst_r, src_r_l); - dst_r = _mm_add_epi16(dst_r, v_128); - // add and shift - dst_r = _mm_add_epi16(dst_r, _mm_srli_epi16(dst_r, 8)); - dst_r = _mm_srli_epi16(dst_r, 8); - - // SkMulS16(SkPacked32ToG16(sc), alpha) - - __m128i src_g_l = _mm_srli_epi32(src_pixel_l, 10); - __m128i src_g_h = _mm_srli_epi32(src_pixel_h, 10); - src_g_l = _mm_and_si128(src_g_l, _mm_set1_epi32(0x3F)); - src_g_h = _mm_and_si128(src_g_h, _mm_set1_epi32(0x3F)); - src_g_l = _mm_packs_epi32(src_g_l, src_g_h); - src_g_l = _mm_mullo_epi16(src_g_l, v_alpha); - - // SkMulS16(SkGetPackedG16(dc), dst_scale) - __m128i dst_g = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); - dst_g = _mm_and_si128(dst_g, _mm_set1_epi16(SK_G16_MASK)); - dst_g = _mm_mullo_epi16(dst_g, dst_scale); - dst_g = _mm_add_epi16(dst_g, src_g_l); - dst_g = _mm_add_epi16(dst_g, v_128); - // add and shift - dst_g = _mm_add_epi16(dst_g, _mm_srli_epi16(dst_g, 8)); - dst_g = _mm_srli_epi16(dst_g, 8); - - // SkMulS16(SkPacked32ToB16(sc), alpha) - __m128i src_b_l = _mm_srli_epi32(src_pixel_l, 19); - __m128i src_b_h = _mm_srli_epi32(src_pixel_h, 19); - src_b_l = _mm_and_si128(src_b_l, _mm_set1_epi32(0x1F)); - src_b_h = _mm_and_si128(src_b_h, _mm_set1_epi32(0x1F)); - src_b_l = _mm_packs_epi32(src_b_l, src_b_h); - src_b_l = _mm_mullo_epi16(src_b_l, v_alpha); - - // SkMulS16(SkGetPackedB16(dc), dst_scale); - - __m128i dst_b = _mm_and_si128(dst_pixel, _mm_set1_epi16(SK_B16_MASK)); - dst_b = _mm_mullo_epi16(dst_b, dst_scale); - dst_b = _mm_add_epi16(dst_b, src_b_l); - dst_b = _mm_add_epi16(dst_b, v_128); - // add and shift - dst_b = _mm_add_epi16(dst_b, _mm_srli_epi16(dst_b, 8)); - dst_b = _mm_srli_epi16(dst_b, 8); - - // *dst = SkPackRGB16(dr, dg, db); - dst_r = _mm_slli_epi16(dst_r, SK_R16_SHIFT); - dst_r = _mm_or_si128(dst_r, dst_b); - dst_g = _mm_slli_epi16(dst_g, SK_G16_SHIFT); - dst_r = _mm_or_si128(dst_r, dst_g); - - _mm_storeu_si128((__m128i *)dst, dst_r); - - dst += 8; - src += 8; - count -= 8; -} - - while (count-- > 0 ) - { - SkPMColor sc = *src++; - if (sc) { - uint16_t dc = *dst; - unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); - unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); - unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); - unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); - *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); - } - dst++; - } - -} - -/* SSE2 version of S32A_D565_Opaque() - * portable version is in core/SkBlitRow_D16.cpp - */ - -void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/) { - while(((intptr_t)src & 0xf) && (count-- > 0)){ - *dst = SkSrcOver32To16(*src, *dst); - src++; - dst++; - } - - while (count >= 8) { - __m128i src_pixel1 = _mm_load_si128((const __m128i*)(src)); - __m128i src_pixel2 = _mm_load_si128((const __m128i*)(src + 4)); - - /* unpack src_r */ - __m128i src_r1 = _mm_slli_epi32(src_pixel1, 24); - src_r1 = _mm_srli_epi32(src_r1, 24); - __m128i src_r2 = _mm_slli_epi32(src_pixel2, 24); - src_r2 = _mm_srli_epi32(src_r2, 24); - __m128i src_r = _mm_packs_epi32 (src_r1, src_r2); - - /* unpack src_g */ - __m128i src_g1 = _mm_slli_epi32(src_pixel1, 16); - src_g1 = _mm_srli_epi32(src_g1, 24); - __m128i src_g2 = _mm_slli_epi32(src_pixel2, 16); - src_g2 = _mm_srli_epi32(src_g2, 24); - __m128i src_g = _mm_packs_epi32 (src_g1, src_g2); - - /* unpack src_b */ - __m128i src_b1 = _mm_slli_epi32(src_pixel1, 8); - src_b1 = _mm_srli_epi32(src_b1, 24); - __m128i src_b2 = _mm_slli_epi32(src_pixel2, 8); - src_b2 = _mm_srli_epi32(src_b2, 24); - __m128i src_b = _mm_packs_epi32 (src_b1, src_b2); - - /* compute isav */ - __m128i isav1 = _mm_srli_epi32(src_pixel1, 24); - isav1 = _mm_sub_epi32(_mm_set1_epi32(255), isav1); - __m128i isav2 = _mm_srli_epi32(src_pixel2, 24); - isav2 = _mm_sub_epi32(_mm_set1_epi32(255), isav2); - __m128i isav = _mm_packs_epi32 (isav1, isav2); - - __m128i dst_pixel = _mm_loadu_si128((__m128i *)dst); - - // get dst red - __m128i dst_r = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); - dst_r = _mm_and_si128(dst_r, _mm_set1_epi16(SK_R16_MASK)); - - // get dst green - __m128i dst_g = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); - dst_g = _mm_and_si128(dst_g, _mm_set1_epi16(SK_G16_MASK)); - - // get dst blue - __m128i dst_b = _mm_srli_epi16(dst_pixel, SK_B16_SHIFT); - dst_b = _mm_and_si128(dst_b, _mm_set1_epi16(SK_B16_MASK)); - - // interleave blending of r,g,b - // dr = (sr + SkMul16ShiftRound(dr,isa,SK_R16_BITS)) >> (8 - SK_R16_BITS); - // dg = (sg + SkMul16ShiftRound(dg,isa,SK_G16_BITS)) >> (8 - SK_G16_BITS); - // db = (sb + SkMul16ShiftRound(db,isa,SK_B16_BITS)) >> (8 - SK_B16_BITS); - - dst_r = _mm_mullo_epi16(dst_r, isav); - dst_g = _mm_mullo_epi16(dst_g, isav); - dst_b = _mm_mullo_epi16(dst_b, isav); - - dst_r = _mm_add_epi16(dst_r, _mm_set1_epi16(1 << (SK_R16_BITS - 1))); - dst_g = _mm_add_epi16(dst_g, _mm_set1_epi16(1 << (SK_G16_BITS - 1))); - dst_b = _mm_add_epi16(dst_b, _mm_set1_epi16(1 << (SK_B16_BITS - 1))); - dst_r = _mm_add_epi16(dst_r, _mm_srli_epi16(dst_r, SK_R16_BITS)); - dst_g = _mm_add_epi16(dst_g, _mm_srli_epi16(dst_g, SK_G16_BITS)); - dst_r = _mm_srli_epi16(dst_r, SK_R16_BITS); - - dst_b = _mm_add_epi16(dst_b, _mm_srli_epi16(dst_b, SK_B16_BITS)); - dst_g = _mm_srli_epi16(dst_g, SK_G16_BITS); - dst_r = _mm_add_epi16(dst_r, src_r); - - dst_b = _mm_srli_epi16(dst_b, SK_B16_BITS); - dst_g = _mm_add_epi16(dst_g, src_g); - - dst_r = _mm_srli_epi16(dst_r, (8 - SK_R16_BITS)); - dst_b = _mm_add_epi16(dst_b, src_b); - dst_g = _mm_srli_epi16(dst_g, (8 - SK_G16_BITS)); - dst_b = _mm_srli_epi16(dst_b, (8 - SK_B16_BITS)); - - // *dst = SkPackRGB16(dr, dg, db); - // SkToU16((r << SK_R16_SHIFT) | (g << SK_G16_SHIFT) | (b << SK_B16_SHIFT) - dst_r = _mm_slli_epi16(dst_r, SK_R16_SHIFT); - dst_g = _mm_slli_epi16(dst_g, SK_G16_SHIFT); - dst_r = _mm_or_si128(dst_r, dst_g); - dst_b = _mm_slli_epi16(dst_b, SK_B16_SHIFT); - dst_r = _mm_or_si128(dst_r, dst_b); - - _mm_storeu_si128((__m128i *)dst, dst_r); - - dst += 8; - src += 8; - count -= 8; - } - - while (count-- > 0) { - *dst = SkSrcOver32To16(*src, *dst); - src++; - dst++; - } -} - -/* SSE2 version of S32A_D565_Opaque_Dither() - * portable version is in core/SkBlitRow_D16.cpp - */ - -void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y) -{ - -#ifdef ENABLE_DITHER_MATRIX_4X4 - const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; -#else - uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; -#endif - - __m128i dither_v, dither_p, maskFF; - unsigned short dither_temp[4]; - - // if we are using SIMD, load dither_scan (or precursor) into 8 ints - if(count >= 8) { - -#ifdef ENABLE_DITHER_MATRIX_4X4 - // #define DITHER_VALUE(x) dither_scan[(x) & 3] - dither_temp[0] = dither_scan[x & 3]; - dither_temp[1] = dither_scan[x+1 & 3]; - dither_temp[2] = dither_scan[x+2 & 3]; - dither_temp[3] = dither_scan[x+3 & 3]; - dither_p = _mm_loadl_epi64((__m128i *) dither_temp); - dither_p = _mm_shuffle_epi32(dither_p, 0x44); -#else - dither_temp[0] = ((dither_scan >> ((x & 3) << 2)) & 0xF); - dither_temp[1] = ((dither_scan >> (((x+1) & 3) << 2)) & 0xF); - dither_temp[2] = ((dither_scan >> (((x+2) & 3) << 2)) & 0xF); - dither_temp[3] = ((dither_scan >> (((x+3) & 3) << 2)) & 0xF); - dither_p = _mm_loadl_epi64((__m128i *) dither_temp); - dither_p = _mm_shuffle_epi32(dither_p, 0x44); -#endif - maskFF = _mm_set1_epi32(0xFF); - } - - while (count >= 8) { - - __m128i src_pixel_l = _mm_loadu_si128((const __m128i*)(src)); - __m128i src_pixel_h = _mm_loadu_si128((const __m128i*)(src+4)); - - __m128i alpha_v = _mm_srli_epi32(src_pixel_l, 24); - alpha_v = _mm_packs_epi32(alpha_v, _mm_srli_epi32(src_pixel_h, 24)); - - dither_v = _mm_mullo_epi16(dither_p, _mm_add_epi16(alpha_v, _mm_set1_epi16(1))); - dither_v = _mm_srli_epi16(dither_v, 8); - - // sr = (sr + d - (sr >> 5)) - __m128i src_r_l = _mm_and_si128(src_pixel_l, maskFF); - __m128i src_r_h = _mm_and_si128(src_pixel_h, maskFF); - src_r_l = _mm_packs_epi32(src_r_l, src_r_h); - - __m128i srvs = _mm_srli_epi16(src_r_l, 5); - src_r_l = _mm_add_epi16(src_r_l, dither_v); - src_r_l = _mm_sub_epi16(src_r_l, srvs); - src_r_l = _mm_slli_epi16(src_r_l, 2); - - // sg = (sg + (d >> 1) - (sg >> 6)) - __m128i src_g_l = _mm_srli_epi32(src_pixel_l, 8); - src_g_l = _mm_and_si128(src_g_l, maskFF); - __m128i src_g_h = _mm_srli_epi32(src_pixel_h, 8); - src_g_h = _mm_and_si128(src_g_h, maskFF); - src_g_l = _mm_packs_epi32(src_g_l, src_g_h); - - __m128i sgvs = _mm_srli_epi16(src_g_l, 6); - src_g_l = _mm_add_epi16(src_g_l,_mm_srli_epi16(dither_v, 1)); - src_g_l = _mm_sub_epi16(src_g_l, sgvs); - src_g_l = _mm_slli_epi16(src_g_l, 3); - - //sb = (sb + d - (sb >> 5)) - __m128i src_b_l = _mm_srli_epi32(src_pixel_l, 16); - src_b_l = _mm_and_si128(src_b_l, maskFF); - __m128i src_b_h = _mm_srli_epi32(src_pixel_h, 16); - src_b_h = _mm_and_si128(src_b_h, maskFF); - src_b_l = _mm_packs_epi32(src_b_l, src_b_h); - - __m128i sbvs = _mm_srli_epi16(src_b_l, 5); - src_b_l = _mm_add_epi16(src_b_l, dither_v); - src_b_l = _mm_sub_epi16(src_b_l, sbvs); - src_b_l = _mm_slli_epi16(src_b_l, 2); - - // unpack dst - __m128i dst_pixel = _mm_loadu_si128((__m128i *)dst); - - // get dst red - __m128i dst_r = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); - dst_r = _mm_and_si128(dst_r, _mm_set1_epi16(SK_R16_MASK)); - - // get dst green - __m128i dst_g = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); - dst_g = _mm_and_si128(dst_g, _mm_set1_epi16(SK_G16_MASK)); - - // get dst blue - __m128i dst_b = _mm_and_si128(dst_pixel, _mm_set1_epi16(SK_R16_MASK)); - - //dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3) - __m128i avtemp = _mm_sub_epi16(_mm_set1_epi16(256), alpha_v); - avtemp = _mm_srli_epi16(avtemp, 3); - - dst_r = _mm_mullo_epi16(dst_r, avtemp); - dst_g = _mm_mullo_epi16(dst_g, avtemp); - dst_b = _mm_mullo_epi16(dst_b, avtemp); - - // *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5) - // (src_expanded + dst_expanded) >> 5 - - dst_r = _mm_add_epi16(dst_r, src_r_l); - dst_r = _mm_srli_epi16(dst_r, 5); - - dst_g = _mm_add_epi16(dst_g, src_g_l); - dst_g = _mm_srli_epi16(dst_g, 5); - - dst_b = _mm_add_epi16(dst_b, src_b_l); - dst_b = _mm_srli_epi16(dst_b, 5); - - dst_r = _mm_slli_epi16(dst_r, SK_R16_SHIFT); - dst_g = _mm_slli_epi16(dst_g, SK_G16_SHIFT); - dst_r = _mm_or_si128(dst_r, dst_g); - dst_r = _mm_or_si128(dst_r, dst_b); - - _mm_storeu_si128((__m128i *)dst, dst_r); - - src += 8; - dst += 8; - x += 8; - count -= 8; - } - - if (count > 0) { - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - if (c) { - unsigned a = SkGetPackedA32(c); - int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); - unsigned sr = SkGetPackedR32(c); - unsigned sg = SkGetPackedG32(c); - unsigned sb = SkGetPackedB32(c); - sr = SkDITHER_R32_FOR_565(sr, d); - sg = SkDITHER_G32_FOR_565(sg, d); - sb = SkDITHER_B32_FOR_565(sb, d); - - uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); - uint32_t dst_expanded = SkExpand_rgb_16(*dst); - dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); - // now src and dst expanded are in g:11 r:10 x:1 b:10 - *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); - } - dst += 1; - DITHER_INC_X(x); - } while (--count != 0); - } -} - -/* SSE2 version of S32_D565_Opaque() - * portable version is in core/SkBlitRow_D16.cpp - */ - -void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y) -{ - - __m128i mask_RB, mask_G; - - if(count >= 8) { - mask_RB = _mm_set1_epi32(0x1F); - mask_G = _mm_set1_epi32(0x3F); - } - - while (count >= 8) { - - __m128i src_pixel_l = _mm_loadu_si128((const __m128i*)(src)); - __m128i src_pixel_h = _mm_loadu_si128((const __m128i*)(src+4)); - - __m128i src_r_l = _mm_srli_epi32(src_pixel_l, 3); - src_r_l = _mm_and_si128(src_r_l, mask_RB); - __m128i src_r_h = _mm_srli_epi32(src_pixel_h, 3); - src_r_h = _mm_and_si128(src_r_h, mask_RB); - src_r_l = _mm_packs_epi32(src_r_l, src_r_h); - - __m128i src_g_l = _mm_srli_epi32(src_pixel_l, 10); - src_g_l = _mm_and_si128(src_g_l, mask_G); - __m128i src_g_h = _mm_srli_epi32(src_pixel_h, 10); - src_g_h = _mm_and_si128(src_g_h, mask_G); - src_g_l = _mm_packs_epi32(src_g_l, src_g_h); - - __m128i src_b_l = _mm_srli_epi32(src_pixel_l, 19); - src_b_l = _mm_and_si128(src_b_l, mask_RB); - __m128i src_b_h = _mm_srli_epi32(src_pixel_h, 19); - src_b_h = _mm_and_si128(src_b_h, mask_RB); - src_b_l = _mm_packs_epi32(src_b_l, src_b_h); - - src_r_l = _mm_slli_epi16(src_r_l, SK_R16_SHIFT); - src_g_l = _mm_slli_epi16(src_g_l, SK_G16_SHIFT); - src_r_l = _mm_or_si128(src_r_l, src_g_l); - src_r_l = _mm_or_si128(src_r_l, src_b_l); - - _mm_storeu_si128((__m128i *)dst, src_r_l); - - src += 8; - dst += 8; - count -= 8; - } - - if (count > 0) { - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - SkASSERT(SkGetPackedA32(c) == 255); - *dst++ = SkPixel32ToPixel16_ToU16(c); - } while (--count != 0); - } -} - -/* SSE2 version of S32_D565_Blend() - * portable version is in core/SkBlitRow_D16.cpp - */ - -void S32_D565_Blend_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y) -{ - - __m128i alpha_scale, mask_RB, mask_G; - - if(count >= 8) { - alpha_scale = _mm_set1_epi16(SkAlpha255To256(alpha)); - mask_RB = _mm_set1_epi32(0x1F); - mask_G = _mm_set1_epi32(0x3F); - } - - while (count >= 8) { - - __m128i src_pixel_l = _mm_loadu_si128((const __m128i*)(src)); - __m128i src_pixel_h = _mm_loadu_si128((const __m128i*)(src+4)); - - __m128i src_r_l = _mm_srli_epi32(src_pixel_l, 3); - src_r_l = _mm_and_si128(src_r_l, mask_RB); - __m128i src_r_h = _mm_srli_epi32(src_pixel_h, 3); - src_r_h = _mm_and_si128(src_r_h, mask_RB); - src_r_l = _mm_packs_epi32(src_r_l, src_r_h); - - __m128i src_g_l = _mm_srli_epi32(src_pixel_l, 10); - src_g_l = _mm_and_si128(src_g_l, mask_G); - __m128i src_g_h = _mm_srli_epi32(src_pixel_h, 10); - src_g_h = _mm_and_si128(src_g_h, mask_G); - src_g_l = _mm_packs_epi32(src_g_l, src_g_h); - - __m128i src_b_l = _mm_srli_epi32(src_pixel_l, 19); - src_b_l = _mm_and_si128(src_b_l, mask_RB); - __m128i src_b_h = _mm_srli_epi32(src_pixel_h, 19); - src_b_h = _mm_and_si128(src_b_h, mask_RB); - src_b_l = _mm_packs_epi32(src_b_l, src_b_h); - - __m128i dst_pixel = _mm_loadu_si128((__m128i *)dst); - __m128i dst_r = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); - src_r_l = _mm_sub_epi16(src_r_l, dst_r); - src_r_l = _mm_mullo_epi16(src_r_l, alpha_scale); - src_r_l = _mm_srli_epi16(src_r_l, 8); - src_r_l = _mm_add_epi16(src_r_l, dst_r); - src_r_l = _mm_and_si128(src_r_l, _mm_set1_epi16(SK_R16_MASK)); - - __m128i dst_g = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); - dst_g = _mm_and_si128(dst_g, _mm_set1_epi16(SK_G16_MASK)); - src_g_l = _mm_sub_epi16(src_g_l, dst_g); - src_g_l = _mm_mullo_epi16(src_g_l, alpha_scale); - src_g_l = _mm_srli_epi16(src_g_l, 8); - src_g_l = _mm_add_epi16(src_g_l, dst_g); - src_g_l = _mm_and_si128(src_g_l, _mm_set1_epi16(SK_G16_MASK)); - - __m128i dst_b = _mm_and_si128(dst_pixel, _mm_set1_epi16(SK_B16_MASK)); - src_b_l = _mm_sub_epi16(src_b_l, dst_b); - src_b_l = _mm_mullo_epi16(src_b_l, alpha_scale); - src_b_l = _mm_srli_epi16(src_b_l, 8); - src_b_l = _mm_add_epi16(src_b_l, dst_b); - src_b_l = _mm_and_si128(src_b_l, _mm_set1_epi16(SK_B16_MASK)); - - src_r_l = _mm_slli_epi16(src_r_l, SK_R16_SHIFT); - src_g_l = _mm_slli_epi16(src_g_l, SK_G16_SHIFT); - src_r_l = _mm_or_si128(src_r_l, src_g_l); - src_r_l = _mm_or_si128(src_r_l, src_b_l); - - _mm_storeu_si128((__m128i *)dst, src_r_l); - - src += 8; - dst += 8; - count -= 8; - } - - - if (count > 0) { - int scale = SkAlpha255To256(alpha); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - SkASSERT(SkGetPackedA32(c) == 255); - uint16_t d = *dst; - *dst++ = SkPackRGB16( - SkAlphaBlend(SkPacked32ToR16(c), SkGetPackedR16(d), scale), - SkAlphaBlend(SkPacked32ToG16(c), SkGetPackedG16(d), scale), - SkAlphaBlend(SkPacked32ToB16(c), SkGetPackedB16(d), scale)); - } while (--count != 0); - } -} - -/* SSE2 version of S32_D565_Opaque_Dither() - * portable version is in core/SkBlitRow_D16.cpp - */ - -void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y) -{ -#ifdef ENABLE_DITHER_MATRIX_4X4 - const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; -#else - uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; -#endif - - __m128i dither_v, maskFF; - unsigned short dither_temp[4]; - - // if we are using SIMD, load dither_scan (or precursor) into 8 ints - if(count >= 8) { - -#ifdef ENABLE_DITHER_MATRIX_4X4 - // #define DITHER_VALUE(x) dither_scan[(x) & 3] - dither_temp[0] = dither_scan[x & 3]; - dither_temp[1] = dither_scan[x+1 & 3]; - dither_temp[2] = dither_scan[x+2 & 3]; - dither_temp[3] = dither_scan[x+3 & 3]; - dither_v = _mm_loadl_epi64((__m128i *) dither_temp); - dither_v = _mm_shuffle_epi32(dither_v, 0x44); -#else - dither_temp[0] = ((dither_scan >> ((x & 3) << 2)) & 0xF); - dither_temp[1] = ((dither_scan >> (((x+1) & 3) << 2)) & 0xF); - dither_temp[2] = ((dither_scan >> (((x+2) & 3) << 2)) & 0xF); - dither_temp[3] = ((dither_scan >> (((x+3) & 3) << 2)) & 0xF); - dither_v = _mm_loadl_epi64((__m128i *) dither_temp); - dither_v = _mm_shuffle_epi32(dither_v, 0x44); -#endif - maskFF = _mm_set1_epi32(0xFF); - } - - while (count >= 8) { - - __m128i src_pixel_l = _mm_loadu_si128((const __m128i*)(src)); - __m128i src_pixel_h = _mm_loadu_si128((const __m128i*)(src+4)); - - // sr = (sr + d - (sr >> 5)) - __m128i src_r_l = _mm_and_si128(src_pixel_l, maskFF); - __m128i src_r_h = _mm_and_si128(src_pixel_h, maskFF); - src_r_l = _mm_packs_epi32(src_r_l, src_r_h); - - __m128i srvs = _mm_srli_epi16(src_r_l, 5); - src_r_l = _mm_add_epi16(src_r_l, dither_v); - src_r_l = _mm_sub_epi16(src_r_l, srvs); - src_r_l = _mm_srli_epi16(src_r_l, 3); - - // sg = (sg + (d >> 1) - (sg >> 6)) - __m128i src_g_l = _mm_srli_epi32(src_pixel_l, 8); - src_g_l = _mm_and_si128(src_g_l, maskFF); - __m128i src_g_h = _mm_srli_epi32(src_pixel_h, 8); - src_g_h = _mm_and_si128(src_g_h, maskFF); - src_g_l = _mm_packs_epi32(src_g_l, src_g_h); - - __m128i sgvs = _mm_srli_epi16(src_g_l, 6); - src_g_l = _mm_add_epi16(src_g_l,_mm_srli_epi16(dither_v, 1)); - src_g_l = _mm_sub_epi16(src_g_l, sgvs); - src_g_l = _mm_srli_epi16(src_g_l, 2); - - //sb = (sb + d - (sb >> 5)) - __m128i src_b_l = _mm_srli_epi32(src_pixel_l, 16); - src_b_l = _mm_and_si128(src_b_l, maskFF); - __m128i src_b_h = _mm_srli_epi32(src_pixel_h, 16); - src_b_h = _mm_and_si128(src_b_h, maskFF); - src_b_l = _mm_packs_epi32(src_b_l, src_b_h); - - __m128i sbvs = _mm_srli_epi16(src_b_l, 5); - src_b_l = _mm_add_epi16(src_b_l, dither_v); - src_b_l = _mm_sub_epi16(src_b_l, sbvs); - src_b_l = _mm_srli_epi16(src_b_l, 3); - - src_r_l = _mm_slli_epi16(src_r_l, SK_R16_SHIFT); - src_g_l = _mm_slli_epi16(src_g_l, SK_G16_SHIFT); - src_r_l = _mm_or_si128(src_r_l, src_g_l); - src_r_l = _mm_or_si128(src_r_l, src_b_l); - - _mm_storeu_si128((__m128i *)dst, src_r_l); - - src += 8; - dst += 8; - x += 8; - count -= 8; - } - - if (count > 0) { - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - SkASSERT(SkGetPackedA32(c) == 255); - - unsigned dither = DITHER_VALUE(x); - *dst++ = SkDitherRGB32To565(c, dither); - DITHER_INC_X(x); - } while (--count != 0); - } -} - -/* SSE2 version of S32_D565_Blend_Dither() - * portable version is in core/SkBlitRow_D16.cpp - */ - -void S32_D565_Blend_Dither_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y) -{ -#ifdef ENABLE_DITHER_MATRIX_4X4 - const uint8_t* dither_scan = gDitherMatrix_3Bit_4X4[(y) & 3]; -#else - uint16_t dither_scan = gDitherMatrix_3Bit_16[(y) & 3]; -#endif - - __m128i dither_v, maskFF, alpha_scale; - unsigned short dither_temp[4]; - - // if we are using SIMD, load dither_scan (or precursor) into 8 ints - if(count >= 8) { - -#ifdef ENABLE_DITHER_MATRIX_4X4 - // #define DITHER_VALUE(x) dither_scan[(x) & 3] - dither_temp[0] = dither_scan[x & 3]; - dither_temp[1] = dither_scan[x+1 & 3]; - dither_temp[2] = dither_scan[x+2 & 3]; - dither_temp[3] = dither_scan[x+3 & 3]; - dither_v = _mm_loadl_epi64((__m128i *) dither_temp); - dither_v = _mm_shuffle_epi32(dither_v, 0x44); -#else - dither_temp[0] = ((dither_scan >> ((x & 3) << 2)) & 0xF); - dither_temp[1] = ((dither_scan >> (((x+1) & 3) << 2)) & 0xF); - dither_temp[2] = ((dither_scan >> (((x+2) & 3) << 2)) & 0xF); - dither_temp[3] = ((dither_scan >> (((x+3) & 3) << 2)) & 0xF); - dither_v = _mm_loadl_epi64((__m128i *) dither_temp); - dither_v = _mm_shuffle_epi32(dither_v, 0x44); -#endif - maskFF = _mm_set1_epi32(0xFF); - alpha_scale = _mm_set1_epi16(SkAlpha255To256(alpha)); - } - - while (count >= 8) { - - __m128i src_pixel_l = _mm_loadu_si128((const __m128i*)(src)); - __m128i src_pixel_h = _mm_loadu_si128((const __m128i*)(src+4)); - - // sr = (sr + d - (sr >> 5)) - __m128i src_r_l = _mm_and_si128(src_pixel_l, maskFF); - __m128i src_r_h = _mm_and_si128(src_pixel_h, maskFF); - src_r_l = _mm_packs_epi32(src_r_l, src_r_h); - - __m128i srvs = _mm_srli_epi16(src_r_l, 5); - src_r_l = _mm_add_epi16(src_r_l, dither_v); - src_r_l = _mm_sub_epi16(src_r_l, srvs); - src_r_l = _mm_srli_epi16(src_r_l, 3); - - // sg = (sg + (d >> 1) - (sg >> 6)) - __m128i src_g_l = _mm_srli_epi32(src_pixel_l, 8); - src_g_l = _mm_and_si128(src_g_l, maskFF); - __m128i src_g_h = _mm_srli_epi32(src_pixel_h, 8); - src_g_h = _mm_and_si128(src_g_h, maskFF); - src_g_l = _mm_packs_epi32(src_g_l, src_g_h); - - __m128i sgvs = _mm_srli_epi16(src_g_l, 6); - src_g_l = _mm_add_epi16(src_g_l,_mm_srli_epi16(dither_v, 1)); - src_g_l = _mm_sub_epi16(src_g_l, sgvs); - src_g_l = _mm_srli_epi16(src_g_l, 2); - - //sb = (sb + d - (sb >> 5)) - __m128i src_b_l = _mm_srli_epi32(src_pixel_l, 16); - src_b_l = _mm_and_si128(src_b_l, maskFF); - __m128i src_b_h = _mm_srli_epi32(src_pixel_h, 16); - src_b_h = _mm_and_si128(src_b_h, maskFF); - src_b_l = _mm_packs_epi32(src_b_l, src_b_h); - - __m128i sbvs = _mm_srli_epi16(src_b_l, 5); - src_b_l = _mm_add_epi16(src_b_l, dither_v); - src_b_l = _mm_sub_epi16(src_b_l, sbvs); - src_b_l = _mm_srli_epi16(src_b_l, 3); - - __m128i dst_pixel = _mm_loadu_si128((__m128i *)dst); - __m128i dst_r = _mm_srli_epi16(dst_pixel, SK_R16_SHIFT); - src_r_l = _mm_sub_epi16(src_r_l, dst_r); - src_r_l = _mm_mullo_epi16(src_r_l, alpha_scale); - src_r_l = _mm_srli_epi16(src_r_l, 8); - src_r_l = _mm_add_epi16(src_r_l, dst_r); - src_r_l = _mm_and_si128(src_r_l, _mm_set1_epi16(SK_R16_MASK)); - - __m128i dst_g = _mm_srli_epi16(dst_pixel, SK_G16_SHIFT); - dst_g = _mm_and_si128(dst_g, _mm_set1_epi16(SK_G16_MASK)); - src_g_l = _mm_sub_epi16(src_g_l, dst_g); - src_g_l = _mm_mullo_epi16(src_g_l, alpha_scale); - src_g_l = _mm_srli_epi16(src_g_l, 8); - src_g_l = _mm_add_epi16(src_g_l, dst_g); - src_g_l = _mm_and_si128(src_g_l, _mm_set1_epi16(SK_G16_MASK)); - - __m128i dst_b = _mm_and_si128(dst_pixel, _mm_set1_epi16(SK_B16_MASK)); - src_b_l = _mm_sub_epi16(src_b_l, dst_b); - src_b_l = _mm_mullo_epi16(src_b_l, alpha_scale); - src_b_l = _mm_srli_epi16(src_b_l, 8); - src_b_l = _mm_add_epi16(src_b_l, dst_b); - src_b_l = _mm_and_si128(src_b_l, _mm_set1_epi16(SK_B16_MASK)); - - src_r_l = _mm_slli_epi16(src_r_l, SK_R16_SHIFT); - src_g_l = _mm_slli_epi16(src_g_l, SK_G16_SHIFT); - src_r_l = _mm_or_si128(src_r_l, src_g_l); - src_r_l = _mm_or_si128(src_r_l, src_b_l); - - _mm_storeu_si128((__m128i *)dst, src_r_l); - - src += 8; - dst += 8; - x += 8; - count -= 8; - } - - if (count > 0) { - int scale = SkAlpha255To256(alpha); - DITHER_565_SCAN(y); - do { - SkPMColor c = *src++; - SkPMColorAssert(c); - SkASSERT(SkGetPackedA32(c) == 255); - - int dither = DITHER_VALUE(x); - int sr = SkGetPackedR32(c); - int sg = SkGetPackedG32(c); - int sb = SkGetPackedB32(c); - sr = SkDITHER_R32To565(sr, dither); - sg = SkDITHER_G32To565(sg, dither); - sb = SkDITHER_B32To565(sb, dither); - - uint16_t d = *dst; - *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), - SkAlphaBlend(sg, SkGetPackedG16(d), scale), - SkAlphaBlend(sb, SkGetPackedB16(d), scale)); - DITHER_INC_X(x); - } while (--count != 0); - } -} diff --git a/src/opts/SkBlitRow_opts_SSE2.h b/src/opts/SkBlitRow_opts_SSE2.h index 05f3893..b443ec7 100644 --- a/src/opts/SkBlitRow_opts_SSE2.h +++ b/src/opts/SkBlitRow_opts_SSE2.h @@ -28,27 +28,3 @@ void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor); void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[], SkColor color, int width, SkPMColor opaqueDst); - -void S32A_D565_Blend_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/); - -void S32A_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int /*x*/, int /*y*/); - -void S32A_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, - const SkPMColor* SK_RESTRICT src, int count, - U8CPU alpha, int x, int y); - -void S32_D565_Opaque_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y); - -void S32_D565_Blend_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y); - -void S32_D565_Opaque_Dither_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y); - -void S32_D565_Blend_Dither_SSE2(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src,int count, - U8CPU alpha, int x, int y); diff --git a/src/opts/opts_check_SSE2.cpp b/src/opts/opts_check_SSE2.cpp index 5050ea2..1a704b8 100644 --- a/src/opts/opts_check_SSE2.cpp +++ b/src/opts/opts_check_SSE2.cpp @@ -152,32 +152,12 @@ static SkBlitRow::Proc32 platform_32_procs[] = { S32A_Blend_BlitRow32_SSE2, // S32A_Blend, }; -static SkBlitRow::Proc platform_565_procs[] = { - // no dither - S32_D565_Opaque_SSE2, // S32_D565_Opaque, - S32_D565_Blend_SSE2, // S32_D565_Blend, - - S32A_D565_Opaque_SSE2, // S32A_D565_Opaque - S32A_D565_Blend_SSE2, // S32A_D565_Blend - - // dither - S32_D565_Opaque_Dither_SSE2, // S32_D565_Opaque_Dither, - S32_D565_Blend_Dither_SSE2, // S32_D565_Blend_Dither, - - S32A_D565_Opaque_Dither_SSE2, // S32A_D565_Opaque_Dither - NULL // S32A_D565_Blend_Dither -}; - SkBlitRow::Proc SkBlitRow::PlatformProcs4444(unsigned flags) { return NULL; } SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) { - if (hasSSE2()) { - return platform_565_procs[flags]; - }else { - return NULL; - } + return NULL; } SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {