From 522aa8d4d61f30bddbaf996b2933d5f7b1f23880 Mon Sep 17 00:00:00 2001 From: "reed@android.com" Date: Thu, 22 Oct 2009 20:26:53 +0000 Subject: [PATCH] fix S32A_D565_Blend_neon git-svn-id: http://skia.googlecode.com/svn/trunk@405 2bbb7eff-a529-9590-31e7-b0007b416f81 --- src/core/SkBitmapProcState_matrix_clamp.h | 110 +++++------ src/core/SkBitmapProcState_matrix_repeat.h | 66 ++++--- src/opts/SkBlitRow_opts_arm.cpp | 291 ++++++++++++++++++++++------- 3 files changed, 312 insertions(+), 155 deletions(-) diff --git a/src/core/SkBitmapProcState_matrix_clamp.h b/src/core/SkBitmapProcState_matrix_clamp.h index 565af10..2243468 100644 --- a/src/core/SkBitmapProcState_matrix_clamp.h +++ b/src/core/SkBitmapProcState_matrix_clamp.h @@ -299,8 +299,8 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, #endif #if 1 - // 2009/9/30) crashes in ApiDemos - Views - Animation - 3D Transition - // 2009/10/9: reworked, seems right now + // 2009/9/30: crashes in ApiDemos - Views - Animation - 3D Transition + // 2009/10/9: reworked to avoid illegal (but allowed by gas) insn /* srcXY is a batch of 32 bit numbers X0,Y0,X1,Y1... * but we immediately discard the low 16 bits... @@ -389,18 +389,18 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, register int16x4_t out_y asm("d17") = yhi; asm ("vst2.16 {d16-d17},[%2] /* xlo=%P0 xhi=%P1 */" - : - : "w" (out_x), "w" (out_y), "r" (mydst) - ); + : + : "w" (out_x), "w" (out_y), "r" (mydst) + ); } { register int16x4_t out_x asm("d18") = x2hi; register int16x4_t out_y asm("d19") = y2hi; asm ("vst2.16 {d18-d19},[%2] /* xlo=%P0 xhi=%P1 */" - : - : "w" (out_x), "w" (out_y), "r" (mydst+8) - ); + : + : "w" (out_x), "w" (out_y), "r" (mydst+8) + ); } /* XXX: gcc isn't interleaving these with the NEON ops @@ -425,39 +425,38 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, /* for checking our NEON-produced results against vanilla code */ { int bad = (-1); - for (int i = 0; i < base_count; i++) - { - uint32_t val; - val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | - TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); + for (int i = 0; i < base_count; i++) { + uint32_t val; + val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | + TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); - if (val != base_xy[i]) { - bad = i; - break; - } + if (val != base_xy[i]) { + bad = i; + break; + } } if (bad >= 0) { - SkDebugf ("clamp-nofilter-persp failed piece %d\n", bad); - SkDebugf (" maxX %08x maxY %08x\n", maxX, maxY); - bad -= (bad & 0x7); /* align */ - for (int i = bad; i < bad + 8; i++) { - uint32_t val; - val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | - TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); - - SkDebugf ("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n", - i, base_xy[i], val, base_srcXY[i * 2 + 0], - base_srcXY[i * 2 + 1]); - } - SkDebugf ("---\n"); + SkDebugf("clamp-nofilter-persp failed piece %d\n", bad); + SkDebugf(" maxX %08x maxY %08x\n", maxX, maxY); + bad -= (bad & 0x7); /* align */ + for (int i = bad; i < bad + 8; i++) { + uint32_t val; + val = (TILEY_PROCF (base_srcXY[i * 2 + 1], maxY) << 16) | + TILEX_PROCF (base_srcXY[i * 2 + 0], maxX); + + SkDebugf("%d: got %08x want %08x srcXY[0] %08x srcXY[1] %08x\n", + i, base_xy[i], val, base_srcXY[i * 2 + 0], + base_srcXY[i * 2 + 1]); + } + SkDebugf ("---\n"); } if (end_xy != xy) { - SkDebugf ("xy ended at %08x, should be %08x\n", xy, end_xy); + SkDebugf("xy ended at %08x, should be %08x\n", xy, end_xy); } if (end_srcXY != srcXY) { - SkDebugf ("srcXY ended at %08x, should be %08x\n", srcXY, - end_srcXY); + SkDebugf("srcXY ended at %08x, should be %08x\n", srcXY, + end_srcXY); } } #endif @@ -621,10 +620,6 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, int32x4_t wide_dx, wide_fx, wide_onex, wide_fx1; int32x4_t wide_dy, wide_fy, wide_oney, wide_fy1; - /* need side-by-side registers for vst2.32 tricks */ - register int32x4_t wide_x asm("q7"); - register int32x4_t wide_y asm("q6"); - #undef AFFINE_DEBUG #if defined(AFFINE_DEBUG) SkFixed fyp = fy; @@ -649,6 +644,9 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, wide_oney = vdupq_n_s32(oneY); while (count >= 4) { + int32x4_t wide_x; + int32x4_t wide_y; + /* do the X side, then the Y side, then interleave them */ /* original expands to: @@ -698,10 +696,17 @@ static void AFFINE_FILTER_NAME(const SkBitmapProcState& s, wide_y = vorrq_s32(wide_i, wide_fy1); /* interleave as YXYXYXYX as part of the storing */ - asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */" - : - : "w" (wide_y), "w" (wide_x), "r" (xy) - ); + { + /* vst2.32 needs side-by-side registers */ + register int32x4_t t_x asm("q1"); + register int32x4_t t_y asm("q0"); + + t_x = wide_x; t_y = wide_y; + asm ("vst2.32 {q0-q1},[%2] /* y=%q0 x=%q1 */" + : + : "w" (t_y), "w" (t_x), "r" (xy) + ); + } #if defined(AFFINE_DEBUG) /* make sure we're good here -- check the 4 we just output */ @@ -773,22 +778,21 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s, int32x4_t wide_fy1; int32x4_t wide_x, wide_y; - /* need side-by-side regs for vld2/vst2 tricks */ - /* RBE: avoid low registers */ - register int32x4_t wide_first asm ("q6"); - register int32x4_t wide_second asm ("q7"); - while (count >= 4) { + /* need side-by-side regs for vld2/vst2 tricks */ + register int32x4_t wide_first asm ("q0"); + register int32x4_t wide_second asm ("q1"); + /* RBE: it's good, but: * -- we spill a constant that could be easily regnerated * [perhaps tweak gcc's NEON constant costs?] */ /* load src: x-y-x-y-x-y-x-y */ - asm ("vld2.32 {q6-q7},[%2] /* x=%q0 y=%q1 */" + asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" : "=w" (wide_first), "=w" (wide_second) : "r" (srcXY)); - + /* immediately get into vars gcc can move around if needed */ wide_x = wide_first; wide_y = wide_second; @@ -848,16 +852,16 @@ static void PERSP_FILTER_NAME(const SkBitmapProcState& s, /* switch them around; have to do it this way to get them * in the proper registers to match our instruction */ - /* wide_x/wide_y are fixed regs, in wrong order; swap 'em */ - wide_first = wide_y; - wide_second = wide_x; - /* iteration bookkeeping, ahead of the asm() for scheduling */ srcXY += 2*4; count -= 4; /* store interleaved as y-x-y-x-y-x-y-x (NB != read order) */ - asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */" + /* wide_x/wide_y are fixed regs, in wrong order; swap 'em */ + wide_first = wide_y; + wide_second = wide_x; + + asm ("vst2.32 {q6-q7},[%2] /* y=%q0 x=%q1 */" : : "w" (wide_first), "w" (wide_second), "r" (xy)); diff --git a/src/core/SkBitmapProcState_matrix_repeat.h b/src/core/SkBitmapProcState_matrix_repeat.h index 8d743a5..e5309f8 100644 --- a/src/core/SkBitmapProcState_matrix_repeat.h +++ b/src/core/SkBitmapProcState_matrix_repeat.h @@ -95,8 +95,7 @@ static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s, * but some processing of the 'fx' information * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ - if (count >= 8) - { + if (count >= 8) { /* SkFixed is 16.16 fixed point */ SkFixed dx2 = dx+dx; SkFixed dx4 = dx2+dx2; @@ -224,8 +223,7 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, * but some processing of the 'fx' information * TILEX_PROCF(fx, max) (((fx) & 0xFFFF) * ((max) + 1) >> 16) */ - if (count >= 4) - { + if (count >= 4) { /* SkFixed is 16.16 fixed point */ SkFixed dx4 = dx*4; SkFixed dy4 = dy*4; @@ -257,11 +255,10 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, ybase = vsetq_lane_s32(fy+dy+dy+dy, ybase, 3); /* store & bump */ - do - { + do { int32x4_t xout; - int32x4_t yout; - int16x8_t hi16; + int32x4_t yout; + int16x8_t hi16; /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ /* mask to low 16 [would like to use uzp tricks) */ @@ -303,11 +300,11 @@ static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s, ofx += odx; ofy += ody; } if (bad) { - SkDebugf("repeat-nofilter-affine fails\n"); - SkDebugf("count %d myi %d\n", ocount, myi); - SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n", - bfx, bdx, bfy, bdy); - SkDebugf("maxX %08x maxY %08x\n", maxX, maxY); + SkDebugf("repeat-nofilter-affine fails\n"); + SkDebugf("count %d myi %d\n", ocount, myi); + SkDebugf(" bfx %08x, bdx %08x, bfy %08x bdy %08x\n", + bfx, bdx, bfy, bdy); + SkDebugf("maxX %08x maxY %08x\n", maxX, maxY); } #endif #endif @@ -362,27 +359,36 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, int32_t *mysrc = (int32_t *) srcXY; int16_t *mydst = (int16_t *) xy; do { - register int32x4_t x asm("q0"); - register int32x4_t y asm("q1"); - register int32x4_t x2 asm("q2"); - register int32x4_t y2 asm("q3"); - - int16x8_t hi; - int16x8_t hi2; + int32x4_t x, y, x2, y2; + int16x8_t hi, hi2; /* read array of x,y,x,y,x,y */ /* vld2 does the de-interleaving for us */ - /* dependent on register assignments above */ - asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" - : "=w" (x), "=w" (y) - : "r" (mysrc) - ); + /* isolate reg-bound scopes; gcc will minimize register + * motion if possible; this ensures that we don't lose + * a register across a debugging call because it happens + * to be bound into a call-clobbered register + */ + { + register int32x4_t q0 asm("q0"); + register int32x4_t q1 asm("q1"); + asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" + : "=w" (q0), "=w" (q1) + : "r" (mysrc) + ); + x = q0; y = q1; + } /* offset == 256 bits == 32 bytes == 8 longs */ - asm ("vld2.32 {q2-q3},[%2] /* x=%q0 y=%q1 */" - : "=w" (x2), "=w" (y2) - : "r" (mysrc+8) - ); + { + register int32x4_t q2 asm("q2"); + register int32x4_t q3 asm("q3"); + asm ("vld2.32 {q0-q1},[%2] /* x=%q0 y=%q1 */" + : "=w" (q2), "=w" (q3) + : "r" (mysrc+8) + ); + x = q2; y = q3; + } /* TILEX_PROCF(fx, max) (((fx)&0xFFFF)*((max)+1)>> 16) */ /* mask to low 16 [would like to use uzp tricks) */ @@ -405,7 +411,7 @@ static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s, hi = vreinterpretq_s16_s32(y); vst1q_s16(mydst, hi); - /* and push second 8 entries out */ + /* and likewise for the second 8 entries */ y2 = vsriq_n_s32(y2, x2, 16); hi2 = vreinterpretq_s16_s32(y2); vst1q_s16(mydst+8, hi2); diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 89b2e91..506c4d4 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -104,8 +104,9 @@ static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", "d30","d31" ); - } else { - // handle count < 8 + } + else + { // handle count < 8 uint16_t* SK_RESTRICT keep_dst; asm volatile ( @@ -427,20 +428,17 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, SkASSERT(255 == alpha); if (count > 0) { + + uint8x8_t alpha_mask; + + static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; + alpha_mask = vld1_u8(alpha_mask_setup); + /* do the NEON unrolled code */ #define UNROLL 4 while (count >= UNROLL) { uint8x8_t src_raw, dst_raw, dst_final; uint8x8_t src_raw_2, dst_raw_2, dst_final_2; - uint8x8_t alpha_mask; - - /* use vtbl, with src_raw as the table */ - /* expect gcc to hoist alpha_mask setup above loop */ - alpha_mask = vdup_n_u8(3); - alpha_mask = vset_lane_u8(7, alpha_mask, 4); - alpha_mask = vset_lane_u8(7, alpha_mask, 5); - alpha_mask = vset_lane_u8(7, alpha_mask, 6); - alpha_mask = vset_lane_u8(7, alpha_mask, 7); /* get the source */ src_raw = vreinterpret_u8_u32(vld1_u32(src)); @@ -454,7 +452,6 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); #endif -#if 1 /* 1st and 2nd bits of the unrolling */ { uint8x8_t dst_cooked; @@ -464,12 +461,10 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, /* get the alphas spread out properly */ alpha_narrow = vtbl1_u8(src_raw, alpha_mask); - alpha_narrow = vsub_u8(vdup_n_u8(255), alpha_narrow); - alpha_wide = vmovl_u8(alpha_narrow); + alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); - /* get the dest, spread it */ - dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); + /* spread the dest */ dst_wide = vmovl_u8(dst_raw); /* alpha mul the dest */ @@ -479,7 +474,6 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, /* sum -- ignoring any byte lane overflows */ dst_final = vadd_u8(src_raw, dst_cooked); } -#endif #if UNROLL > 2 /* the 3rd and 4th bits of our unrolling */ @@ -490,11 +484,10 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, uint16x8_t alpha_wide; alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); - alpha_narrow = vsub_u8(vdup_n_u8(255), alpha_narrow); - alpha_wide = vmovl_u8(alpha_narrow); + alpha_wide = vsubw_u8(vdupq_n_u16(255), alpha_narrow); alpha_wide = vaddq_u16(alpha_wide, vshrq_n_u16(alpha_wide,7)); - /* get the dest, spread it */ + /* spread the dest */ dst_wide = vmovl_u8(dst_raw_2); /* alpha mul the dest */ @@ -544,7 +537,7 @@ static void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, #endif /* Neon version of S32_Blend_BlitRow32() - * portable version is in core/SkBlitRow_D32.cpp + * portable version is in src/core/SkBlitRow_D32.cpp */ #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, @@ -578,18 +571,17 @@ static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, /* get 64 bits of src, widen it, multiply by src_scale */ src_raw = vreinterpret_u8_u32(vld1_u32(src)); src_wide = vmovl_u8(src_raw); - /* gcc hoists vdupq_n_u16(), better code than vmulq_n_u16() */ + /* gcc hoists vdupq_n_u16(), better than using vmulq_n_u16() */ src_wide = vmulq_u16 (src_wide, vdupq_n_u16(src_scale)); /* ditto with dst */ dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); dst_wide = vmovl_u8(dst_raw); - dst_wide = vmulq_u16 (dst_wide, vdupq_n_u16(dst_scale)); - /* sum (knowing it won't overflow 16 bits) and take high bits */ - dst_wide = vaddq_u16(dst_wide, src_wide); - dst_final = vshrn_n_u16(dst_wide, 8); + /* combine add with dst multiply into mul-accumulate */ + dst_wide = vmlaq_u16(src_wide, dst_wide, vdupq_n_u16(dst_scale)); + dst_final = vshrn_n_u16(dst_wide, 8); vst1_u32(dst, vreinterpret_u32_u8(dst_final)); src += UNROLL; @@ -630,73 +622,234 @@ static void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, /////////////////////////////////////////////////////////////////////////////// #if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) -/* RBE: working on this 2009/10/8 */ -static void S32A_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, + +#undef DEBUG_OPAQUE_DITHER + +#if defined(DEBUG_OPAQUE_DITHER) +static void showme8(char *str, void *p, int len) +{ + static char buf[256]; + char tbuf[32]; + int i; + char *pc = (char*) p; + sprintf(buf,"%8s:", str); + for(i=0;i= UNROLL) { + uint8x8_t dbase; + +#if defined(DEBUG_OPAQUE_DITHER) + uint16_t tmpbuf[UNROLL]; + int td[UNROLL]; + int tdv[UNROLL]; + int ta[UNROLL]; + int tap[UNROLL]; + uint16_t in_dst[UNROLL]; + int offset = 0; + int noisy = 0; +#endif + + const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; + dbase = vld1_u8(dstart); + + do { + uint8x8_t sr, sg, sb, sa, d; + uint16x8_t dst8, scale8, alpha; + uint16x8_t dst_r, dst_g, dst_b; + +#if defined(DEBUG_OPAQUE_DITHER) + /* calculate 8 elements worth into a temp buffer */ + { + int my_y = y; + int my_x = x; + SkPMColor* my_src = (SkPMColor*)src; + uint16_t* my_dst = dst; + int i; + + DITHER_565_SCAN(my_y); + for(i=0;i> 3); + // now src and dst expanded are in g:11 r:10 x:1 b:10 + tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); + td[i] = d; + + } else { + tmpbuf[i] = *my_dst; + ta[i] = tdv[i] = td[i] = 0xbeef; + } + in_dst[i] = *my_dst; + my_dst += 1; + DITHER_INC_X(my_x); + } + } +#endif + + /* source is in ABGR */ + { + register uint8x8_t d0 asm("d0"); + register uint8x8_t d1 asm("d1"); + register uint8x8_t d2 asm("d2"); + register uint8x8_t d3 asm("d3"); + + asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) + : "r" (src) + ); + sr = d0; sg = d1; sb = d2; sa = d3; + } + + /* calculate 'd', which will be 0..7 */ + /* dbase[] is 0..7; alpha is 0..256; 16 bits suffice */ + alpha = vaddw_u8(vmovl_u8(sa), vshr_n_u8(sa, 7)); + alpha = vmulq_u16(alpha, vmovl_u8(dbase)); + d = vshrn_n_u16(alpha, 8); /* narrowing too */ + + /* sr = sr - (sr>>5) + d */ + /* watching for 8-bit overflow. d is 0..7; risky range of + * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; + * safe as long as we do ((sr-sr>>5) + d) */ + sr = vsub_u8(sr, vshr_n_u8(sr, 5)); + sr = vadd_u8(sr, d); + + /* sb = sb - (sb>>5) + d */ + sb = vsub_u8(sb, vshr_n_u8(sb, 5)); + sb = vadd_u8(sb, d); + + /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ + sg = vsub_u8(sg, vshr_n_u8(sg, 6)); + sg = vadd_u8(sg, vshr_n_u8(d,1)); + + /* need to pick up 8 dst's -- at 16 bits each, 128 bits */ + dst8 = vld1q_u16(dst); + dst_b = vandq_u16(dst8, vdupq_n_u16(0x001F)); + dst_g = vandq_u16(vshrq_n_u16(dst8,5), vdupq_n_u16(0x003F)); + dst_r = vshrq_n_u16(dst8,11); /* clearing hi bits */ + + /* blend */ + scale8 = vsubw_u8(vdupq_n_u16(255), sa); + scale8 = vaddq_u16(scale8, vshrq_n_u16(scale8, 7)); + scale8 = vshrq_n_u16(scale8, 3); + dst_b = vmulq_u16(dst_b, scale8); + dst_g = vmulq_u16(dst_g, scale8); + dst_r = vmulq_u16(dst_r, scale8); + + /* combine */ + /* NB: vshll widens, need to preserve those bits */ + dst_b = vaddq_u16(dst_b, vshll_n_u8(sb,2)); + dst_g = vaddq_u16(dst_g, vshll_n_u8(sg,3)); + dst_r = vaddq_u16(dst_r, vshll_n_u8(sr,2)); + + /* repack to store */ + dst8 = vandq_u16(vshrq_n_u16(dst_b, 5), vdupq_n_u16(0x001F)); + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); + + vst1q_u16(dst, dst8); + +#if defined(DEBUG_OPAQUE_DITHER) + /* verify my 8 elements match the temp buffer */ + { + int i, bad=0; + static int invocation; + + for (i=0;i= UNROLL); + } +#undef UNROLL + + /* residuals */ if (count > 0) { DITHER_565_SCAN(y); do { SkPMColor c = *src++; SkPMColorAssert(c); - /* RBE: make sure we don't generate wrong output if c==0 */ if (c) { - - /* let's do a vld4 to get 64 bits (8 bytes) of each Argb */ - /* so we'll have 8 a's, 8 r's, etc */ - /* little endian: ABGR is the ordering (R at lsb) */ unsigned a = SkGetPackedA32(c); - // RBE: could load a table and do vtbl for these things - // DITHER_VALUE() masks x to 3 bits [0..7] before lookup, so can - // so 8x unrolling gets us perfectly aligned. - // and we could even avoid the vtbl at that point - /* d is 0..7 according to skia/core/SkDither.h asserts */ int d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)); unsigned sr = SkGetPackedR32(c); unsigned sg = SkGetPackedG32(c); unsigned sb = SkGetPackedB32(c); - - /* R and B handled identically; G is a little different */ - - /* sr - (sr>>5) means that +d can NOT overflow */ - /* do (sr-(sr>>5)), followed by adding d -- stay in 8 bits */ - /* sr = sr+d - (sr>>5) */ sr = SkDITHER_R32_FOR_565(sr, d); - /* calculate sr+(sr>>5) here, then add d */ - - /* sg = sg + (d>>1) - (sg>>6) */ sg = SkDITHER_G32_FOR_565(sg, d); - /* sg>>6 could be '3' and d>>1 is <= 3, so we're ok */ - /* calculate sg-(sg>>6), then add "d>>1" */ - - - /* sb = sb+d - (sb>>5) */ sb = SkDITHER_B32_FOR_565(sb, d); - /* calculate sb+(sb>>5) here, then add d */ - - /* been dealing in 8x8 through here; gonna have to go to 8x16 */ - - /* need to pick up 8 dst's -- at 16 bits each, 256 bits */ - /* extract dst into 8x16's */ - /* blend */ - /* shift */ - /* reassemble */ - uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); uint32_t dst_expanded = SkExpand_rgb_16(*dst); - - // would be shifted by 8, but the >>3 makes it be just 5 dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); // now src and dst expanded are in g:11 r:10 x:1 b:10 *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); } dst += 1; - /* RBE: a NOP with wide enough unrolling; wide_enough == 8 */ DITHER_INC_X(x); } while (--count != 0); } @@ -714,13 +867,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { S32_D565_Opaque_PROC, S32_D565_Blend_PROC, S32A_D565_Opaque_PROC, -#if 0 - // when the src-pixel is 0 (transparent), we are still affecting the dst - // so we're skipping this optimization for now S32A_D565_Blend_PROC, -#else - NULL, -#endif // dither NULL, // S32_D565_Opaque_Dither, -- 2.7.4