From b577b41c8ec26c21ae599e80a2707d42f03eaa60 Mon Sep 17 00:00:00 2001 From: "reed@android.com" Date: Tue, 27 Oct 2009 17:49:32 +0000 Subject: [PATCH] add shaderproc32 for index bitmaps. neon version of 32->16+dither git-svn-id: http://skia.googlecode.com/svn/trunk@408 2bbb7eff-a529-9590-31e7-b0007b416f81 --- src/core/SkBitmapProcState.cpp | 21 +++++ src/core/SkBitmapProcState_shaderproc.h | 10 ++ src/opts/SkBlitRow_opts_arm.cpp | 116 +++++++++++++++++++++++- 3 files changed, 146 insertions(+), 1 deletion(-) diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp index 600b963d9d..eabd9665e1 100644 --- a/src/core/SkBitmapProcState.cpp +++ b/src/core/SkBitmapProcState.cpp @@ -289,6 +289,8 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, } while (0) +// clamp + #define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) #define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) #define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) @@ -314,6 +316,23 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, #define SRC_TO_FILTER(src) src #include "SkBitmapProcState_shaderproc.h" + +#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) +#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) +#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) +#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) + +#undef FILTER_PROC +#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_opaque(x, y, a, b, c, d, dst) +#define MAKENAME(suffix) Clamp_SI8_opaque_D32 ## suffix +#define SRCTYPE uint8_t +#define DSTTYPE uint32_t +#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kIndex8_Config) +#define PREAMBLE(state) const SkPMColor* SK_RESTRICT table = state.fBitmap->getColorTable()->lockColors() +#define SRC_TO_FILTER(src) table[src] +#define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors(false) +#include "SkBitmapProcState_shaderproc.h" + /////////////////////////////////////////////////////////////////////////////// static bool valid_for_filtering(unsigned dimension) { @@ -503,6 +522,8 @@ bool SkBitmapProcState::chooseProcs(const SkMatrix& inv, const SkPaint& paint) { SkShader::kRepeat_TileMode == fTileModeY) { fShaderProc16 = Repeat_S16_D16_filter_DX_shaderproc; } + } else if (SI8_opaque_D32_filter_DX == fSampleProc32 && clamp_clamp) { + fShaderProc32 = Clamp_SI8_opaque_D32_filter_DX_shaderproc; } // see if our platform has any accelerated overrides diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h index b4a53e4006..15831b67bf 100644 --- a/src/core/SkBitmapProcState_shaderproc.h +++ b/src/core/SkBitmapProcState_shaderproc.h @@ -36,6 +36,10 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y, fx = SkScalarToFixed(pt.fX) - (oneX >> 1); } +#ifdef PREAMBLE + PREAMBLE(s); +#endif + do { unsigned subX = TILEX_LOW_BITS(fx, maxX); unsigned x0 = TILEX_PROCF(fx, maxX); @@ -51,6 +55,10 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y, fx += dx; } while (--count != 0); + +#ifdef POSTAMBLE + POSTAMBLE(s); +#endif } /////////////////////////////////////////////////////////////////////////////// @@ -65,5 +73,7 @@ static void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y, #undef CHECKSTATE #undef SRC_TO_FILTER #undef FILTER_TO_DST +#undef PREAMBLE +#undef POSTAMBLE #undef SCALE_FILTER_NAME diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 506c4d407e..be1cbdf2b9 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -862,6 +862,120 @@ static void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, /////////////////////////////////////////////////////////////////////////////// +#if defined(__ARM_HAVE_NEON) && defined(SK_CPU_LENDIAN) +/* 2009/10/27: RBE says "a work in progress"; debugging says ok; + * speedup untested, but ARM version is 26 insns/iteration and + * this NEON version is 21 insns/iteration-of-8 (2.62insns/element) + * which is 10x the native version; that's pure instruction counts, + * not accounting for any instruction or memory latencies. + */ + +#undef DEBUG_S32_OPAQUE_DITHER + +static void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, U8CPU alpha, int x, int y) { + SkASSERT(255 == alpha); + +#define UNROLL 8 + if (count >= UNROLL) { + uint8x8_t d; + const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; + d = vld1_u8(dstart); + + while (count >= UNROLL) { + uint8x8_t sr, sg, sb, sa; + uint16x8_t dr, dg, db, da; + uint16x8_t dst8; + + /* source is in ABGR ordering (R == lsb) */ + { + register uint8x8_t d0 asm("d0"); + register uint8x8_t d1 asm("d1"); + register uint8x8_t d2 asm("d2"); + register uint8x8_t d3 asm("d3"); + + asm ("vld4.8 {d0-d3},[%4] /* r=%P0 g=%P1 b=%P2 a=%P3 */" + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3) + : "r" (src) + ); + sr = d0; sg = d1; sb = d2; sa = d3; + } + /* XXX: if we want to prefetch, hide it in the above asm() + * using the gcc __builtin_prefetch(), the prefetch will + * fall to the bottom of the loop -- it won't stick up + * at the top of the loop, just after the vld4. + */ + + /* sr = sr - (sr>>5) + d */ + sr = vsub_u8(sr, vshr_n_u8(sr, 5)); + dr = vaddl_u8(sr, d); + + /* sb = sb - (sb>>5) + d */ + sb = vsub_u8(sb, vshr_n_u8(sb, 5)); + db = vaddl_u8(sb, d); + + /* sg = sg - (sg>>6) + d>>1; similar logic for overflows */ + sg = vsub_u8(sg, vshr_n_u8(sg, 6)); + dg = vaddl_u8(sg, vshr_n_u8(d,1)); + /* XXX: check that the "d>>1" here is hoisted */ + + /* pack high bits of each into 565 format (rgb, b is lsb) */ + dst8 = vshrq_n_u16(db, 3); + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr,3), 11); + + /* store it */ + vst1q_u16(dst, dst8); + +#if defined(DEBUG_S32_OPAQUE_DITHER) + /* always good to know if we generated good results */ + { + int i, myx = x, myy = y; + DITHER_565_SCAN(myy); + for (i=0;i 0) { + DITHER_565_SCAN(y); + do { + SkPMColor c = *src++; + SkPMColorAssert(c); + SkASSERT(SkGetPackedA32(c) == 255); + + unsigned dither = DITHER_VALUE(x); + *dst++ = SkDitherRGB32To565(c, dither); + DITHER_INC_X(x); + } while (--count != 0); + } +} + +#define S32_D565_Opaque_Dither_PROC S32_D565_Opaque_Dither_neon +#else +#define S32_D565_Opaque_Dither_PROC NULL +#endif + +/////////////////////////////////////////////////////////////////////////////// + const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { // no dither S32_D565_Opaque_PROC, @@ -870,7 +984,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_565_Procs[] = { S32A_D565_Blend_PROC, // dither - NULL, // S32_D565_Opaque_Dither, + S32_D565_Opaque_Dither_PROC, S32_D565_Blend_Dither_PROC, S32A_D565_Opaque_Dither_PROC, NULL, // S32A_D565_Blend_Dither -- 2.34.1