From 402448d6818cab9d7b7633a0c18fcf574c915357 Mon Sep 17 00:00:00 2001 From: mlee Date: Thu, 29 Jan 2015 06:22:41 -0800 Subject: [PATCH] skia: blend32_16_row for neon version MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This includes blend32_16_row neon implementation for aarch32 and aarch64. For performance, blend32_16_row is called in following tests in nanobench. - Xfermode_SrcOver - tablebench - rotated_rects_bw_alternating_transparent_and_opaque_srcover - rotated_rects_bw_changing_transparent_srcover - rotated_rects_bw_same_transparent_srcover - luma_colorfilter_large - luma_colorfilter_small - chart_bw I can see perf increase in following two tests, especially. For others, looks similar. For each, I tried to run two times. 1) Xfermode_SrcOver - D/skia ( 2000): 3M 57 17.3µs 17.4µs 17.4µs 17.7µs 1% █▃▂▃▂▂▂▁▃▂ 565 Xfermode_SrcOver - D/skia ( 1915): 3M 70 13.5µs 16.9µs 16.7µs 18.8µs 9% ▆█▄▅█▁▅▅▆▄ 565 Xfermode_SrcOver - D/skia ( 2000): 3M 8 11.6µs 11.8µs 12.1µs 14.4µs 7% ▃█▁▁▂▁▁▁▂▂ 565 Xfermode_SrcOver - D/skia ( 2004): 3M 62 10.3µs 12.9µs 13µs 15.2µs 11% █▅▅▆▁▅▅▅▇▃ 565 Xfermode_SrcOver 2) luma_colorfilter_large - D/skia ( 2000): 159M 8 136µs 136µs 136µs 139µs 1% █▃▁▂▁▁▁▁▁▁ 565 luma_colorfilter_large - D/skia ( 1915): 158M 2 135µs 177µs 182µs 269µs 22% ▆▃█▁▁▃▃▃▃▃ 565 luma_colorfilter_large - D/skia ( 2000): 157M 5 84.2µs 85.3µs 87.5µs 110µs 9% █▁▂▁▁▁▁▁▁▁ 565 luma_colorfilter_large - D/skia ( 2004): 159M 6 84.7µs 110µs 112µs 144µs 18% █▄▇▁▁▄▃▄▄▆ 565 luma_colorfilter_large Review URL: https://codereview.chromium.org/847363002 --- src/core/SkBlitter_RGB16.cpp | 39 +++++------ src/opts/SkBlitRow_opts_arm.cpp | 9 ++- src/opts/SkBlitRow_opts_arm_neon.cpp | 131 +++++++++++++++++++++++++++++++++++ src/opts/SkBlitRow_opts_arm_neon.h | 1 + 4 files changed, 157 insertions(+), 23 deletions(-) diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp index d9193ec..306c95b 100644 --- a/src/core/SkBlitter_RGB16.cpp +++ b/src/core/SkBlitter_RGB16.cpp @@ -77,6 +77,8 @@ protected: uint16_t fRawDither16; // unscaled SkBool8 fDoDither; + SkBlitRow::ColorProc16 fColorProc16; + // illegal SkRGB16_Blitter& operator=(const SkRGB16_Blitter&); @@ -544,6 +546,19 @@ SkRGB16_Blitter::SkRGB16_Blitter(const SkBitmap& device, const SkPaint& paint) fColor16 = SkPackRGB16( SkAlphaMul(r, fScale) >> (8 - SK_R16_BITS), SkAlphaMul(g, fScale) >> (8 - SK_G16_BITS), SkAlphaMul(b, fScale) >> (8 - SK_B16_BITS)); + + // compute SkBlitRow::Procs + unsigned flags = 0; + + if (SkGetPackedA32(fSrcColor32) < 0xFF) { + flags |= SkBlitRow::kSrcPixelAlpha_Flag; + } + + if (fDoDither) { + flags |= SkBlitRow::kDither_Flag; + } + + fColorProc16 = SkBlitRow::ColorFactory16(flags); } const SkBitmap* SkRGB16_Blitter::justAnOpaqueColor(uint32_t* value) { @@ -554,31 +569,12 @@ const SkBitmap* SkRGB16_Blitter::justAnOpaqueColor(uint32_t* value) { return NULL; } -static uint32_t pmcolor_to_expand16(SkPMColor c) { - unsigned r = SkGetPackedR32(c); - unsigned g = SkGetPackedG32(c); - unsigned b = SkGetPackedB32(c); - return (g << 24) | (r << 13) | (b << 2); -} - -static inline void blend32_16_row(SkPMColor src, uint16_t dst[], int count) { - SkASSERT(count > 0); - uint32_t src_expand = pmcolor_to_expand16(src); - unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; - do { - uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; - *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); - dst += 1; - } while (--count != 0); -} - void SkRGB16_Blitter::blitH(int x, int y, int width) { SkASSERT(width > 0); SkASSERT(x + width <= fDevice.width()); uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y); - // TODO: respect fDoDither - blend32_16_row(fSrcColor32, device, width); + fColorProc16(device, fSrcColor32, width, x, y); } void SkRGB16_Blitter::blitAntiH(int x, int y, @@ -681,10 +677,9 @@ void SkRGB16_Blitter::blitRect(int x, int y, int width, int height) { SkASSERT(x + width <= fDevice.width() && y + height <= fDevice.height()); uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y); size_t deviceRB = fDevice.rowBytes(); - SkPMColor src32 = fSrcColor32; while (--height >= 0) { - blend32_16_row(src32, device, width); + fColorProc16(device, fSrcColor32, width, x, y); device = (uint16_t*)((char*)device + deviceRB); } } diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 9a8cfc4..9dc9f3b 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -364,6 +364,13 @@ static const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm[] = { NULL, // S32A_D565_Blend_Dither }; +static const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm[] = { + NULL, // Color32_D565, + NULL, // Color32A_D565, + NULL, // Color32_D565_Dither, + NULL, // Color32A_D565_Dither +}; + static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = { NULL, // S32_Opaque, NULL, // S32_Blend, @@ -378,7 +385,7 @@ SkBlitRow::Proc16 SkBlitRow::PlatformFactory565(unsigned flags) { } SkBlitRow::ColorProc16 SkBlitRow::PlatformColorFactory565(unsigned flags) { - return NULL; + return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_colorprocs_arm)[flags]; } SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) { diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index a1ff172..8981fd7 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -465,6 +465,130 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, } #endif // #ifdef SK_CPU_ARM32 +static uint32_t pmcolor_to_expand16(SkPMColor c) { + unsigned r = SkGetPackedR32(c); + unsigned g = SkGetPackedG32(c); + unsigned b = SkGetPackedB32(c); + return (g << 24) | (r << 13) | (b << 2); +} + +void Color32A_D565_neon(uint16_t dst[], SkPMColor src, int count, int x, int y) { + uint32_t src_expand; + unsigned scale; + uint16x8_t vmask_blue; + + if (count <= 0) return; + SkASSERT(((size_t)dst & 0x01) == 0); + + /* + * This preamble code is in order to make dst aligned to 8 bytes + * in the next mutiple bytes read & write access. + */ + src_expand = pmcolor_to_expand16(src); + scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3; + +#define DST_ALIGN 8 + + /* + * preamble_size is in byte, meantime, this blend32_16_row_neon updates 2 bytes at a time. + */ + int preamble_size = (DST_ALIGN - (size_t)dst) & (DST_ALIGN - 1); + + for (int i = 0; i < preamble_size; i+=2, dst++) { + uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; + *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); + if (--count == 0) + break; + } + + int count16 = 0; + count16 = count >> 4; + vmask_blue = vmovq_n_u16(SK_B16_MASK); + + if (count16) { + uint16x8_t wide_sr; + uint16x8_t wide_sg; + uint16x8_t wide_sb; + uint16x8_t wide_256_sa; + + unsigned sr = SkGetPackedR32(src); + unsigned sg = SkGetPackedG32(src); + unsigned sb = SkGetPackedB32(src); + unsigned sa = SkGetPackedA32(src); + + // Operation: dst_rgb = src_rgb + ((256 - src_a) >> 3) x dst_rgb + // sr: 8-bit based, dr: 5-bit based, with dr x ((256-sa)>>3), 5-bit left shifted, + //thus, for sr, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5) + wide_sr = vshlq_n_u16(vmovl_u8(vdup_n_u8(sr)), 2); // widen and src_red shift + + // sg: 8-bit based, dg: 6-bit based, with dg x ((256-sa)>>3), 5-bit left shifted, + //thus, for sg, do 3-bit left shift to match MSB : (8 + 3 = 6 + 5) + wide_sg = vshlq_n_u16(vmovl_u8(vdup_n_u8(sg)), 3); // widen and src_grn shift + + // sb: 8-bit based, db: 5-bit based, with db x ((256-sa)>>3), 5-bit left shifted, + //thus, for sb, do 2-bit left shift to match MSB : (8 + 2 = 5 + 5) + wide_sb = vshlq_n_u16(vmovl_u8(vdup_n_u8(sb)), 2); // widen and src blu shift + + wide_256_sa = + vshrq_n_u16(vsubw_u8(vdupq_n_u16(256), vdup_n_u8(sa)), 3); // (256 - sa) >> 3 + + while (count16-- > 0) { + uint16x8_t vdst1, vdst1_r, vdst1_g, vdst1_b; + uint16x8_t vdst2, vdst2_r, vdst2_g, vdst2_b; + vdst1 = vld1q_u16(dst); + dst += 8; + vdst2 = vld1q_u16(dst); + dst -= 8; //to store dst again. + + vdst1_g = vshlq_n_u16(vdst1, SK_R16_BITS); // shift green to top of lanes + vdst1_b = vdst1 & vmask_blue; // extract blue + vdst1_r = vshrq_n_u16(vdst1, SK_R16_SHIFT); // extract red + vdst1_g = vshrq_n_u16(vdst1_g, SK_R16_BITS + SK_B16_BITS); // extract green + + vdst2_g = vshlq_n_u16(vdst2, SK_R16_BITS); // shift green to top of lanes + vdst2_b = vdst2 & vmask_blue; // extract blue + vdst2_r = vshrq_n_u16(vdst2, SK_R16_SHIFT); // extract red + vdst2_g = vshrq_n_u16(vdst2_g, SK_R16_BITS + SK_B16_BITS); // extract green + + vdst1_r = vmlaq_u16(wide_sr, wide_256_sa, vdst1_r); // sr + (256-sa) x dr1 + vdst1_g = vmlaq_u16(wide_sg, wide_256_sa, vdst1_g); // sg + (256-sa) x dg1 + vdst1_b = vmlaq_u16(wide_sb, wide_256_sa, vdst1_b); // sb + (256-sa) x db1 + + vdst2_r = vmlaq_u16(wide_sr, wide_256_sa, vdst2_r); // sr + (256-sa) x dr2 + vdst2_g = vmlaq_u16(wide_sg, wide_256_sa, vdst2_g); // sg + (256-sa) x dg2 + vdst2_b = vmlaq_u16(wide_sb, wide_256_sa, vdst2_b); // sb + (256-sa) x db2 + + vdst1_r = vshrq_n_u16(vdst1_r, 5); // 5-bit right shift for 5-bit red + vdst1_g = vshrq_n_u16(vdst1_g, 5); // 5-bit right shift for 6-bit green + vdst1_b = vshrq_n_u16(vdst1_b, 5); // 5-bit right shift for 5-bit blue + + vdst1 = vsliq_n_u16(vdst1_b, vdst1_g, SK_G16_SHIFT); // insert green into blue + vdst1 = vsliq_n_u16(vdst1, vdst1_r, SK_R16_SHIFT); // insert red into green/blue + + vdst2_r = vshrq_n_u16(vdst2_r, 5); // 5-bit right shift for 5-bit red + vdst2_g = vshrq_n_u16(vdst2_g, 5); // 5-bit right shift for 6-bit green + vdst2_b = vshrq_n_u16(vdst2_b, 5); // 5-bit right shift for 5-bit blue + + vdst2 = vsliq_n_u16(vdst2_b, vdst2_g, SK_G16_SHIFT); // insert green into blue + vdst2 = vsliq_n_u16(vdst2, vdst2_r, SK_R16_SHIFT); // insert red into green/blue + + vst1q_u16(dst, vdst1); + dst += 8; + vst1q_u16(dst, vdst2); + dst += 8; + } + } + + count &= 0xF; + if (count > 0) { + do { + uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale; + *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5); + dst += 1; + } while (--count != 0); + } +} + static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { prod += vdupq_n_u16(128); prod += vshrq_n_u16(prod, 8); @@ -1665,6 +1789,13 @@ const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[] = { NULL, // S32A_D565_Blend_Dither }; +const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[] = { + Color32A_D565_neon, // Color32_D565, + Color32A_D565_neon, // Color32A_D565, + Color32A_D565_neon, // Color32_D565_Dither, + Color32A_D565_neon, // Color32A_D565_Dither +}; + const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { NULL, // S32_Opaque, S32_Blend_BlitRow32_neon, // S32_Blend, diff --git a/src/opts/SkBlitRow_opts_arm_neon.h b/src/opts/SkBlitRow_opts_arm_neon.h index 95825f7..92d58ce 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.h +++ b/src/opts/SkBlitRow_opts_arm_neon.h @@ -10,6 +10,7 @@ #include "SkBlitRow.h" extern const SkBlitRow::Proc16 sk_blitrow_platform_565_procs_arm_neon[]; +extern const SkBlitRow::ColorProc16 sk_blitrow_platform_565_colorprocs_arm_neon[]; extern const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[]; extern void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, -- 2.7.4