From dbe7f52412f55561dcc3a51fa3df2779c9a368bf Mon Sep 17 00:00:00 2001 From: "commit-bot@chromium.org" Date: Wed, 27 Nov 2013 17:08:36 +0000 Subject: [PATCH] ARM Skia NEON patches - 16/17 - Blitmask MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Blitmask: NEON optimised version of the D32_A8 functions Here are the microbenchmark results I got for the D32_A8 functions: Cortex-A9: ========== +-------+--------+--------+--------+ | count | Black | Opaque | Color | +-------+--------+--------+--------+ | 1 | -14% | -39,5% | -37,5% | +-------+--------+--------+--------+ | 2 | -3% | -29,9% | -25% | +-------+--------+--------+--------+ | 4 | -11,3% | -22% | -14,5% | +-------+--------+--------+--------+ | 8 | +128% | +66,6% | +105% | +-------+--------+--------+--------+ | 16 | +159% | +102% | +149% | +-------+--------+--------+--------+ | 64 | +189% | +136% | +189% | +-------+--------+--------+--------+ | 256 | +126% | +102% | +149% | +-------+--------+--------+--------+ | 1024 | +67,5% | +81,4% | +123% | +-------+--------+--------+--------+ Cortex-A15: =========== +-------+--------+--------+--------+ | count | Black | Opaque | Color | +-------+--------+--------+--------+ | 1 | -24% | -46,5% | -37,5% | +-------+--------+--------+--------+ | 2 | -18,5% | -35,5% | -28% | +-------+--------+--------+--------+ | 4 | -5,2% | -17,5% | -15,5% | +-------+--------+--------+--------+ | 8 | +72% | +65,8% | +84,7% | +-------+--------+--------+--------+ | 16 | +168% | +117% | +149% | +-------+--------+--------+--------+ | 64 | +165% | +110% | +145% | +-------+--------+--------+--------+ | 256 | +106% | +99,6% | +141% | +-------+--------+--------+--------+ | 1024 | +93,7% | +94,7% | +130% | +-------+--------+--------+--------+ Blitmask: add NEON optimised PlatformBlitRowProcs16 Here are the microbenchmark results (speedup vs. C code): +-------+-----------------+-----------------+ | | Cortex-A9 | Cortex-A15 | | count +--------+--------+--------+--------+ | | Blend | Opaque | Blend | Opaque | +-------+--------+--------+--------+--------+ | 1 | -19,2% | -36,7% | -33,6% | -44,7% | +-------+--------+--------+--------+--------+ | 2 | -12,6% | -27,8% | -39% | -48% | +-------+--------+--------+--------+--------+ | 4 | -11,5% | -21,6% | -37,7% | -44,3% | +-------+--------+--------+--------+--------+ | 8 | +141% | +59,7% | +123% | +48,7% | +-------+--------+--------+--------+--------+ | 16 | +213% | +119% | +214% | +121% | +-------+--------+--------+--------+--------+ | 64 | +212% | +105% | +242% | +167% | +-------+--------+--------+--------+--------+ | 256 | +289% | +167% | +249% | +207% | +-------+--------+--------+--------+--------+ | 1024 | +273% | +169% | +146% | +220% | +-------+--------+--------+--------+--------+ Signed-off-by: Kévin PETIT BUG= R=djsollen@google.com, mtklein@google.com, reed@google.com Author: kevin.petit.arm@gmail.com Review URL: https://codereview.chromium.org/23719002 git-svn-id: http://skia.googlecode.com/svn/trunk@12420 2bbb7eff-a529-9590-31e7-b0007b416f81 --- gyp/opts.gyp | 1 + src/opts/SkBlitMask_opts_arm.cpp | 27 +++- src/opts/SkBlitMask_opts_arm_neon.cpp | 255 ++++++++++++++++++++++++++++++++++ src/opts/SkBlitMask_opts_arm_neon.h | 16 +++ src/opts/SkColor_opts_neon.h | 17 +++ 5 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 src/opts/SkBlitMask_opts_arm_neon.cpp create mode 100644 src/opts/SkBlitMask_opts_arm_neon.h diff --git a/gyp/opts.gyp b/gyp/opts.gyp index 01bcdde..bf93926 100644 --- a/gyp/opts.gyp +++ b/gyp/opts.gyp @@ -177,6 +177,7 @@ '../src/opts/SkBitmapProcState_matrixProcs_neon.cpp', '../src/opts/SkBitmapProcState_matrix_clamp_neon.h', '../src/opts/SkBitmapProcState_matrix_repeat_neon.h', + '../src/opts/SkBlitMask_opts_arm_neon.cpp', '../src/opts/SkBlitRow_opts_arm_neon.cpp', '../src/opts/SkMorphology_opts_neon.cpp', '../src/opts/SkXfermode_opts_arm_neon.cpp', diff --git a/src/opts/SkBlitMask_opts_arm.cpp b/src/opts/SkBlitMask_opts_arm.cpp index 0ad0919..2bf7603 100644 --- a/src/opts/SkBlitMask_opts_arm.cpp +++ b/src/opts/SkBlitMask_opts_arm.cpp @@ -1,14 +1,39 @@ +#include "SkColor.h" +#include "SkColorPriv.h" #include "SkBlitMask.h" +#include "SkUtilsArm.h" +#include "SkBlitMask_opts_arm_neon.h" SkBlitMask::ColorProc SkBlitMask::PlatformColorProcs(SkBitmap::Config dstConfig, SkMask::Format maskFormat, SkColor color) { +#if SK_ARM_NEON_IS_NONE + return NULL; +#else +#if SK_ARM_NEON_IS_DYNAMIC + if (!sk_cpu_arm_has_neon()) { + return NULL; + } +#endif + if ((SkBitmap::kARGB_8888_Config == dstConfig) && + (SkMask::kA8_Format == maskFormat)) { + return D32_A8_Factory_neon(color); + } +#endif + + // We don't need to handle the SkMask::kLCD16_Format case as the default + // LCD16 will call us through SkBlitMask::PlatformBlitRowProcs16() + return NULL; } SkBlitMask::BlitLCD16RowProc SkBlitMask::PlatformBlitRowProcs16(bool isOpaque) { - return NULL; + if (isOpaque) { + return SK_ARM_NEON_WRAP(SkBlitLCD16OpaqueRow); + } else { + return SK_ARM_NEON_WRAP(SkBlitLCD16Row); + } } SkBlitMask::RowProc SkBlitMask::PlatformRowProcs(SkBitmap::Config dstConfig, diff --git a/src/opts/SkBlitMask_opts_arm_neon.cpp b/src/opts/SkBlitMask_opts_arm_neon.cpp new file mode 100644 index 0000000..7db6fcb --- /dev/null +++ b/src/opts/SkBlitMask_opts_arm_neon.cpp @@ -0,0 +1,255 @@ + +#include "SkBlitMask.h" +#include "SkColor_opts_neon.h" + +static void D32_A8_Black_neon(void* SK_RESTRICT dst, size_t dstRB, + const void* SK_RESTRICT maskPtr, size_t maskRB, + SkColor, int width, int height) { + SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; + const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; + + maskRB -= width; + dstRB -= (width << 2); + do { + int w = width; + while (w >= 8) { + uint8x8_t vmask = vld1_u8(mask); + uint16x8_t vscale = vsubw_u8(vdupq_n_u16(256), vmask); + uint8x8x4_t vdevice = vld4_u8((uint8_t*)device); + + vdevice = SkAlphaMulQ_neon8(vdevice, vscale); + vdevice.val[NEON_A] += vmask; + + vst4_u8((uint8_t*)device, vdevice); + + mask += 8; + device += 8; + w -= 8; + } + while (w-- > 0) { + unsigned aa = *mask++; + *device = (aa << SK_A32_SHIFT) + + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); + device += 1; + }; + device = (uint32_t*)((char*)device + dstRB); + mask += maskRB; + } while (--height != 0); +} + +template +static void D32_A8_Opaque_Color_neon(void* SK_RESTRICT dst, size_t dstRB, + const void* SK_RESTRICT maskPtr, size_t maskRB, + SkColor color, int width, int height) { + SkPMColor pmc = SkPreMultiplyColor(color); + SkPMColor* SK_RESTRICT device = (SkPMColor*)dst; + const uint8_t* SK_RESTRICT mask = (const uint8_t*)maskPtr; + uint8x8x4_t vpmc; + + maskRB -= width; + dstRB -= (width << 2); + + if (width >= 8) { + vpmc.val[NEON_A] = vdup_n_u8(SkGetPackedA32(pmc)); + vpmc.val[NEON_R] = vdup_n_u8(SkGetPackedR32(pmc)); + vpmc.val[NEON_G] = vdup_n_u8(SkGetPackedG32(pmc)); + vpmc.val[NEON_B] = vdup_n_u8(SkGetPackedB32(pmc)); + } + do { + int w = width; + while (w >= 8) { + uint8x8_t vmask = vld1_u8(mask); + uint16x8_t vscale, vmask256 = SkAlpha255To256_neon8(vmask); + if (isColor) { + vscale = vsubw_u8(vdupq_n_u16(256), + SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256)); + } else { + vscale = vsubw_u8(vdupq_n_u16(256), vmask); + } + uint8x8x4_t vdev = vld4_u8((uint8_t*)device); + + vdev.val[NEON_A] = SkAlphaMul_neon8(vpmc.val[NEON_A], vmask256) + + SkAlphaMul_neon8(vdev.val[NEON_A], vscale); + vdev.val[NEON_R] = SkAlphaMul_neon8(vpmc.val[NEON_R], vmask256) + + SkAlphaMul_neon8(vdev.val[NEON_R], vscale); + vdev.val[NEON_G] = SkAlphaMul_neon8(vpmc.val[NEON_G], vmask256) + + SkAlphaMul_neon8(vdev.val[NEON_G], vscale); + vdev.val[NEON_B] = SkAlphaMul_neon8(vpmc.val[NEON_B], vmask256) + + SkAlphaMul_neon8(vdev.val[NEON_B], vscale); + + vst4_u8((uint8_t*)device, vdev); + + mask += 8; + device += 8; + w -= 8; + } + + while (w--) { + unsigned aa = *mask++; + if (isColor) { + *device = SkBlendARGB32(pmc, *device, aa); + } else { + *device = SkAlphaMulQ(pmc, SkAlpha255To256(aa)) + + SkAlphaMulQ(*device, SkAlpha255To256(255 - aa)); + } + device += 1; + }; + + device = (uint32_t*)((char*)device + dstRB); + mask += maskRB; + + } while (--height != 0); +} + +static void D32_A8_Opaque_neon(void* SK_RESTRICT dst, size_t dstRB, + const void* SK_RESTRICT maskPtr, size_t maskRB, + SkColor color, int width, int height) { + D32_A8_Opaque_Color_neon(dst, dstRB, maskPtr, maskRB, color, width, height); +} + +static void D32_A8_Color_neon(void* SK_RESTRICT dst, size_t dstRB, + const void* SK_RESTRICT maskPtr, size_t maskRB, + SkColor color, int width, int height) { + D32_A8_Opaque_Color_neon(dst, dstRB, maskPtr, maskRB, color, width, height); +} + +SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color) { + if (SK_ColorBLACK == color) { + return D32_A8_Black_neon; + } else if (0xFF == SkColorGetA(color)) { + return D32_A8_Opaque_neon; + } else { + return D32_A8_Color_neon; + } +} + +//////////////////////////////////////////////////////////////////////////////// + +void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, + SkPMColor opaqueDst) { + int colR = SkColorGetR(color); + int colG = SkColorGetG(color); + int colB = SkColorGetB(color); + + uint8x8_t vcolR, vcolG, vcolB; + uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB; + + if (width >= 8) { + vcolR = vdup_n_u8(colR); + vcolG = vdup_n_u8(colG); + vcolB = vdup_n_u8(colB); + vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst)); + vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst)); + vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst)); + vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst)); + } + + while (width >= 8) { + uint8x8x4_t vdst; + uint16x8_t vmask; + uint16x8_t vmaskR, vmaskG, vmaskB; + uint8x8_t vsel_trans, vsel_opq; + + vdst = vld4_u8((uint8_t*)dst); + vmask = vld1q_u16(src); + + // Prepare compare masks + vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0))); + vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF))); + + // Get all the color masks on 5 bits + vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); + vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), + SK_B16_BITS + SK_R16_BITS + 1); + vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); + + // Upscale to 0..32 + vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); + vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); + vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); + + vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF)); + vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]); + + vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); + vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); + vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); + + vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]); + vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]); + vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]); + + vst4_u8((uint8_t*)dst, vdst); + + dst += 8; + src += 8; + width -= 8; + } + + // Leftovers + for (int i = 0; i < width; i++) { + dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i], + opaqueDst); + } +} + +void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, SkPMColor) { + int colA = SkColorGetA(color); + int colR = SkColorGetR(color); + int colG = SkColorGetG(color); + int colB = SkColorGetB(color); + + colA = SkAlpha255To256(colA); + + uint8x8_t vcolR, vcolG, vcolB; + uint16x8_t vcolA; + + if (width >= 8) { + vcolA = vdupq_n_u16(colA); + vcolR = vdup_n_u8(colR); + vcolG = vdup_n_u8(colG); + vcolB = vdup_n_u8(colB); + } + + while (width >= 8) { + uint8x8x4_t vdst; + uint16x8_t vmask; + uint16x8_t vmaskR, vmaskG, vmaskB; + + vdst = vld4_u8((uint8_t*)dst); + vmask = vld1q_u16(src); + + // Get all the color masks on 5 bits + vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT); + vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS), + SK_B16_BITS + SK_R16_BITS + 1); + vmaskB = vmask & vdupq_n_u16(SK_B16_MASK); + + // Upscale to 0..32 + vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4); + vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4); + vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4); + + vmaskR = vshrq_n_u16(vmaskR * vcolA, 8); + vmaskG = vshrq_n_u16(vmaskG * vcolA, 8); + vmaskB = vshrq_n_u16(vmaskB * vcolA, 8); + + vdst.val[NEON_A] = vdup_n_u8(0xFF); + vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR); + vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG); + vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB); + + vst4_u8((uint8_t*)dst, vdst); + + dst += 8; + src += 8; + width -= 8; + } + + for (int i = 0; i < width; i++) { + dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]); + } +} + diff --git a/src/opts/SkBlitMask_opts_arm_neon.h b/src/opts/SkBlitMask_opts_arm_neon.h new file mode 100644 index 0000000..fdbce14 --- /dev/null +++ b/src/opts/SkBlitMask_opts_arm_neon.h @@ -0,0 +1,16 @@ +#ifndef SkBlitMask_opts_arm_neon_DEFINED +#define SkBlitMask_opts_arm_neon_DEFINED + +#include "SkColor.h" +#include "SkBlitMask.h" + +extern SkBlitMask::ColorProc D32_A8_Factory_neon(SkColor color); + +extern void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, + SkPMColor opaqueDst); + +extern void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[], + SkColor color, int width, SkPMColor); + +#endif // #ifndef SkBlitMask_opts_arm_neon_DEFINED diff --git a/src/opts/SkColor_opts_neon.h b/src/opts/SkColor_opts_neon.h index f812397..85752f5 100644 --- a/src/opts/SkColor_opts_neon.h +++ b/src/opts/SkColor_opts_neon.h @@ -2,6 +2,7 @@ #define SkColor_opts_neon_DEFINED #include "SkTypes.h" +#include "SkColorPriv.h" #include @@ -65,4 +66,20 @@ static inline uint16x8_t SkPixel32ToPixel16_neon8(uint8x8x4_t vsrc) { return ret; } +/* This function blends 8 pixels of the same channel in the exact same way as + * SkBlend32. + */ +static inline uint8x8_t SkBlend32_neon8(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) { + int16x8_t src_wide, dst_wide; + + src_wide = vreinterpretq_s16_u16(vmovl_u8(src)); + dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst)); + + src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale); + + dst_wide += vshrq_n_s16(src_wide, 5); + + return vmovn_u16(vreinterpretq_u16_s16(dst_wide)); +} + #endif /* #ifndef SkColor_opts_neon_DEFINED */ -- 2.7.4