From be233d63ca015c2991f4fe0802e4a31a71642062 Mon Sep 17 00:00:00 2001 From: "commit-bot@chromium.org" Date: Thu, 13 Feb 2014 18:37:35 +0000 Subject: [PATCH] ARM Skia NEON patches - 27 - S32A_D565_Blend MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit BlitRow565: new intrinsics version of S32A_D565_Blend This new version is basically a rewrite of the existing code with a few speed and accuracy improvements. There is a switch to enable pixel-perfect results at the cost of a (quite big) decrease in performance (disabled in this patch). Here are the benchmark results (speedup vs. existing code): +-------+------------+------------+ | count | Cortex-A9 | Cortex-A15 | +-------+------------+------------+ | 1 | +103.6% | +12% | +-------+------------+------------+ | 2 | +3.6% | +21.6% | +-------+------------+------------+ | 4 | +0.8% | -0.8% | +-------+------------+------------+ | 8 | +3.9% | -1% | +-------+------------+------------+ | 16 | +14.7% | +5.7% | +-------+------------+------------+ | 64 | +18.1% | +13.2% | +-------+------------+------------+ | 256 | +16.3% | +27.4% | +-------+------------+------------+ | 1024 | +78.2% | +17.4% | +-------+------------+------------+ Signed-off-by: Kévin PETIT BUG=skia: R=djsollen@google.com, mtklein@google.com, halcanary@google.com Author: kevin.petit@arm.com Review URL: https://codereview.chromium.org/156113005 git-svn-id: http://skia.googlecode.com/svn/trunk@13438 2bbb7eff-a529-9590-31e7-b0007b416f81 --- expectations/gm/ignored-tests.txt | 5 + src/opts/SkBlitRow_opts_arm_neon.cpp | 198 ++++++++++++++++++----------------- 2 files changed, 109 insertions(+), 94 deletions(-) diff --git a/expectations/gm/ignored-tests.txt b/expectations/gm/ignored-tests.txt index 5bfc6a4..4c0e834 100644 --- a/expectations/gm/ignored-tests.txt +++ b/expectations/gm/ignored-tests.txt @@ -37,3 +37,8 @@ # deprecated calling pattern. 
# https://codereview.chromium.org/154163002/ extractbitmap + +# Added by kevin.petit@arm.com for https://codereview.chromium.org/156113005/ +shadertext3 +gradients_view_perspective +drawbitmaprect diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index 672980d..1de1a20 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -230,113 +230,123 @@ void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, } } +static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { + prod += vdupq_n_u16(128); + prod += vshrq_n_u16(prod, 8); + return vshrq_n_u16(prod, 8); +} + void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 > alpha); - U8CPU alpha_for_asm = alpha; - - asm volatile ( - /* This code implements a Neon version of S32A_D565_Blend. The output differs from - * the original in two respects: - * 1. The results have a few mismatches compared to the original code. These mismatches - * never exceed 1. It's possible to improve accuracy vs. a floating point - * implementation by introducing rounding right shifts (vrshr) for the final stage. - * Rounding is not present in the code below, because although results would be closer - * to a floating point implementation, the number of mismatches compared to the - * original code would be far greater. - * 2. On certain inputs, the original code can overflow, causing colour channels to - * mix. Although the Neon code can also overflow, it doesn't allow one colour channel - * to affect another. + /* This code implements a Neon version of S32A_D565_Blend. The results have + * a few mismatches compared to the original code. These mismatches never + * exceed 1. 
*/ -#if 1 - /* reflects SkAlpha255To256()'s change from a+a>>7 to a+1 */ - "add %[alpha], %[alpha], #1 \n\t" // adjust range of alpha 0-256 -#else - "add %[alpha], %[alpha], %[alpha], lsr #7 \n\t" // adjust range of alpha 0-256 -#endif - "vmov.u16 q3, #255 \n\t" // set up constant - "movs r4, %[count], lsr #3 \n\t" // calc. count>>3 - "vmov.u16 d2[0], %[alpha] \n\t" // move alpha to Neon - "beq 2f \n\t" // if count8 == 0, exit - "vmov.u16 q15, #0x1f \n\t" // set up blue mask - - "1: \n\t" - "vld1.u16 {d0, d1}, [%[dst]] \n\t" // load eight dst RGB565 pixels - "subs r4, r4, #1 \n\t" // decrement loop counter - "vld4.u8 {d24, d25, d26, d27}, [%[src]]! \n\t" // load eight src ABGR32 pixels - // and deinterleave - - "vshl.u16 q9, q0, #5 \n\t" // shift green to top of lanes - "vand q10, q0, q15 \n\t" // extract blue - "vshr.u16 q8, q0, #11 \n\t" // extract red - "vshr.u16 q9, q9, #10 \n\t" // extract green - // dstrgb = {q8, q9, q10} - - "vshr.u8 d24, d24, #3 \n\t" // shift red to 565 range - "vshr.u8 d25, d25, #2 \n\t" // shift green to 565 range - "vshr.u8 d26, d26, #3 \n\t" // shift blue to 565 range - - "vmovl.u8 q11, d24 \n\t" // widen red to 16 bits - "vmovl.u8 q12, d25 \n\t" // widen green to 16 bits - "vmovl.u8 q14, d27 \n\t" // widen alpha to 16 bits - "vmovl.u8 q13, d26 \n\t" // widen blue to 16 bits - // srcrgba = {q11, q12, q13, q14} - - "vmul.u16 q2, q14, d2[0] \n\t" // sa * src_scale - "vmul.u16 q11, q11, d2[0] \n\t" // red result = src_red * src_scale - "vmul.u16 q12, q12, d2[0] \n\t" // grn result = src_grn * src_scale - "vmul.u16 q13, q13, d2[0] \n\t" // blu result = src_blu * src_scale - - "vshr.u16 q2, q2, #8 \n\t" // sa * src_scale >> 8 - "vsub.u16 q2, q3, q2 \n\t" // 255 - (sa * src_scale >> 8) - // dst_scale = q2 - - "vmla.u16 q11, q8, q2 \n\t" // red result += dst_red * dst_scale - "vmla.u16 q12, q9, q2 \n\t" // grn result += dst_grn * dst_scale - "vmla.u16 q13, q10, q2 \n\t" // blu result += dst_blu * dst_scale + if (count >= 8) { + uint16x8_t 
valpha_max, vmask_blue; + uint8x8_t valpha; -#if 1 - // trying for a better match with SkDiv255Round(a) - // C alg is: a+=128; (a+a>>8)>>8 - // we'll use just a rounding shift [q2 is available for scratch] - "vrshr.u16 q11, q11, #8 \n\t" // shift down red - "vrshr.u16 q12, q12, #8 \n\t" // shift down green - "vrshr.u16 q13, q13, #8 \n\t" // shift down blue + // prepare constants + valpha_max = vmovq_n_u16(255); + valpha = vdup_n_u8(alpha); + vmask_blue = vmovq_n_u16(SK_B16_MASK); + + do { + uint16x8_t vdst, vdst_r, vdst_g, vdst_b; + uint16x8_t vres_a, vres_r, vres_g, vres_b; + uint8x8x4_t vsrc; + + // load pixels + vdst = vld1q_u16(dst); +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) + asm ( + "vld4.u8 %h[vsrc], [%[src]]!" + : [vsrc] "=w" (vsrc), [src] "+&r" (src) + : : + ); #else - // arm's original "truncating divide by 256" - "vshr.u16 q11, q11, #8 \n\t" // shift down red - "vshr.u16 q12, q12, #8 \n\t" // shift down green - "vshr.u16 q13, q13, #8 \n\t" // shift down blue + register uint8x8_t d0 asm("d0"); + register uint8x8_t d1 asm("d1"); + register uint8x8_t d2 asm("d2"); + register uint8x8_t d3 asm("d3"); + + asm volatile ( + "vld4.u8 {d0-d3},[%[src]]!;" + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), + [src] "+&r" (src) + : : + ); + vsrc.val[0] = d0; + vsrc.val[1] = d1; + vsrc.val[2] = d2; + vsrc.val[3] = d3; #endif - "vsli.u16 q13, q12, #5 \n\t" // insert green into blue - "vsli.u16 q13, q11, #11 \n\t" // insert red into green/blue - "vst1.16 {d26, d27}, [%[dst]]! 
\n\t" // write pixel back to dst, update ptr - "bne 1b \n\t" // if counter != 0, loop - "2: \n\t" // exit + // deinterleave dst + vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes + vdst_b = vdst & vmask_blue; // extract blue + vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red + vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green + + // shift src to 565 + vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); + vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS); + vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS); + + // calc src * src_scale + vres_a = vmull_u8(vsrc.val[NEON_A], valpha); + vres_r = vmull_u8(vsrc.val[NEON_R], valpha); + vres_g = vmull_u8(vsrc.val[NEON_G], valpha); + vres_b = vmull_u8(vsrc.val[NEON_B], valpha); + + // prepare dst_scale + vres_a = SkDiv255Round_neon8(vres_a); + vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255 + + // add dst * dst_scale to previous result + vres_r = vmlaq_u16(vres_r, vdst_r, vres_a); + vres_g = vmlaq_u16(vres_g, vdst_g, vres_a); + vres_b = vmlaq_u16(vres_b, vdst_b, vres_a); + +#ifdef S32A_D565_BLEND_EXACT + // It is possible to get exact results with this but it is slow, + // even slower than C code in some cases + vres_r = SkDiv255Round_neon8(vres_r); + vres_g = SkDiv255Round_neon8(vres_g); + vres_b = SkDiv255Round_neon8(vres_b); +#else + vres_r = vrshrq_n_u16(vres_r, 8); + vres_g = vrshrq_n_u16(vres_g, 8); + vres_b = vrshrq_n_u16(vres_b, 8); +#endif + // pack result + vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue + vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue - : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count), [alpha] "+r" (alpha_for_asm) - : - : "cc", "memory", "r4", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" - ); + // store 
+ vst1q_u16(dst, vres_b); + dst += 8; + count -= 8; + } while (count >= 8); + } - count &= 7; - if (count > 0) { - do { - SkPMColor sc = *src++; - if (sc) { - uint16_t dc = *dst; - unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); - unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); - unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); - unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); - *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); - } - dst += 1; - } while (--count != 0); + // leftovers + while (count-- > 0) { + SkPMColor sc = *src++; + if (sc) { + uint16_t dc = *dst; + unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); + unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); + unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); + unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); + *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); + } + dst += 1; } } -- 2.7.4