From 0060159457453ca45a47828648c8f29d5695983c Mon Sep 17 00:00:00 2001 From: "commit-bot@chromium.org" Date: Fri, 20 Sep 2013 15:38:49 +0000 Subject: [PATCH] ARM Skia NEON patches - 21 - new NEON S32_D565_Opaque MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit BlitRow565: NEON version of S32_D565_Opaque Here's a new implementation of S32_D565_Opaque in NEON. It improves dramatically the speed compared to S32A_D565_Opaque. Here are the benchmark results (speedup vs. existing NEON): +-------+-----------+------------+ | count | Cortex-A9 | Cortex-A15 | +-------+-----------+------------+ | 1 | +130% | +139% | +-------+-----------+------------+ | 2 | +65,2% | +51% | +-------+-----------+------------+ | 4 | -25,5% | +10,2% | +-------+-----------+------------+ | 8 | +63,8% | +32,1% | +-------+-----------+------------+ | 16 | +110% | +49,2% | +-------+-----------+------------+ | 64 | +153% | +123,5% | +-------+-----------+------------+ | 256 | +151% | +144,7% | +-------+-----------+------------+ | 1024 | +272% | +157,2% | +-------+-----------+------------+ Signed-off-by: Kévin PETIT BUG= R=djsollen@google.com, mtklein@google.com Author: kevin.petit.arm@gmail.com Review URL: https://chromiumcodereview.appspot.com/22351006 git-svn-id: http://skia.googlecode.com/svn/trunk@11415 2bbb7eff-a529-9590-31e7-b0007b416f81 --- src/opts/SkBlitRow_opts_arm_neon.cpp | 46 ++++++++++++++++++++++++++++++++---- src/opts/SkColor_opts_neon.h | 12 ++++++++++ 2 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 src/opts/SkColor_opts_neon.h diff --git a/src/opts/SkBlitRow_opts_arm_neon.cpp b/src/opts/SkBlitRow_opts_arm_neon.cpp index 705ee99..ffa0a8b 100644 --- a/src/opts/SkBlitRow_opts_arm_neon.cpp +++ b/src/opts/SkBlitRow_opts_arm_neon.cpp @@ -15,9 +15,45 @@ #include "SkUtils.h" #include "SkCachePreload_arm.h" - +#include "SkColor_opts_neon.h" #include +void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT 
src, int count, + U8CPU alpha, int /*x*/, int /*y*/) { + SkASSERT(255 == alpha); + + while (count >= 8) { + uint8x8x4_t vsrc; + uint16x8_t vdst; + + // Load 8 source pixels, de-interleaved into four per-channel byte vectors + vsrc = vld4_u8((uint8_t*)src); + + // Convert src to 565: widen R into the high byte of each 16-bit lane, then shift-insert G and B below it + vdst = vshll_n_u8(vsrc.val[NEON_R], 8); + vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_G], 8), 5); + vdst = vsriq_n_u16(vdst, vshll_n_u8(vsrc.val[NEON_B], 8), 5+6); + + // Store the 8 packed 565 pixels + vst1q_u16(dst, vdst); + + // Prepare next iteration: advance both pointers by the 8 pixels just handled + dst += 8; + src += 8; + count -= 8; + }; + + // Leftovers: fewer than 8 pixels remain, converted one at a time + while (count > 0) { + SkPMColor c = *src++; + SkPMColorAssert(c); + *dst = SkPixel32ToPixel16_ToU16(c); + dst++; + count--; + }; +} + void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, U8CPU alpha, int /*x*/, int /*y*/) { @@ -1330,10 +1366,10 @@ void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { // no dither - // NOTE: For the two functions below, we don't have a special version - // that assumes that each source pixel is opaque. But our S32A is - // still faster than the default, so use it. - S32A_D565_Opaque_neon, // really S32_D565_Opaque + // NOTE: For the S32_D565_Blend function below, we don't have a special + // version that assumes that each source pixel is opaque. But our + // S32A is still faster than the default, so use it. + S32_D565_Opaque_neon, S32A_D565_Blend_neon, // really S32_D565_Blend S32A_D565_Opaque_neon, S32A_D565_Blend_neon, diff --git a/src/opts/SkColor_opts_neon.h b/src/opts/SkColor_opts_neon.h new file mode 100644 index 0000000..adc2641 --- /dev/null +++ b/src/opts/SkColor_opts_neon.h @@ -0,0 +1,12 @@ +#ifndef SkColor_opts_neon_DEFINED +#define SkColor_opts_neon_DEFINED + +#include "SkTypes.h" + +#define NEON_A (SK_A32_SHIFT / 8) +#define NEON_R (SK_R32_SHIFT / 8) +#define NEON_G (SK_G32_SHIFT / 8) +#define NEON_B (SK_B32_SHIFT / 8) + +#endif /* #ifndef SkColor_opts_neon_DEFINED */ + -- 2.7.4