From 3bc2624a4b89c49efd65f5e548ac5f2dd9351431 Mon Sep 17 00:00:00 2001 From: mtklein Date: Wed, 17 Feb 2016 14:21:28 -0800 Subject: [PATCH] try plain-old code for sk_memset16/32 now that NEON is compile-time Most of these implementations now just say "always inline". Let's see if we can get away with the simplicity of doing that all the time. These inlined implementations can autovectorize easily. BUG=skia: GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1639863002 CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot Review URL: https://codereview.chromium.org/1639863002 --- include/core/SkUtils.h | 42 +++--------------- src/core/SkOpts.cpp | 3 -- src/core/SkOpts.h | 4 -- src/opts/SkOpts_neon.cpp | 3 -- src/opts/SkUtils_opts.h | 110 ----------------------------------------------- 5 files changed, 6 insertions(+), 156 deletions(-) delete mode 100644 src/opts/SkUtils_opts.h diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h index 2b390f0..b5674ec 100644 --- a/include/core/SkUtils.h +++ b/include/core/SkUtils.h @@ -10,36 +10,15 @@ #include "SkTypes.h" -namespace SkOpts { - extern void (*memset16)(uint16_t[], uint16_t, int); - extern void (*memset32)(uint32_t[], uint32_t, int); -} - -/////////////////////////////////////////////////////////////////////////////// - -// Inlining heuristics were determined by using perf.skia.org and bench/MemsetBench.cpp. -// When using MSVC, inline is better >= 1K and worse <= 100. The Nexus Player was the opposite. -// Otherwise, when NEON or SSE is available to GCC or Clang, they can handle it best. -// See https://code.google.com/p/chromium/issues/detail?id=516426#c15 for more details. -// See also skia:4316; it might be a good idea to use rep stosw/stosd here. -#define INLINE_IF(cond) if (cond) { while (count --> 0) { *buffer++ = value; } return; } - /** Similar to memset(), but it assigns a 16bit value into the buffer. @param buffer The memory to have value copied into it @param value The 16bit value to be copied into buffer @param count The number of times value should be copied into the buffer. */ static inline void sk_memset16(uint16_t buffer[], uint16_t value, int count) { -#if defined(_MSC_VER) - INLINE_IF(count > 300) -#elif defined(SK_BUILD_FOR_ANDROID) && defined(SK_CPU_X86) - INLINE_IF(count < 300) -#elif defined(SK_ARM_HAS_NEON) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - INLINE_IF(true) -#else - INLINE_IF(count <= 10) -#endif - SkOpts::memset16(buffer, value, count); + for (int i = 0; i < count; i++) { + buffer[i] = value; + } } /** Similar to memset(), but it assigns a 32bit value into the buffer. @@ -48,20 +27,11 @@ static inline void sk_memset16(uint16_t buffer[], uint16_t value, int count) { @param count The number of times value should be copied into the buffer. */ static inline void sk_memset32(uint32_t buffer[], uint32_t value, int count) { -#if defined(_MSC_VER) - INLINE_IF(count > 300) -#elif defined(SK_BUILD_FOR_ANDROID) && defined(SK_CPU_X86) - INLINE_IF(count < 300) -#elif defined(SK_ARM_HAS_NEON) || SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - INLINE_IF(true) -#else - INLINE_IF(count <= 10) -#endif - SkOpts::memset32(buffer, value, count); + for (int i = 0; i < count; i++) { + buffer[i] = value; + } } -#undef INLINE_IF - /////////////////////////////////////////////////////////////////////////////// #define kMaxBytesInUTF8Sequence 4 diff --git a/src/core/SkOpts.cpp b/src/core/SkOpts.cpp index 7534ac5..e34b6d7 100644 --- a/src/core/SkOpts.cpp +++ b/src/core/SkOpts.cpp @@ -18,7 +18,6 @@ #include "SkMorphologyImageFilter_opts.h" #include "SkSwizzler_opts.h" #include "SkTextureCompressor_opts.h" -#include "SkUtils_opts.h" #include "SkXfermode_opts.h" namespace SK_OPTS_NS { @@ -68,8 +67,6 @@ namespace SkOpts { // If our global compile options are set high enough, these defaults might even be // CPU-specialized, e.g. a typical x86-64 machine might start with SSE2 defaults. // They'll still get a chance to be replaced with even better ones, e.g. using SSE4.1. - decltype(memset16) memset16 = sk_default::memset16; - decltype(memset32) memset32 = sk_default::memset32; decltype(create_xfermode) create_xfermode = sk_default::create_xfermode; decltype(color_cube_filter_span) color_cube_filter_span = sk_default::color_cube_filter_span; diff --git a/src/core/SkOpts.h b/src/core/SkOpts.h index c717526..42c47a5 100644 --- a/src/core/SkOpts.h +++ b/src/core/SkOpts.h @@ -23,10 +23,6 @@ namespace SkOpts { // Declare function pointers here... - // See SkUtils.h - extern void (*memset16)(uint16_t[], uint16_t, int); - extern void (*memset32)(uint32_t[], uint32_t, int); - // May return nullptr if we haven't specialized the given Mode. extern SkXfermode* (*create_xfermode)(const ProcCoeff&, SkXfermode::Mode); diff --git a/src/opts/SkOpts_neon.cpp b/src/opts/SkOpts_neon.cpp index 80fb4e9..e96cf17 100644 --- a/src/opts/SkOpts_neon.cpp +++ b/src/opts/SkOpts_neon.cpp @@ -16,13 +16,10 @@ #include "SkMorphologyImageFilter_opts.h" #include "SkSwizzler_opts.h" #include "SkTextureCompressor_opts.h" -#include "SkUtils_opts.h" #include "SkXfermode_opts.h" namespace SkOpts { void Init_neon() { - memset16 = sk_neon::memset16; - memset32 = sk_neon::memset32; create_xfermode = sk_neon::create_xfermode; box_blur_xx = sk_neon::box_blur_xx; diff --git a/src/opts/SkUtils_opts.h b/src/opts/SkUtils_opts.h deleted file mode 100644 index 44fe643..0000000 --- a/src/opts/SkUtils_opts.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright 2015 Google Inc. - * - * Use of this source code is governed by a BSD-style license that can be - * found in the LICENSE file. - */ - -#ifndef SkUtils_opts_DEFINED -#define SkUtils_opts_DEFINED - -namespace SK_OPTS_NS { - -#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 - -static void memset16(uint16_t* dst, uint16_t val, int n) { - auto dst8 = (__m128i*)dst; - auto val8 = _mm_set1_epi16(val); - for ( ; n >= 8; n -= 8) { - _mm_storeu_si128(dst8++, val8); - } - dst = (uint16_t*)dst8; - if (n & 4) { - _mm_storel_epi64((__m128i*)dst, val8); - dst += 4; - } - if (n & 2) { - *(uint32_t*)dst = _mm_cvtsi128_si32(val8); - dst += 2; - } - if (n & 1) { - *dst = val; - } -} - -static void memset32(uint32_t* dst, uint32_t val, int n) { - auto dst4 = (__m128i*)dst; - auto val4 = _mm_set1_epi32(val); - for ( ; n >= 4; n -= 4) { - _mm_storeu_si128(dst4++, val4); - } - dst = (uint32_t*)dst4; - if (n & 2) { - _mm_storel_epi64((__m128i*)dst, val4); - dst += 2; - } - if (n & 1) { - *dst = val; - } -} - -#elif defined(SK_ARM_HAS_NEON) - -static void memset16(uint16_t* dst, uint16_t value, int n) { - uint16x8_t v8 = vdupq_n_u16(value); - uint16x8x4_t v32 = {{ v8, v8, v8, v8 }}; - - while (n >= 32) { - vst4q_u16(dst, v32); // This swizzles, but we don't care: all lanes are the same, value. - dst += 32; - n -= 32; - } - switch (n / 8) { - case 3: vst1q_u16(dst, v8); dst += 8; - case 2: vst1q_u16(dst, v8); dst += 8; - case 1: vst1q_u16(dst, v8); dst += 8; - } - if (n & 4) { - vst1_u16(dst, vget_low_u16(v8)); - dst += 4; - } - switch (n & 3) { - case 3: *dst++ = value; - case 2: *dst++ = value; - case 1: *dst = value; - } -} - -static void memset32(uint32_t* dst, uint32_t value, int n) { - uint32x4_t v4 = vdupq_n_u32(value); - uint32x4x4_t v16 = {{ v4, v4, v4, v4 }}; - - while (n >= 16) { - vst4q_u32(dst, v16); // This swizzles, but we don't care: all lanes are the same, value. - dst += 16; - n -= 16; - } - switch (n / 4) { - case 3: vst1q_u32(dst, v4); dst += 4; - case 2: vst1q_u32(dst, v4); dst += 4; - case 1: vst1q_u32(dst, v4); dst += 4; - } - if (n & 2) { - vst1_u32(dst, vget_low_u32(v4)); - dst += 2; - } - if (n & 1) { - *dst = value; - } -} - -#else // Neither NEON nor SSE2. - -static void memset16(uint16_t* dst, uint16_t val, int n) { while (n --> 0) { *dst++ = val; } } -static void memset32(uint32_t* dst, uint32_t val, int n) { while (n --> 0) { *dst++ = val; } } - -#endif - -} // namespace SK_OPTS_NS - -#endif//SkUtils_opts_DEFINED -- 2.7.4