From d05a8752738f84b0115678b3cdad89237173e904 Mon Sep 17 00:00:00 2001
From: mtklein
Date: Fri, 29 Jul 2016 10:10:15 -0700
Subject: [PATCH] SkNx: add Sk4u

This lets us get at logical >> in a nicely principled way.

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2197683002
CQ_INCLUDE_TRYBOTS=master.client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2197683002
---
 src/core/SkNx.h                      |  3 +-
 src/core/SkRasterPipelineBlitter.cpp |  3 +-
 src/opts/SkNx_neon.h                 | 72 +++++++++++++++++++++++++++++++-----
 src/opts/SkNx_sse.h                  | 70 +++++++++++++++++++++++++++++------
 4 files changed, 124 insertions(+), 24 deletions(-)

diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 308addd..881a475 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -293,7 +293,8 @@ typedef SkNx<4, uint16_t> Sk4h;
 typedef SkNx<8, uint16_t> Sk8h;
 typedef SkNx<16, uint16_t> Sk16h;
 
-typedef SkNx<4, int> Sk4i;
+typedef SkNx<4, int32_t> Sk4i;
+typedef SkNx<4, uint32_t> Sk4u;
 
 // Include platform specific specializations if available.
 #if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index a27abbd..f3fc76e 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -297,8 +297,7 @@ static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
            sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
            sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
 
-    // TODO: this >> doesn't really need mask if we make it logical instead of arithmetic.
-    da = SkNx_cast<float>((Sk4i::Load(ptr) >> SK_A32_SHIFT) & 0xff) * (1/255.0f);
+    da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
 
     st->next(x, r,g,b,a, dr,dg,db,da);
 }
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index 23567b7..eea6800 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -361,15 +361,15 @@ public:
 };
 
 template <>
-class SkNx<4, int> {
+class SkNx<4, int32_t> {
 public:
     SkNx(const int32x4_t& vec) : fVec(vec) {}
 
     SkNx() {}
-    SkNx(int v) {
+    SkNx(int32_t v) {
         fVec = vdupq_n_s32(v);
     }
-    SkNx(int a, int b, int c, int d) {
+    SkNx(int32_t a, int32_t b, int32_t c, int32_t d) {
         fVec = (int32x4_t){a,b,c,d};
     }
     static SkNx Load(const void* ptr) {
@@ -378,9 +378,9 @@ public:
     void store(void* ptr) const {
         return vst1q_s32((int32_t*)ptr, fVec);
     }
-    int operator[](int k) const {
+    int32_t operator[](int k) const {
         SkASSERT(0 <= k && k < 4);
-        union { int32x4_t v; int is[4]; } pun = {fVec};
+        union { int32x4_t v; int32_t is[4]; } pun = {fVec};
         return pun.is[k&3];
     }
 
@@ -415,17 +415,69 @@ public:
     int32x4_t fVec;
 };
 
+template <>
+class SkNx<4, uint32_t> {
+public:
+    SkNx(const uint32x4_t& vec) : fVec(vec) {}
+
+    SkNx() {}
+    SkNx(uint32_t v) {
+        fVec = vdupq_n_u32(v);
+    }
+    SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+        fVec = (uint32x4_t){a,b,c,d};
+    }
+    static SkNx Load(const void* ptr) {
+        return vld1q_u32((const uint32_t*)ptr);
+    }
+    void store(void* ptr) const {
+        return vst1q_u32((uint32_t*)ptr, fVec);
+    }
+    uint32_t operator[](int k) const {
+        SkASSERT(0 <= k && k < 4);
+        union { uint32x4_t v; uint32_t us[4]; } pun = {fVec};
+        return pun.us[k&3];
+    }
+
+    SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); }
+
+    SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); }
+    SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); }
+    SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); }
+
+    SkNx operator << (int bits) const { SHIFT32(vshlq_n_u32, fVec, bits); }
+    SkNx operator >> (int bits) const { SHIFT32(vshrq_n_u32, fVec, bits); }
+
+    SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); }
+    SkNx operator < (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); }
+    SkNx operator > (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); }
+
+    static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
+    // TODO as needed
+
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
+        return vbslq_u32(fVec, t.fVec, e.fVec);
+    }
+
+    uint32x4_t fVec;
+};
+
 #undef SHIFT32
 #undef SHIFT16
 #undef SHIFT8
 
-template<> inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
+template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
     return vcvtq_s32_f32(src.fVec);
 }
-template<> inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
+template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
     return vcvtq_f32_s32(src.fVec);
 }
+template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
+    return SkNx_cast<float>(Sk4i::Load(&src));
+}
 
 template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
     return vqmovn_u32(vcvtq_u32_f32(src.fVec));
 }
@@ -468,16 +520,16 @@ template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
     return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
 }
 
-template<> inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
+template<> inline Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
     uint16x4_t _16 = vqmovun_s32(src.fVec);
     return vqmovn_u16(vcombine_u16(_16, _16));
 }
 
-template<> inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
+template<> inline Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
     return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
 }
 
-template<> inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
+template<> inline Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
     return vmovn_u32(vreinterpretq_u32_s32(src.fVec));
 }
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 3881b54..003b89f 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -130,14 +130,14 @@ public:
 };
 
 template <>
-class SkNx<4, int> {
+class SkNx<4, int32_t> {
 public:
     SkNx(const __m128i& vec) : fVec(vec) {}
 
     SkNx() {}
-    SkNx(int val) : fVec(_mm_set1_epi32(val)) {}
+    SkNx(int32_t val) : fVec(_mm_set1_epi32(val)) {}
     static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
-    SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
+    SkNx(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
 
     void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
 
@@ -161,9 +161,9 @@ public:
     SkNx operator < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); }
     SkNx operator > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); }
 
-    int operator[](int k) const {
+    int32_t operator[](int k) const {
         SkASSERT(0 <= k && k < 4);
-        union { __m128i v; int is[4]; } pun = {fVec};
+        union { __m128i v; int32_t is[4]; } pun = {fVec};
         return pun.is[k&3];
     }
 
@@ -180,6 +180,51 @@ public:
 };
 
 template <>
+class SkNx<4, uint32_t> {
+public:
+    SkNx(const __m128i& vec) : fVec(vec) {}
+
+    SkNx() {}
+    SkNx(uint32_t val) : fVec(_mm_set1_epi32(val)) {}
+    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
+    SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
+
+    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
+
+    SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
+    // Not quite sure how to best do operator * in SSE2.  We probably don't use it.
+
+    SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
+    SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+    SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }
+
+    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
+    SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); }
+
+    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
+    // operator < and > take a little extra fiddling to make work for unsigned ints.
+
+    uint32_t operator[](int k) const {
+        SkASSERT(0 <= k && k < 4);
+        union { __m128i v; uint32_t us[4]; } pun = {fVec};
+        return pun.us[k&3];
+    }
+
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+        return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
+    #else
+        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
+                            _mm_andnot_si128(fVec, e.fVec));
+    #endif
+    }
+
+    __m128i fVec;
+};
+
+
+template <>
 class SkNx<4, uint16_t> {
 public:
     SkNx(const __m128i& vec) : fVec(vec) {}
@@ -315,15 +360,18 @@ public:
     __m128i fVec;
 };
 
-template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
+template<> /*static*/ inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
     return _mm_cvtepi32_ps(src.fVec);
 }
+template<> /*static*/ inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
+    return SkNx_cast<float>(Sk4i::Load(&src));
+}
 
-template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
+template <> /*static*/ inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
     return _mm_cvttps_epi32(src.fVec);
 }
 
-template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
+template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
 #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
     // TODO: This seems to be causing code generation problems.   Investigate?
     return _mm_packus_epi32(src.fVec);
@@ -339,7 +387,7 @@ template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
 }
 
 template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
-    return SkNx_cast<uint16_t>(SkNx_cast<int>(src));
+    return SkNx_cast<uint16_t>(SkNx_cast<int32_t>(src));
 }
 
 template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
@@ -391,11 +439,11 @@ template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src)
     return _mm_packus_epi16(src.fVec, src.fVec);
 }
 
-template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
+template<> /*static*/ inline Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
     return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
 }
 
-template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
+template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
     return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
 }
-- 
2.7.4
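
The point of Sk4u is the shift operators: on Sk4i, >> is arithmetic
(vshrq_n_s32 / _mm_srai_epi32, sign-extending), while on Sk4u it is logical
(vshrq_n_u32 / _mm_srli_epi32, zero-filling). That is why load_d_srgb can
drop the trailing "& 0xff" mask when extracting alpha. A minimal standalone
sketch of the difference, not part of the patch: it assumes Skia's SkNx.h is
on the include path, hard-codes an alpha shift of 24 (the usual SK_A32_SHIFT
for this layout), and uses a made-up pixel value.

// Demonstrates arithmetic vs. logical >> on 4x 32-bit lanes.
// The pixel 0xC0112233 has alpha 0xC0 in the top byte, so the
// lane's sign bit is set and the two shifts behave differently.
#include "SkNx.h"
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t px[4] = { 0xC0112233, 0xC0112233, 0xC0112233, 0xC0112233 };

    Sk4i si = Sk4i::Load(px) >> 24;              // arithmetic: sign-extends to 0xFFFFFFC0
    Sk4u su = Sk4u::Load(px) >> 24;              // logical: zero-fills to 0x000000C0

    // Before this patch, the signed shift forced the extra mask:
    Sk4i masked = (Sk4i::Load(px) >> 24) & 0xff; // also 0x000000C0

    printf("arithmetic %08x  logical %08x  masked %08x\n",
           (unsigned)si[0], (unsigned)su[0], (unsigned)masked[0]);
    return 0;
}

Note the design of the new cast: SkNx_cast<float, uint32_t> just reloads the
lanes as Sk4i and reuses the signed int-to-float convert. That is exact for
any lane value below 2^31, and here the shifted-down alpha is at most 0xff,
so no separate unsigned convert is needed.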