From b5e861185a69b3e47d1c1bf622fd3f83e5f13898 Mon Sep 17 00:00:00 2001
From: mtklein
Date: Wed, 24 Jun 2015 15:18:39 -0700
Subject: [PATCH] Implement four more xfermodes with Sk4px.

HardLight, Overlay, Darken, and Lighten are all ~2x faster with SSE,
~25% faster with NEON.

This covers all previously-implemented NEON xfermodes.
3 previous SSE xfermodes remain.  Those need division and sqrt, so I'm
planning on using SkPMFloat for them.  It'll help the readability and
NEON speed if I move that into [0,1] space first.

The main new concept here is c.thenElse(t,e), which behaves like
(c ? t : e) except, of course, both t and e are evaluated.  This allows
us to emulate conditionals with vectors.

This also removes the concept of SkNb.  Instead of a standalone bool
vector, each SkNi or SkNf will just return their own types for
comparisons.  Turns out to be a lot more manageable this way.

BUG=skia:

Committed: https://skia.googlesource.com/skia/+/b9d4163bebab0f5639f9c5928bb5fc15f472dddc

CQ_EXTRA_TRYBOTS=client.skia.compile:Build-Ubuntu-GCC-Arm64-Debug-Android-Trybot

Review URL: https://codereview.chromium.org/1196713004
---
 src/core/Sk4px.h                      |   2 +
 src/core/Sk4pxXfermode.h              |  60 ++++++++++++++---
 src/core/SkNx.h                       |  92 ++++++++++++--------
 src/opts/SkNx_neon.h                  | 120 +++++++++++++++++-----------------
 src/opts/SkNx_sse.h                   |  94 +++++++++++---------------
 src/opts/SkXfermode_opts_SSE2.cpp     |   8 ++-
 src/opts/SkXfermode_opts_arm_neon.cpp |   1 +
 7 files changed, 199 insertions(+), 178 deletions(-)

diff --git a/src/core/Sk4px.h b/src/core/Sk4px.h
index 26d4d0f..e046e26 100644
--- a/src/core/Sk4px.h
+++ b/src/core/Sk4px.h
@@ -85,6 +85,8 @@ public:
     // These just keep the types as Sk4px so the user doesn't have to keep casting.
     Sk4px operator + (const Sk4px& o) const { return INHERITED::operator+(o); }
     Sk4px operator - (const Sk4px& o) const { return INHERITED::operator-(o); }
+    Sk4px operator < (const Sk4px& o) const { return INHERITED::operator<(o); }
+    Sk4px thenElse(const Sk4px& t, const Sk4px& e) const { return INHERITED::thenElse(t,e); }
 
     // Generally faster than (*this * o).div255().
     // May be incorrect by +-1, but is always exactly correct when *this or o is 0 or 255.
diff --git a/src/core/Sk4pxXfermode.h b/src/core/Sk4pxXfermode.h
index b4ebd85..09490dc 100644
--- a/src/core/Sk4pxXfermode.h
+++ b/src/core/Sk4pxXfermode.h
@@ -60,6 +60,44 @@ XFERMODE(Exclusion) {
     return (s - p) + (d - p.zeroAlphas());
 }
 
+XFERMODE(HardLight) {
+    auto alphas = SrcOver::Xfer(s,d);
+
+    auto sa = s.alphas(),
+         da = d.alphas();
+
+    auto isDark = s < (sa-s);
+
+    auto dark = s*d << 1,
+         lite = sa*da - ((da-d)*(sa-s) << 1),
+         both = s*da.inv() + d*sa.inv();
+
+    // TODO: do isDark in 16-bit so we only have to div255() once.
+    auto colors = isDark.thenElse((dark + both).div255(),
+                                  (lite + both).div255());
+    return alphas.zeroColors() + colors.zeroAlphas();
+}
+XFERMODE(Overlay) { return HardLight::Xfer(d,s); }
+
+XFERMODE(Darken) {
+    auto sda = s.approxMulDiv255(d.alphas()),
+         dsa = d.approxMulDiv255(s.alphas());
+    auto srcover = s + (d - dsa),
+         dstover = d + (s - sda);
+    auto alphas = srcover,
+         colors = (sda < dsa).thenElse(srcover, dstover);
+    return alphas.zeroColors() + colors.zeroAlphas();
+}
+XFERMODE(Lighten) {
+    auto sda = s.approxMulDiv255(d.alphas()),
+         dsa = d.approxMulDiv255(s.alphas());
+    auto srcover = s + (d - dsa),
+         dstover = d + (s - sda);
+    auto alphas = srcover,
+         colors = (sda < dsa).thenElse(dstover, srcover);
+    return alphas.zeroColors() + colors.zeroAlphas();
+}
+
 #undef XFERMODE
 
 // A reasonable fallback mode for doing AA is to simply apply the transfermode first,
@@ -71,17 +109,15 @@ static Sk4px xfer_aa(const Sk4px& s, const Sk4px& d, const Sk4px& aa) {
 }
 
 // For some transfermodes we specialize AA, either for correctness or performance.
-#ifndef SK_NO_SPECIALIZED_AA_XFERMODES
-    #define XFERMODE_AA(Name) \
-        template <> Sk4px xfer_aa<SkXfermode::k##Name##_Mode>(const Sk4px& s, const Sk4px& d, const Sk4px& aa)
+#define XFERMODE_AA(Name) \
+    template <> Sk4px xfer_aa<SkXfermode::k##Name##_Mode>(const Sk4px& s, const Sk4px& d, const Sk4px& aa)
 
-    // Plus' clamp needs to happen after AA.  skia:3852
-    XFERMODE_AA(Plus) {  // [ clamp( (1-AA)D + (AA)(S+D) ) == clamp(D + AA*S) ]
-        return d.saturatedAdd(s.approxMulDiv255(aa));
-    }
+// Plus' clamp needs to happen after AA.  skia:3852
+XFERMODE_AA(Plus) {  // [ clamp( (1-AA)D + (AA)(S+D) ) == clamp(D + AA*S) ]
+    return d.saturatedAdd(s.approxMulDiv255(aa));
+}
 
-    #undef XFERMODE_AA
-#endif
+#undef XFERMODE_AA
 
 template
 class SkT4pxXfermode : public SkProcCoeffXfermode {
@@ -130,6 +166,12 @@ static SkProcCoeffXfermode* SkCreate4pxXfermode(const ProcCoeff& rec, SkXfermode
         case SkXfermode::kMultiply_Mode:   return SkT4pxXfermode<Multiply>::Create(rec);
         case SkXfermode::kDifference_Mode: return SkT4pxXfermode<Difference>::Create(rec);
         case SkXfermode::kExclusion_Mode:  return SkT4pxXfermode<Exclusion>::Create(rec);
+#if !defined(SK_SUPPORT_LEGACY_XFERMODES)  // For staging in Chrome (layout tests).
+        case SkXfermode::kHardLight_Mode:  return SkT4pxXfermode<HardLight>::Create(rec);
+        case SkXfermode::kOverlay_Mode:    return SkT4pxXfermode<Overlay>::Create(rec);
+        case SkXfermode::kDarken_Mode:     return SkT4pxXfermode<Darken>::Create(rec);
+        case SkXfermode::kLighten_Mode:    return SkT4pxXfermode<Lighten>::Create(rec);
+#endif
         default: break;
     }
 #endif
diff --git a/src/core/SkNx.h b/src/core/SkNx.h
index 1342266..dadb3ec 100644
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -26,22 +26,6 @@ namespace {
 
 // The default implementations just fall back on a pair of size N/2.
 
-// SkNb is a _very_ minimal class representing a vector of bools returned by comparison operators.
-// We pass along the byte size of the compared types (Bytes) to help platform specializations.
-template <int N, int Bytes>
-class SkNb {
-public:
-    SkNb() {}
-    SkNb(const SkNb<N/2, Bytes>& lo, const SkNb<N/2, Bytes>& hi) : fLo(lo), fHi(hi) {}
-
-    bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
-    bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
-
-protected:
-    REQUIRE(0 == (N & (N-1)));
-    SkNb<N/2, Bytes> fLo, fHi;
-};
-
 template <int N, typename T>
 class SkNi {
 public:
@@ -78,14 +62,19 @@ public:
     static SkNi Min(const SkNi& a, const SkNi& b) {
         return SkNi(SkNi<N/2, T>::Min(a.fLo, b.fLo), SkNi<N/2, T>::Min(a.fHi, b.fHi));
     }
-
-    // TODO: comparisons, max?
+    SkNi operator < (const SkNi& o) const { return SkNi(fLo < o.fLo, fHi < o.fHi); }
 
     template <int k> T kth() const {
         SkASSERT(0 <= k && k < N);
         return k < N/2 ? fLo.template kth<k>() : fHi.template kth<k-N/2>();
     }
 
+    bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
+    bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
+    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+        return SkNi(fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi));
+    }
+
 protected:
     REQUIRE(0 == (N & (N-1)));
@@ -94,11 +83,9 @@ protected:
 template <int N, typename T>
 class SkNf {
-    typedef SkNb<N, sizeof(T)> Nb;
-
     static int32_t MyNi(float);
     static int64_t MyNi(double);
-    typedef SkNi<N, decltype(MyNi(T()))> Ni;
+    typedef decltype(MyNi(T())) I;
 public:
     SkNf() {}
     explicit SkNf(T val) : fLo(val), fHi(val) {}
@@ -115,19 +102,19 @@ public:
         fHi.store(vals+N/2);
     }
 
-    Ni castTrunc() const { return Ni(fLo.castTrunc(), fHi.castTrunc()); }
+    SkNi<N,I> castTrunc() const { return SkNi<N,I>(fLo.castTrunc(), fHi.castTrunc()); }
 
     SkNf operator + (const SkNf& o) const { return SkNf(fLo + o.fLo, fHi + o.fHi); }
     SkNf operator - (const SkNf& o) const { return SkNf(fLo - o.fLo, fHi - o.fHi); }
     SkNf operator * (const SkNf& o) const { return SkNf(fLo * o.fLo, fHi * o.fHi); }
     SkNf operator / (const SkNf& o) const { return SkNf(fLo / o.fLo, fHi / o.fHi); }
 
-    Nb operator == (const SkNf& o) const { return Nb(fLo == o.fLo, fHi == o.fHi); }
-    Nb operator != (const SkNf& o) const { return Nb(fLo != o.fLo, fHi != o.fHi); }
-    Nb operator < (const SkNf& o) const { return Nb(fLo < o.fLo, fHi < o.fHi); }
-    Nb operator > (const SkNf& o) const { return Nb(fLo > o.fLo, fHi > o.fHi); }
-    Nb operator <= (const SkNf& o) const { return Nb(fLo <= o.fLo, fHi <= o.fHi); }
-    Nb operator >= (const SkNf& o) const { return Nb(fLo >= o.fLo, fHi >= o.fHi); }
+    SkNf operator == (const SkNf& o) const { return SkNf(fLo == o.fLo, fHi == o.fHi); }
+    SkNf operator != (const SkNf& o) const { return SkNf(fLo != o.fLo, fHi != o.fHi); }
+    SkNf operator < (const SkNf& o) const { return SkNf(fLo < o.fLo, fHi < o.fHi); }
+    SkNf operator > (const SkNf& o) const { return SkNf(fLo > o.fLo, fHi > o.fHi); }
+    SkNf operator <= (const SkNf& o) const { return SkNf(fLo <= o.fLo, fHi <= o.fHi); }
+    SkNf operator >= (const SkNf& o) const { return SkNf(fLo >= o.fLo, fHi >= o.fHi); }
 
     static SkNf Min(const SkNf& l, const SkNf& r) {
         return SkNf(SkNf<N/2, T>::Min(l.fLo, r.fLo), SkNf<N/2, T>::Min(l.fHi, r.fHi));
@@ -151,6 +138,12 @@ public:
         return k < N/2 ? fLo.template kth<k>() : fHi.template kth<k-N/2>();
     }
 
+    bool allTrue() const { return fLo.allTrue() && fHi.allTrue(); }
+    bool anyTrue() const { return fLo.anyTrue() || fHi.anyTrue(); }
+    SkNf thenElse(const SkNf& t, const SkNf& e) const {
+        return SkNf(fLo.thenElse(t.fLo, e.fLo), fHi.thenElse(t.fHi, e.fHi));
+    }
+
 protected:
     REQUIRE(0 == (N & (N-1)));
     SkNf(const SkNf<N/2, T>& lo, const SkNf<N/2, T>& hi) : fLo(lo), fHi(hi) {}
@@ -161,17 +154,6 @@ protected:
 
 // Bottom out the default implementations with scalars when nothing's been specialized.
 
-template <int Bytes>
-class SkNb<1, Bytes> {
-public:
-    SkNb() {}
-    explicit SkNb(bool val) : fVal(val) {}
-    bool allTrue() const { return fVal; }
-    bool anyTrue() const { return fVal; }
-protected:
-    bool fVal;
-};
-
 template <typename T>
 class SkNi<1,T> {
 public:
@@ -195,23 +177,26 @@ public:
     SkNi operator >> (int bits) const { return SkNi(fVal >> bits); }
 
     static SkNi Min(const SkNi& a, const SkNi& b) { return SkNi(SkTMin(a.fVal, b.fVal)); }
+    SkNi operator <(const SkNi& o) const { return SkNi(fVal < o.fVal); }
 
     template <int k> T kth() const {
        SkASSERT(0 == k);
        return fVal;
     }
 
+    bool allTrue() const { return fVal; }
+    bool anyTrue() const { return fVal; }
+    SkNi thenElse(const SkNi& t, const SkNi& e) const { return fVal ? t : e; }
+
 protected:
     T fVal;
 };
 
 template <typename T>
 class SkNf<1,T> {
-    typedef SkNb<1, sizeof(T)> Nb;
-
     static int32_t MyNi(float);
     static int64_t MyNi(double);
-    typedef SkNi<1, decltype(MyNi(T()))> Ni;
+    typedef decltype(MyNi(T())) I;
 public:
     SkNf() {}
     explicit SkNf(T val) : fVal(val) {}
@@ -219,19 +204,19 @@ public:
 
     void store(T vals[1]) const { vals[0] = fVal; }
 
-    Ni castTrunc() const { return Ni(fVal); }
+    SkNi<1,I> castTrunc() const { return SkNi<1,I>(fVal); }
 
     SkNf operator + (const SkNf& o) const { return SkNf(fVal + o.fVal); }
     SkNf operator - (const SkNf& o) const { return SkNf(fVal - o.fVal); }
     SkNf operator * (const SkNf& o) const { return SkNf(fVal * o.fVal); }
     SkNf operator / (const SkNf& o) const { return SkNf(fVal / o.fVal); }
 
-    Nb operator == (const SkNf& o) const { return Nb(fVal == o.fVal); }
-    Nb operator != (const SkNf& o) const { return Nb(fVal != o.fVal); }
-    Nb operator < (const SkNf& o) const { return Nb(fVal < o.fVal); }
-    Nb operator > (const SkNf& o) const { return Nb(fVal > o.fVal); }
-    Nb operator <= (const SkNf& o) const { return Nb(fVal <= o.fVal); }
-    Nb operator >= (const SkNf& o) const { return Nb(fVal >= o.fVal); }
+    SkNf operator == (const SkNf& o) const { return SkNf(fVal == o.fVal); }
+    SkNf operator != (const SkNf& o) const { return SkNf(fVal != o.fVal); }
+    SkNf operator < (const SkNf& o) const { return SkNf(fVal < o.fVal); }
+    SkNf operator > (const SkNf& o) const { return SkNf(fVal > o.fVal); }
+    SkNf operator <= (const SkNf& o) const { return SkNf(fVal <= o.fVal); }
+    SkNf operator >= (const SkNf& o) const { return SkNf(fVal >= o.fVal); }
 
     static SkNf Min(const SkNf& l, const SkNf& r) { return SkNf(SkTMin(l.fVal, r.fVal)); }
     static SkNf Max(const SkNf& l, const SkNf& r) { return SkNf(SkTMax(l.fVal, r.fVal)); }
@@ -249,12 +234,21 @@ public:
         return fVal;
     }
 
+    bool allTrue() const { return this->pun(); }
+    bool anyTrue() const { return this->pun(); }
+    SkNf thenElse(const SkNf& t, const SkNf& e) const { return this->pun() ? t : e; }
+
 protected:
     // We do double sqrts natively, or via floats for any other type.
     template <typename U> static U Sqrt(U val) { return (U) ::sqrtf((float)val); }
     static double Sqrt(double val) { return ::sqrt ( val); }
 
+    I pun() const {
+        union { T f; I i; } pun = { fVal };
+        return pun.i;
+    }
+
     T fVal;
 };
 
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h
index da926e0..b319807 100644
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -33,34 +33,7 @@ namespace {  // See SkNx.h
                                    case 31: return op(v, 31); } return fVec
 
 template <>
-class SkNb<2, 4> {
-public:
-    SkNb(uint32x2_t vec) : fVec(vec) {}
-
-    SkNb() {}
-    bool allTrue() const { return vget_lane_u32(fVec, 0) && vget_lane_u32(fVec, 1); }
-    bool anyTrue() const { return vget_lane_u32(fVec, 0) || vget_lane_u32(fVec, 1); }
-
-    uint32x2_t fVec;
-};
-
-template <>
-class SkNb<4, 4> {
-public:
-    SkNb(uint32x4_t vec) : fVec(vec) {}
-
-    SkNb() {}
-    bool allTrue() const { return vgetq_lane_u32(fVec, 0) && vgetq_lane_u32(fVec, 1)
-                               && vgetq_lane_u32(fVec, 2) && vgetq_lane_u32(fVec, 3); }
-    bool anyTrue() const { return vgetq_lane_u32(fVec, 0) || vgetq_lane_u32(fVec, 1)
-                               || vgetq_lane_u32(fVec, 2) || vgetq_lane_u32(fVec, 3); }
-
-    uint32x4_t fVec;
-};
-
-template <>
 class SkNf<2, float> {
-    typedef SkNb<2, 4> Nb;
 public:
     SkNf(float32x2_t vec) : fVec(vec) {}
 
@@ -93,12 +66,14 @@ public:
 #endif
     }
 
-    Nb operator == (const SkNf& o) const { return vceq_f32(fVec, o.fVec); }
-    Nb operator < (const SkNf& o) const { return vclt_f32(fVec, o.fVec); }
-    Nb operator > (const SkNf& o) const { return vcgt_f32(fVec, o.fVec); }
-    Nb operator <= (const SkNf& o) const { return vcle_f32(fVec, o.fVec); }
-    Nb operator >= (const SkNf& o) const { return vcge_f32(fVec, o.fVec); }
-    Nb operator != (const SkNf& o) const { return vmvn_u32(vceq_f32(fVec, o.fVec)); }
+    SkNf operator == (const SkNf& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
+    SkNf operator < (const SkNf& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
+    SkNf operator > (const SkNf& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
+    SkNf operator <= (const SkNf& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
+    SkNf operator >= (const SkNf& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
+    SkNf operator != (const SkNf& o) const {
+        return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
+    }
 
     static SkNf Min(const SkNf& l, const SkNf& r) { return vmin_f32(l.fVec, r.fVec); }
     static SkNf Max(const SkNf& l, const SkNf& r) { return vmax_f32(l.fVec, r.fVec); }
@@ -126,25 +101,21 @@ public:
         return vget_lane_f32(fVec, k&1);
     }
 
+    bool allTrue() const {
+        auto v = vreinterpret_u32_f32(fVec);
+        return vget_lane_u32(v,0) && vget_lane_u32(v,1);
+    }
+    bool anyTrue() const {
+        auto v = vreinterpret_u32_f32(fVec);
+        return vget_lane_u32(v,0) || vget_lane_u32(v,1);
+    }
+
     float32x2_t fVec;
 };
 
 #if defined(SK_CPU_ARM64)
 template <>
-class SkNb<2, 8> {
-public:
-    SkNb(uint64x2_t vec) : fVec(vec) {}
-
-    SkNb() {}
-    bool allTrue() const { return vgetq_lane_u64(fVec, 0) && vgetq_lane_u64(fVec, 1); }
-    bool anyTrue() const { return vgetq_lane_u64(fVec, 0) || vgetq_lane_u64(fVec, 1); }
-
-    uint64x2_t fVec;
-};
-
-template <>
 class SkNf<2, double> {
-    typedef SkNb<2, 8> Nb;
 public:
     SkNf(float64x2_t vec) : fVec(vec) {}
 
@@ -160,13 +131,14 @@ public:
     SkNf operator * (const SkNf& o) const { return vmulq_f64(fVec, o.fVec); }
     SkNf operator / (const SkNf& o) const { return vdivq_f64(fVec, o.fVec); }
 
-    Nb operator == (const SkNf& o) const { return vceqq_f64(fVec, o.fVec); }
-    Nb operator < (const SkNf& o) const { return vcltq_f64(fVec, o.fVec); }
-    Nb operator > (const SkNf& o) const { return vcgtq_f64(fVec, o.fVec); }
-    Nb operator <= (const SkNf& o) const { return vcleq_f64(fVec, o.fVec); }
-    Nb operator >= (const SkNf& o) const { return vcgeq_f64(fVec, o.fVec); }
-    Nb operator != (const SkNf& o) const {
-        return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(fVec, o.fVec))));
+    // vreinterpretq_f64_u64 and vreinterpretq_f64_u32 don't seem to exist.... weird.
+    SkNf operator==(const SkNf& o) const { return (float64x2_t)(vceqq_f64(fVec, o.fVec)); }
+    SkNf operator <(const SkNf& o) const { return (float64x2_t)(vcltq_f64(fVec, o.fVec)); }
+    SkNf operator >(const SkNf& o) const { return (float64x2_t)(vcgtq_f64(fVec, o.fVec)); }
+    SkNf operator<=(const SkNf& o) const { return (float64x2_t)(vcleq_f64(fVec, o.fVec)); }
+    SkNf operator>=(const SkNf& o) const { return (float64x2_t)(vcgeq_f64(fVec, o.fVec)); }
+    SkNf operator != (const SkNf& o) const {
+        return (float64x2_t)(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(fVec, o.fVec))));
     }
 
     static SkNf Min(const SkNf& l, const SkNf& r) { return vminq_f64(l.fVec, r.fVec); }
@@ -202,6 +174,16 @@ public:
         return vgetq_lane_f64(fVec, k&1);
     }
 
+    // vreinterpretq_u64_f64 doesn't seem to exist.... weird.
+    bool allTrue() const {
+        auto v = (uint64x2_t)(fVec);
+        return vgetq_lane_u64(v,0) && vgetq_lane_u64(v,1);
+    }
+    bool anyTrue() const {
+        auto v = (uint64x2_t)(fVec);
+        return vgetq_lane_u64(v,0) || vgetq_lane_u64(v,1);
+    }
+
     float64x2_t fVec;
 };
 #endif//defined(SK_CPU_ARM64)
@@ -235,7 +217,6 @@ public:
 
 template <>
 class SkNf<4, float> {
-    typedef SkNb<4, 4> Nb;
 public:
     SkNf(float32x4_t vec) : fVec(vec) {}
 
@@ -270,12 +251,14 @@ public:
 #endif
     }
 
-    Nb operator == (const SkNf& o) const { return vceqq_f32(fVec, o.fVec); }
-    Nb operator < (const SkNf& o) const { return vcltq_f32(fVec, o.fVec); }
-    Nb operator > (const SkNf& o) const { return vcgtq_f32(fVec, o.fVec); }
-    Nb operator <= (const SkNf& o) const { return vcleq_f32(fVec, o.fVec); }
-    Nb operator >= (const SkNf& o) const { return vcgeq_f32(fVec, o.fVec); }
-    Nb operator != (const SkNf& o) const { return vmvnq_u32(vceqq_f32(fVec, o.fVec)); }
+    SkNf operator==(const SkNf& o) const { return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec)); }
+    SkNf operator <(const SkNf& o) const { return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec)); }
+    SkNf operator >(const SkNf& o) const { return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec)); }
+    SkNf operator<=(const SkNf& o) const { return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec)); }
+    SkNf operator>=(const SkNf& o) const { return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec)); }
+    SkNf operator!=(const SkNf& o) const {
+        return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
+    }
 
     static SkNf Min(const SkNf& l, const SkNf& r) { return vminq_f32(l.fVec, r.fVec); }
     static SkNf Max(const SkNf& l, const SkNf& r) { return vmaxq_f32(l.fVec, r.fVec); }
@@ -303,6 +286,17 @@ public:
         return vgetq_lane_f32(fVec, k&3);
     }
 
+    bool allTrue() const {
+        auto v = vreinterpretq_u32_f32(fVec);
+        return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
+            && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
+    }
+    bool anyTrue() const {
+        auto v = vreinterpretq_u32_f32(fVec);
+        return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
+            || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
+    }
+
     float32x4_t fVec;
 };
 
@@ -363,12 +357,18 @@ public:
     SkNi operator - (const SkNi& o) const { return vsubq_u8(fVec, o.fVec); }
 
     static SkNi Min(const SkNi& a, const SkNi& b) { return vminq_u8(a.fVec, b.fVec); }
+    SkNi operator < (const SkNi& o) const { return vcltq_u8(fVec, o.fVec); }
 
     template <int k> uint8_t kth() const {
         SkASSERT(0 <= k && k < 15);
         return vgetq_lane_u8(fVec, k&16);
     }
 
+    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+        return vorrq_u8(vandq_u8(t.fVec, fVec),
+                        vbicq_u8(e.fVec, fVec));
+    }
+
     uint8x16_t fVec;
 };
 
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h
index 12a4719..9b4de70 100644
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -12,46 +12,9 @@ namespace {  // See SkNx.h
 
-template <>
-class SkNb<2, 4> {
-public:
-    SkNb(const __m128i& vec) : fVec(vec) {}
-
-    SkNb() {}
-    bool allTrue() const { return 0xff == (_mm_movemask_epi8(fVec) & 0xff); }
-    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(fVec) & 0xff); }
-
-    __m128i fVec;
-};
-
-template <>
-class SkNb<4, 4> {
-public:
-    SkNb(const __m128i& vec) : fVec(vec) {}
-
-    SkNb() {}
-    bool allTrue() const { return 0xffff == _mm_movemask_epi8(fVec); }
-    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(fVec); }
-
-    __m128i fVec;
-};
-
-template <>
-class SkNb<2, 8> {
-public:
-    SkNb(const __m128i& vec) : fVec(vec) {}
-
-    SkNb() {}
-    bool allTrue() const { return 0xffff == _mm_movemask_epi8(fVec); }
-    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(fVec); }
-
-    __m128i fVec;
-};
-
 template <>
 class SkNf<2, float> {
-    typedef SkNb<2, 4> Nb;
 public:
     SkNf(const __m128& vec) : fVec(vec) {}
 
@@ -69,12 +32,12 @@ public:
     SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
     SkNf operator / (const SkNf& o) const { return _mm_div_ps(fVec, o.fVec); }
 
-    Nb operator == (const SkNf& o) const { return _mm_castps_si128(_mm_cmpeq_ps (fVec, o.fVec)); }
-    Nb operator != (const SkNf& o) const { return _mm_castps_si128(_mm_cmpneq_ps(fVec, o.fVec)); }
-    Nb operator < (const SkNf& o) const { return _mm_castps_si128(_mm_cmplt_ps (fVec, o.fVec)); }
-    Nb operator > (const SkNf& o) const { return _mm_castps_si128(_mm_cmpgt_ps (fVec, o.fVec)); }
-    Nb operator <= (const SkNf& o) const { return _mm_castps_si128(_mm_cmple_ps (fVec, o.fVec)); }
-    Nb operator >= (const SkNf& o) const { return _mm_castps_si128(_mm_cmpge_ps (fVec, o.fVec)); }
+    SkNf operator == (const SkNf& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
+    SkNf operator != (const SkNf& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
+    SkNf operator < (const SkNf& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
+    SkNf operator > (const SkNf& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
+    SkNf operator <= (const SkNf& o) const { return _mm_cmple_ps (fVec, o.fVec); }
+    SkNf operator >= (const SkNf& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
 
     static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_ps(l.fVec, r.fVec); }
     static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }
@@ -93,12 +56,14 @@ public:
         return pun.fs[k&1];
     }
 
+    bool allTrue() const { return 0xff == (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
+    bool anyTrue() const { return 0x00 != (_mm_movemask_epi8(_mm_castps_si128(fVec)) & 0xff); }
+
     __m128 fVec;
 };
 
 template <>
 class SkNf<2, double> {
-    typedef SkNb<2, 8> Nb;
 public:
     SkNf(const __m128d& vec) : fVec(vec) {}
 
@@ -114,12 +79,12 @@ public:
     SkNf operator * (const SkNf& o) const { return _mm_mul_pd(fVec, o.fVec); }
     SkNf operator / (const SkNf& o) const { return _mm_div_pd(fVec, o.fVec); }
 
-    Nb operator == (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpeq_pd (fVec, o.fVec)); }
-    Nb operator != (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpneq_pd(fVec, o.fVec)); }
-    Nb operator < (const SkNf& o) const { return _mm_castpd_si128(_mm_cmplt_pd (fVec, o.fVec)); }
-    Nb operator > (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpgt_pd (fVec, o.fVec)); }
-    Nb operator <= (const SkNf& o) const { return _mm_castpd_si128(_mm_cmple_pd (fVec, o.fVec)); }
-    Nb operator >= (const SkNf& o) const { return _mm_castpd_si128(_mm_cmpge_pd (fVec, o.fVec)); }
+    SkNf operator == (const SkNf& o) const { return _mm_cmpeq_pd (fVec, o.fVec); }
+    SkNf operator != (const SkNf& o) const { return _mm_cmpneq_pd(fVec, o.fVec); }
+    SkNf operator < (const SkNf& o) const { return _mm_cmplt_pd (fVec, o.fVec); }
+    SkNf operator > (const SkNf& o) const { return _mm_cmpgt_pd (fVec, o.fVec); }
+    SkNf operator <= (const SkNf& o) const { return _mm_cmple_pd (fVec, o.fVec); }
+    SkNf operator >= (const SkNf& o) const { return _mm_cmpge_pd (fVec, o.fVec); }
 
     static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_pd(l.fVec, r.fVec); }
     static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_pd(l.fVec, r.fVec); }
@@ -138,6 +103,9 @@ public:
         return pun.ds[k&1];
     }
 
+    bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castpd_si128(fVec)); }
+    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castpd_si128(fVec)); }
+
     __m128d fVec;
 };
 
@@ -181,7 +149,6 @@ public:
 
 template <>
 class SkNf<4, float> {
-    typedef SkNb<4, 4> Nb;
 public:
     SkNf(const __m128& vec) : fVec(vec) {}
 
@@ -199,12 +166,12 @@ public:
     SkNf operator * (const SkNf& o) const { return _mm_mul_ps(fVec, o.fVec); }
     SkNf operator / (const SkNf& o) const { return _mm_div_ps(fVec, o.fVec); }
 
-    Nb operator == (const SkNf& o) const { return _mm_castps_si128(_mm_cmpeq_ps (fVec, o.fVec)); }
-    Nb operator != (const SkNf& o) const { return _mm_castps_si128(_mm_cmpneq_ps(fVec, o.fVec)); }
-    Nb operator < (const SkNf& o) const { return _mm_castps_si128(_mm_cmplt_ps (fVec, o.fVec)); }
-    Nb operator > (const SkNf& o) const { return _mm_castps_si128(_mm_cmpgt_ps (fVec, o.fVec)); }
-    Nb operator <= (const SkNf& o) const { return _mm_castps_si128(_mm_cmple_ps (fVec, o.fVec)); }
-    Nb operator >= (const SkNf& o) const { return _mm_castps_si128(_mm_cmpge_ps (fVec, o.fVec)); }
+    SkNf operator == (const SkNf& o) const { return _mm_cmpeq_ps (fVec, o.fVec); }
+    SkNf operator != (const SkNf& o) const { return _mm_cmpneq_ps(fVec, o.fVec); }
+    SkNf operator < (const SkNf& o) const { return _mm_cmplt_ps (fVec, o.fVec); }
+    SkNf operator > (const SkNf& o) const { return _mm_cmpgt_ps (fVec, o.fVec); }
+    SkNf operator <= (const SkNf& o) const { return _mm_cmple_ps (fVec, o.fVec); }
+    SkNf operator >= (const SkNf& o) const { return _mm_cmpge_ps (fVec, o.fVec); }
 
     static SkNf Min(const SkNf& l, const SkNf& r) { return _mm_min_ps(l.fVec, r.fVec); }
     static SkNf Max(const SkNf& l, const SkNf& r) { return _mm_max_ps(l.fVec, r.fVec); }
@@ -223,6 +190,9 @@ public:
         return pun.fs[k&3];
     }
 
+    bool allTrue() const { return 0xffff == _mm_movemask_epi8(_mm_castps_si128(fVec)); }
+    bool anyTrue() const { return 0x0000 != _mm_movemask_epi8(_mm_castps_si128(fVec)); }
+
     __m128 fVec;
 };
 
@@ -312,6 +282,11 @@ public:
     SkNi operator - (const SkNi& o) const { return _mm_sub_epi8(fVec, o.fVec); }
 
     static SkNi Min(const SkNi& a, const SkNi& b) { return _mm_min_epu8(a.fVec, b.fVec); }
+    SkNi operator < (const SkNi& o) const {
+        // There's no unsigned _mm_cmplt_epu8, so we flip the sign bits then use a signed compare.
+        auto flip = _mm_set1_epi8(char(0x80));
+        return _mm_cmplt_epi8(_mm_xor_si128(flip, fVec), _mm_xor_si128(flip, o.fVec));
+    }
 
     template <int k> uint8_t kth() const {
         SkASSERT(0 <= k && k < 16);
@@ -320,6 +295,11 @@ public:
         return k % 2 == 0 ? pair : (pair >> 8);
     }
 
+    SkNi thenElse(const SkNi& t, const SkNi& e) const {
+        return _mm_or_si128(_mm_and_si128 (fVec, t.fVec),
+                            _mm_andnot_si128(fVec, e.fVec));
+    }
+
     __m128i fVec;
 };
 
diff --git a/src/opts/SkXfermode_opts_SSE2.cpp b/src/opts/SkXfermode_opts_SSE2.cpp
index b924770..f877280 100644
--- a/src/opts/SkXfermode_opts_SSE2.cpp
+++ b/src/opts/SkXfermode_opts_SSE2.cpp
@@ -515,15 +515,17 @@ void SkSSE2ProcCoeffXfermode::toString(SkString* str) const {
 SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_SSE2(const ProcCoeff& rec,
                                                          SkXfermode::Mode mode) {
     SkXfermodeProcSIMD proc = nullptr;
-    // TODO(mtklein): implement these Sk4px.
     switch (mode) {
+        // TODO(mtklein): Sk4pxXfermode has these now.  Clean up.
         case SkProcCoeffXfermode::kOverlay_Mode:    proc = overlay_modeproc_SSE2;    break;
         case SkProcCoeffXfermode::kDarken_Mode:     proc = darken_modeproc_SSE2;     break;
         case SkProcCoeffXfermode::kLighten_Mode:    proc = lighten_modeproc_SSE2;    break;
-        case SkProcCoeffXfermode::kColorDodge_Mode: proc = colordodge_modeproc_SSE2; break;
-        case SkProcCoeffXfermode::kColorBurn_Mode:  proc = colorburn_modeproc_SSE2;  break;
         case SkProcCoeffXfermode::kHardLight_Mode:  proc = hardlight_modeproc_SSE2;  break;
+
+        // TODO(mtklein): implement these with SkPMFloat.
         case SkProcCoeffXfermode::kSoftLight_Mode:  proc = softlight_modeproc_SSE2;  break;
+        case SkProcCoeffXfermode::kColorDodge_Mode: proc = colordodge_modeproc_SSE2; break;
+        case SkProcCoeffXfermode::kColorBurn_Mode:  proc = colorburn_modeproc_SSE2;  break;
         default: break;
     }
     return proc ? SkNEW_ARGS(SkSSE2ProcCoeffXfermode, (rec, mode, (void*)proc)) : nullptr;
diff --git a/src/opts/SkXfermode_opts_arm_neon.cpp b/src/opts/SkXfermode_opts_arm_neon.cpp
index 1759429..205a00b 100644
--- a/src/opts/SkXfermode_opts_arm_neon.cpp
+++ b/src/opts/SkXfermode_opts_arm_neon.cpp
@@ -1016,6 +1016,7 @@ SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec,
     if (auto xfermode = SkCreate4pxXfermode(rec, mode)) {
         return xfermode;
     }
+    // TODO: Sk4pxXfermode now covers every mode found in this file.  Delete them all!
     if (auto proc = gNEONXfermodeProcs[mode]) {
        return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, (void*)proc));
    }
-- 
2.7.4
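For readers unfamiliar with the mask-and-select idiom the patch leans on, the per-lane sketch below models what the new comparison operators and thenElse() do, and how the HardLight kernel uses them. This is an illustrative approximation written for this review, not code from the patch: the helper names (then_else_lane, less_than_u8_lane, hardlight_color_lane) are invented, and exact /255 division is used where Sk4px uses a faster approximation.

    #include <cstdint>

    // thenElse(): a vector comparison yields a per-lane mask of all-ones (0xff) or
    // all-zeros, so (c ? t : e) becomes a branch-free bitwise select, mirroring the
    // _mm_and_si128/_mm_andnot_si128 and vandq_u8/vbicq_u8 pairs in the patch.
    static inline uint8_t then_else_lane(uint8_t mask, uint8_t t, uint8_t e) {
        return uint8_t((mask & t) | (~mask & e));
    }

    // operator< on unsigned bytes under SSE2: there is no unsigned byte compare, so
    // both operands have their sign bit flipped (XOR 0x80) and a signed compare runs.
    static inline uint8_t less_than_u8_lane(uint8_t a, uint8_t b) {
        int8_t sa = int8_t(a ^ 0x80),
               sb = int8_t(b ^ 0x80);
        return sa < sb ? 0xFF : 0x00;   // all-ones mask on true, like the intrinsics
    }

    // HardLight color channel, mirroring the Sk4px kernel: pick the "dark" or "lite"
    // product depending on whether 2*s < sa, add the straight-through term, then
    // divide by 255 (alpha is handled separately via SrcOver in the patch).
    static inline uint8_t hardlight_color_lane(uint8_t s, uint8_t sa, uint8_t d, uint8_t da) {
        int both = s * (255 - da) + d * (255 - sa);
        int dark = 2 * s * d + both;
        int lite = sa * da - 2 * (da - d) * (sa - s) + both;
        return uint8_t(((2 * s < sa) ? dark : lite) / 255);
    }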