SkNx: add Sk4u

author mtklein <mtklein@chromium.org>

Fri, 29 Jul 2016 17:10:15 +0000 (10:10 -0700)

committer Commit bot <commit-bot@chromium.org>

Fri, 29 Jul 2016 17:10:15 +0000 (10:10 -0700)
author mtklein <mtklein@chromium.org>
Fri, 29 Jul 2016 17:10:15 +0000 (10:10 -0700)
committer Commit bot <commit-bot@chromium.org>
Fri, 29 Jul 2016 17:10:15 +0000 (10:10 -0700)
diff --git a/src/core/SkNx.h b/src/core/SkNx.h

index 308addd..881a475 100644 (file)
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -293,7 +293,8 @@ typedef SkNx<4,  uint16_t> Sk4h;
  typedef SkNx<8,  uint16_t> Sk8h;
  typedef SkNx<16, uint16_t> Sk16h;
  
-typedef SkNx<4,       int> Sk4i;
+typedef SkNx<4,  int32_t> Sk4i;
+typedef SkNx<4, uint32_t> Sk4u;
  
  // Include platform specific specializations if available.
  #if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp

index a27abbd..f3fc76e 100644 (file)
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -297,8 +297,7 @@ static void SK_VECTORCALL load_d_srgb(SkRasterPipeline::Stage* st, size_t x,
             sk_linear_from_srgb[(ptr[2] >> SK_B32_SHIFT) & 0xff],
             sk_linear_from_srgb[(ptr[3] >> SK_B32_SHIFT) & 0xff] };
  
-    // TODO: this >> doesn't really need mask if we make it logical instead of arithmetic.
-    da = SkNx_cast<float>((Sk4i::Load(ptr) >> SK_A32_SHIFT) & 0xff) * (1/255.0f);
+    da = SkNx_cast<float>(Sk4u::Load(ptr) >> SK_A32_SHIFT) * (1/255.0f);
  
      st->next(x, r,g,b,a, dr,dg,db,da);
  }
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h

index 23567b7..eea6800 100644 (file)
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -361,15 +361,15 @@ public:
  };
  
  template <>
-class SkNx<4, int> {
+class SkNx<4, int32_t> {
  public:
      SkNx(const int32x4_t& vec) : fVec(vec) {}
  
      SkNx() {}
-    SkNx(int v) {
+    SkNx(int32_t v) {
          fVec = vdupq_n_s32(v);
      }
-    SkNx(int a, int b, int c, int d) {
+    SkNx(int32_t a, int32_t b, int32_t c, int32_t d) {
          fVec = (int32x4_t){a,b,c,d};
      }
      static SkNx Load(const void* ptr) {
@@ -378,9 +378,9 @@ public:
      void store(void* ptr) const {
          return vst1q_s32((int32_t*)ptr, fVec);
      }
-    int operator[](int k) const {
+    int32_t operator[](int k) const {
          SkASSERT(0 <= k && k < 4);
-        union { int32x4_t v; int is[4]; } pun = {fVec};
+        union { int32x4_t v; int32_t is[4]; } pun = {fVec};
          return pun.is[k&3];
      }
  
@@ -415,17 +415,69 @@ public:
      int32x4_t fVec;
  };
  
+template <>
+class SkNx<4, uint32_t> {
+public:
+    SkNx(const uint32x4_t& vec) : fVec(vec) {}
+
+    SkNx() {}
+    SkNx(uint32_t v) {
+        fVec = vdupq_n_u32(v);
+    }
+    SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+        fVec = (uint32x4_t){a,b,c,d};
+    }
+    static SkNx Load(const void* ptr) {
+        return vld1q_u32((const uint32_t*)ptr);
+    }
+    void store(void* ptr) const {
+        return vst1q_u32((uint32_t*)ptr, fVec);
+    }
+    uint32_t operator[](int k) const {
+        SkASSERT(0 <= k && k < 4);
+        union { uint32x4_t v; uint32_t us[4]; } pun = {fVec};
+        return pun.us[k&3];
+    }
+
+    SkNx operator + (const SkNx& o) const { return vaddq_u32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return vsubq_u32(fVec, o.fVec); }
+    SkNx operator * (const SkNx& o) const { return vmulq_u32(fVec, o.fVec); }
+
+    SkNx operator & (const SkNx& o) const { return vandq_u32(fVec, o.fVec); }
+    SkNx operator | (const SkNx& o) const { return vorrq_u32(fVec, o.fVec); }
+    SkNx operator ^ (const SkNx& o) const { return veorq_u32(fVec, o.fVec); }
+
+    SkNx operator << (int bits) const { SHIFT32(vshlq_n_u32, fVec, bits); }
+    SkNx operator >> (int bits) const { SHIFT32(vshrq_n_u32, fVec, bits); }
+
+    SkNx operator == (const SkNx& o) const { return vceqq_u32(fVec, o.fVec); }
+    SkNx operator <  (const SkNx& o) const { return vcltq_u32(fVec, o.fVec); }
+    SkNx operator >  (const SkNx& o) const { return vcgtq_u32(fVec, o.fVec); }
+
+    static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u32(a.fVec, b.fVec); }
+    // TODO as needed
+
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
+        return vbslq_u32(fVec, t.fVec, e.fVec);
+    }
+
+    uint32x4_t fVec;
+};
+
  #undef SHIFT32
  #undef SHIFT16
  #undef SHIFT8
  
-template<> inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
+template<> inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
      return vcvtq_s32_f32(src.fVec);
  
  }
-template<> inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
+template<> inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
      return vcvtq_f32_s32(src.fVec);
  }
+template<> inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
+    return SkNx_cast<float>(Sk4i::Load(&src));
+}
  
  template<> inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
      return vqmovn_u32(vcvtq_u32_f32(src.fVec));
@@ -468,16 +520,16 @@ template<> inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src) {
      return vmovn_u16(vcombine_u16(src.fVec, src.fVec));
  }
  
-template<> inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
+template<> inline Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
      uint16x4_t _16 = vqmovun_s32(src.fVec);
      return vqmovn_u16(vcombine_u16(_16, _16));
  }
  
-template<> inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
+template<> inline Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
      return vreinterpretq_s32_u32(vmovl_u16(src.fVec));
  }
  
-template<> inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
+template<> inline Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
      return vmovn_u32(vreinterpretq_u32_s32(src.fVec));
  }
  
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h

index 3881b54..003b89f 100644 (file)
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -130,14 +130,14 @@ public:
  };
  
  template <>
-class SkNx<4, int> {
+class SkNx<4, int32_t> {
  public:
      SkNx(const __m128i& vec) : fVec(vec) {}
  
      SkNx() {}
-    SkNx(int val) : fVec(_mm_set1_epi32(val)) {}
+    SkNx(int32_t val) : fVec(_mm_set1_epi32(val)) {}
      static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
-    SkNx(int a, int b, int c, int d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
+    SkNx(int32_t a, int32_t b, int32_t c, int32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
  
      void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
  
@@ -161,9 +161,9 @@ public:
      SkNx operator  < (const SkNx& o) const { return _mm_cmplt_epi32 (fVec, o.fVec); }
      SkNx operator  > (const SkNx& o) const { return _mm_cmpgt_epi32 (fVec, o.fVec); }
  
-    int operator[](int k) const {
+    int32_t operator[](int k) const {
          SkASSERT(0 <= k && k < 4);
-        union { __m128i v; int is[4]; } pun = {fVec};
+        union { __m128i v; int32_t is[4]; } pun = {fVec};
          return pun.is[k&3];
      }
  
@@ -180,6 +180,51 @@ public:
  };
  
  template <>
+class SkNx<4, uint32_t> {
+public:
+    SkNx(const __m128i& vec) : fVec(vec) {}
+
+    SkNx() {}
+    SkNx(uint32_t val) : fVec(_mm_set1_epi32(val)) {}
+    static SkNx Load(const void* ptr) { return _mm_loadu_si128((const __m128i*)ptr); }
+    SkNx(uint32_t a, uint32_t b, uint32_t c, uint32_t d) : fVec(_mm_setr_epi32(a,b,c,d)) {}
+
+    void store(void* ptr) const { _mm_storeu_si128((__m128i*)ptr, fVec); }
+
+    SkNx operator + (const SkNx& o) const { return _mm_add_epi32(fVec, o.fVec); }
+    SkNx operator - (const SkNx& o) const { return _mm_sub_epi32(fVec, o.fVec); }
+    // Not quite sure how to best do operator * in SSE2.  We probably don't use it.
+
+    SkNx operator & (const SkNx& o) const { return _mm_and_si128(fVec, o.fVec); }
+    SkNx operator | (const SkNx& o) const { return _mm_or_si128(fVec, o.fVec); }
+    SkNx operator ^ (const SkNx& o) const { return _mm_xor_si128(fVec, o.fVec); }
+
+    SkNx operator << (int bits) const { return _mm_slli_epi32(fVec, bits); }
+    SkNx operator >> (int bits) const { return _mm_srli_epi32(fVec, bits); }
+
+    SkNx operator == (const SkNx& o) const { return _mm_cmpeq_epi32 (fVec, o.fVec); }
+    // operator < and > take a little extra fiddling to make work for unsigned ints.
+
+    uint32_t operator[](int k) const {
+        SkASSERT(0 <= k && k < 4);
+        union { __m128i v; uint32_t us[4]; } pun = {fVec};
+        return pun.us[k&3];
+    }
+
+    SkNx thenElse(const SkNx& t, const SkNx& e) const {
+    #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
+        return _mm_blendv_epi8(e.fVec, t.fVec, fVec);
+    #else
+        return _mm_or_si128(_mm_and_si128   (fVec, t.fVec),
+                            _mm_andnot_si128(fVec, e.fVec));
+    #endif
+    }
+
+    __m128i fVec;
+};
+
+
+template <>
  class SkNx<4, uint16_t> {
  public:
      SkNx(const __m128i& vec) : fVec(vec) {}
@@ -315,15 +360,18 @@ public:
      __m128i fVec;
  };
  
-template<> /*static*/ inline Sk4f SkNx_cast<float, int>(const Sk4i& src) {
+template<> /*static*/ inline Sk4f SkNx_cast<float, int32_t>(const Sk4i& src) {
      return _mm_cvtepi32_ps(src.fVec);
  }
+template<> /*static*/ inline Sk4f SkNx_cast<float, uint32_t>(const Sk4u& src) {
+    return SkNx_cast<float>(Sk4i::Load(&src));
+}
  
-template <> /*static*/ inline Sk4i SkNx_cast<int, float>(const Sk4f& src) {
+template <> /*static*/ inline Sk4i SkNx_cast<int32_t, float>(const Sk4f& src) {
      return _mm_cvttps_epi32(src.fVec);
  }
  
-template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
+template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int32_t>(const Sk4i& src) {
  #if 0 && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
      // TODO: This seems to be causing code generation problems.   Investigate?
      return _mm_packus_epi32(src.fVec);
@@ -339,7 +387,7 @@ template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, int>(const Sk4i& src) {
  }
  
  template<> /*static*/ inline Sk4h SkNx_cast<uint16_t, float>(const Sk4f& src) {
-    return SkNx_cast<uint16_t>(SkNx_cast<int>(src));
+    return SkNx_cast<uint16_t>(SkNx_cast<int32_t>(src));
  }
  
  template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, float>(const Sk4f& src) {
@@ -391,11 +439,11 @@ template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, uint16_t>(const Sk4h& src)
      return _mm_packus_epi16(src.fVec, src.fVec);
  }
  
-template<> /*static*/ inline Sk4i SkNx_cast<int, uint16_t>(const Sk4h& src) {
+template<> /*static*/ inline Sk4i SkNx_cast<int32_t, uint16_t>(const Sk4h& src) {
      return _mm_unpacklo_epi16(src.fVec, _mm_setzero_si128());
  }
  
-template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int>(const Sk4i& src) {
+template<> /*static*/ inline Sk4b SkNx_cast<uint8_t, int32_t>(const Sk4i& src) {
      return _mm_packus_epi16(_mm_packus_epi16(src.fVec, src.fVec), src.fVec);
  }
author	mtklein <mtklein@chromium.org>
	Fri, 29 Jul 2016 17:10:15 +0000 (10:10 -0700)
committer	Commit bot <commit-bot@chromium.org>
	Fri, 29 Jul 2016 17:10:15 +0000 (10:10 -0700)
src/core/SkNx.h		patch \| blob \| history
src/core/SkRasterPipelineBlitter.cpp		patch \| blob \| history
src/opts/SkNx_neon.h		patch \| blob \| history
src/opts/SkNx_sse.h		patch \| blob \| history