Add Sk4h_load4 for loading F16.

author mtklein <mtklein@chromium.org>

Tue, 26 Jul 2016 15:01:19 +0000 (08:01 -0700)

committer Commit bot <commit-bot@chromium.org>

Tue, 26 Jul 2016 15:01:20 +0000 (08:01 -0700)
author mtklein <mtklein@chromium.org>
Tue, 26 Jul 2016 15:01:19 +0000 (08:01 -0700)
committer Commit bot <commit-bot@chromium.org>
Tue, 26 Jul 2016 15:01:20 +0000 (08:01 -0700)
diff --git a/src/core/SkHalf.h b/src/core/SkHalf.h

index adf8d3a..bc9dd79 100644 (file)
--- a/src/core/SkHalf.h
+++ b/src/core/SkHalf.h
@@ -37,19 +37,18 @@ static inline Sk4h SkFloatToHalf_finite(const Sk4f&);
  
  // GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.
  
-static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
+static inline Sk4f SkHalfToFloat_finite(const Sk4h& hs) {
  #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
      float32x4_t fs;
-    asm ("fmov  %d[fs], %[hs]        \n"   // vcreate_f16(hs)
-         "fcvtl %[fs].4s, %[fs].4h   \n"   // vcvt_f32_f16(...)
+    asm ("fcvtl %[fs].4s, %[hs].4h   \n"   // vcvt_f32_f16(...)
          : [fs] "=w" (fs)                   // =w: write-only NEON register
-        : [hs] "r" (hs));                  //  r: read-only 64-bit general register
+        : [hs] "w" (hs.fVec));             //  w: read-only NEON register
      return fs;
  #else
-    Sk4i bits      = SkNx_cast<int>(Sk4h::Load(&hs)),   // Expand to 32 bit.
-         sign      = bits & 0x00008000,                 // Save the sign bit for later...
-         positive  = bits ^ sign,                       // ...but strip it off for now.
-         is_denorm = positive < (1<<10);                // Exponent == 0?
+    Sk4i bits      = SkNx_cast<int>(hs),   // Expand to 32 bit.
+         sign      = bits & 0x00008000,    // Save the sign bit for later...
+         positive  = bits ^ sign,          // ...but strip it off for now.
+         is_denorm = positive < (1<<10);   // Exponent == 0?
  
      // For normal half floats, extend the mantissa by 13 zero bits,
      // then adjust the exponent from 15 bias to 127 bias.
@@ -66,6 +65,10 @@ static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {
  #endif
  }
  
+static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {  // Overload for callers that hold the 64 bits in a uint64_t.
+    return SkHalfToFloat_finite(Sk4h::Load(&hs));  // Load those 64 bits as an Sk4h and defer to the Sk4h overload.
+}
+
  static inline Sk4h SkFloatToHalf_finite(const Sk4f& fs) {
  #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)
      float32x4_t vec = fs.fVec;
diff --git a/src/core/SkNx.h b/src/core/SkNx.h

index 253fcf2..308addd 100644 (file)
--- a/src/core/SkNx.h
+++ b/src/core/SkNx.h
@@ -309,6 +309,19 @@ SI Sk4i Sk4f_round(const Sk4f& x) {
               (int) lrintf (x[3]), };
  }
  
+// Load 4 Sk4h and transpose them (256 bits total): *r gets lane 0 of each loaded vector, *g lane 1, *b lane 2, *a lane 3.
+SI void Sk4h_load4(const void* vptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {
+    const uint64_t* ptr = (const uint64_t*)vptr;  // Step through the input 64 bits (one Sk4h) at a time.
+    auto p0 = Sk4h::Load(ptr+0),
+         p1 = Sk4h::Load(ptr+1),
+         p2 = Sk4h::Load(ptr+2),
+         p3 = Sk4h::Load(ptr+3);
+    *r = { p0[0], p1[0], p2[0], p3[0] };  // Lane 0 of every loaded vector.
+    *g = { p0[1], p1[1], p2[1], p3[1] };  // Lane 1.
+    *b = { p0[2], p1[2], p2[2], p3[2] };  // Lane 2.
+    *a = { p0[3], p1[3], p2[3], p3[3] };  // Lane 3.
+}
+
  // Transpose 4 Sk4h and store (256 bits total).
  SI void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b, const Sk4h& a) {
      uint64_t* dst64 = (uint64_t*) dst;
diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp

index 6518a88..a27abbd 100644 (file)
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -231,15 +231,13 @@ static void SK_VECTORCALL load_d_f16(SkRasterPipeline::Stage* st, size_t x,
                                       Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
      auto ptr = st->ctx<const uint64_t*>() + x;
  
-    // TODO: This can be made a lot more efficient with platform-specific code.
-    auto p0 = SkHalfToFloat_finite(ptr[0]),
-         p1 = SkHalfToFloat_finite(ptr[1]),
-         p2 = SkHalfToFloat_finite(ptr[2]),
-         p3 = SkHalfToFloat_finite(ptr[3]);
-    dr = { p0[0], p1[0], p2[0], p3[0] };
-    dg = { p0[1], p1[1], p2[1], p3[1] };
-    db = { p0[2], p1[2], p2[2], p3[2] };
-    da = { p0[3], p1[3], p2[3], p3[3] };
+    Sk4h rh, gh, bh, ah;
+    Sk4h_load4(ptr, &rh, &gh, &bh, &ah);
+
+    dr = SkHalfToFloat_finite(rh);
+    dg = SkHalfToFloat_finite(gh);
+    db = SkHalfToFloat_finite(bh);
+    da = SkHalfToFloat_finite(ah);
  
      st->next(x, r,g,b,a, dr,dg,db,da);
  }
diff --git a/src/opts/SkNx_neon.h b/src/opts/SkNx_neon.h

index 53e95eb..df11de3 100644 (file)
--- a/src/opts/SkNx_neon.h
+++ b/src/opts/SkNx_neon.h
@@ -483,6 +483,14 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
      return vcvtq_s32_f32((x + 0.5f).fVec);
  }
  
+static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {  // NEON version: load and transpose in one instruction.
+    uint16x4x4_t rgba = vld4_u16((const uint16_t*)ptr);  // De-interleaving load: val[i] holds input elements i, i+4, i+8, i+12.
+    *r = rgba.val[0];
+    *g = rgba.val[1];
+    *b = rgba.val[2];
+    *a = rgba.val[3];
+}
+
  static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
                                 const Sk4h& a) {
      uint16x4x4_t rgba = {{
diff --git a/src/opts/SkNx_sse.h b/src/opts/SkNx_sse.h

index c0e4828..3881b54 100644 (file)
--- a/src/opts/SkNx_sse.h
+++ b/src/opts/SkNx_sse.h
@@ -403,6 +403,19 @@ static inline Sk4i Sk4f_round(const Sk4f& x) {
      return _mm_cvtps_epi32(x.fVec);
  }
  
+static inline void Sk4h_load4(const void* ptr, Sk4h* r, Sk4h* g, Sk4h* b, Sk4h* a) {  // SSE version: two unaligned loads, then a 4x4 transpose of 16-bit lanes.
+    __m128i lo = _mm_loadu_si128(((__m128i*)ptr) + 0),  // r0 g0 b0 a0 r1 g1 b1 a1
+            hi = _mm_loadu_si128(((__m128i*)ptr) + 1);  // r2 g2 b2 a2 r3 g3 b3 a3
+    __m128i even = _mm_unpacklo_epi16(lo, hi),   // r0 r2 g0 g2 b0 b2 a0 a2
+             odd = _mm_unpackhi_epi16(lo, hi);   // r1 r3 g1 g3 b1 b3 a1 a3
+    __m128i rg = _mm_unpacklo_epi16(even, odd),  // r0 r1 r2 r3 g0 g1 g2 g3
+            ba = _mm_unpackhi_epi16(even, odd);  // b0 b1 b2 b3 a0 a1 a2 a3
+    *r = rg;                                     // r0..r3 already sit in the low four lanes.
+    *g = _mm_srli_si128(rg, 8);                  // Byte-shift right by 8 to bring g0..g3 down to the low four lanes.
+    *b = ba;                                     // b0..b3 already sit in the low four lanes.
+    *a = _mm_srli_si128(ba, 8);                  // Same 8-byte shift for a0..a3.
+}
+
  static inline void Sk4h_store4(void* dst, const Sk4h& r, const Sk4h& g, const Sk4h& b,
                                 const Sk4h& a) {
      __m128i rg = _mm_unpacklo_epi16(r.fVec, g.fVec);
author	mtklein <mtklein@chromium.org>
	Tue, 26 Jul 2016 15:01:19 +0000 (08:01 -0700)
committer	Commit bot <commit-bot@chromium.org>
	Tue, 26 Jul 2016 15:01:20 +0000 (08:01 -0700)
src/core/SkHalf.h		patch \| blob \| history
src/core/SkNx.h		patch \| blob \| history
src/core/SkRasterPipelineBlitter.cpp		patch \| blob \| history
src/opts/SkNx_neon.h		patch \| blob \| history
src/opts/SkNx_sse.h		patch \| blob \| history