SkSplicer: begin on sse2/sse4.1 support

author Mike Klein <mtklein@chromium.org>

Wed, 8 Feb 2017 17:50:17 +0000 (12:50 -0500)

committer Skia Commit-Bot <skia-commit-bot@chromium.org>

Wed, 8 Feb 2017 21:07:05 +0000 (21:07 +0000)
author Mike Klein <mtklein@chromium.org>
Wed, 8 Feb 2017 17:50:17 +0000 (12:50 -0500)
committer Skia Commit-Bot <skia-commit-bot@chromium.org>
Wed, 8 Feb 2017 21:07:05 +0000 (21:07 +0000)
diff --git a/src/splicer/SkSplicer.cpp b/src/splicer/SkSplicer.cpp

index 6a0fc3c..cba7b2a 100644 (file)
--- a/src/splicer/SkSplicer.cpp
+++ b/src/splicer/SkSplicer.cpp
@@ -41,6 +41,7 @@ namespace {
          1.0f, 255.0f, 1/255.0f, 0x000000ff,
          0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,       // from_srgb
          12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   //   to_srgb
+        0x77800000, 0x07800000,                            // fp16 <-> fp32
      };
  
      // We do this a lot, so it's nice to infer the correct size.  Works fine with arrays.
diff --git a/src/splicer/SkSplicer_shared.h b/src/splicer/SkSplicer_shared.h

index 6a8f14c..0ad0a09 100644 (file)
--- a/src/splicer/SkSplicer_shared.h
+++ b/src/splicer/SkSplicer_shared.h
@@ -38,6 +38,10 @@ struct SkSplicer_constants {
      float    _0689206;     //  0.689206f
      float   n_00988;       // -0.0988f
      float    _00043;       //  0.0043f
+
+    // fp16 <-> fp32
+    uint32_t _0x77800000;
+    uint32_t _0x07800000;
  };
  
  #endif//SkSplicer_shared_DEFINED
diff --git a/src/splicer/SkSplicer_stages.cpp b/src/splicer/SkSplicer_stages.cpp

index 9c5a442..bff58c2 100644 (file)
--- a/src/splicer/SkSplicer_stages.cpp
+++ b/src/splicer/SkSplicer_stages.cpp
@@ -25,14 +25,15 @@ using K = const SkSplicer_constants;
      using U8  = uint8_t  __attribute__((ext_vector_type(4)));
  
      // We polyfill a few routines that Clang doesn't build into ext_vector_types.
+    static F   fma(F f, F m, F a)                   { return vfmaq_f32(a,f,m);        }
      static F   min(F a, F b)                        { return vminq_f32(a,b);          }
      static F   max(F a, F b)                        { return vmaxq_f32(a,b);          }
-    static F   fma(F f, F m, F a)                   { return vfmaq_f32(a,f,m);        }
      static F   rcp  (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
      static F   rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
-    static F   if_then_else(I32 c, F t, F e)        { return vbslq_f32((U32)c,t,e);   }
      static U32 round(F v, F scale)                  { return vcvtnq_u32_f32(v*scale); }
  
+    static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+
      static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
  #elif defined(__ARM_NEON__)
      #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
@@ -46,19 +47,17 @@ using K = const SkSplicer_constants;
      using U32 = uint32_t __attribute__((ext_vector_type(2)));
      using U8  = uint8_t  __attribute__((ext_vector_type(2)));
  
-    static F   min(F a, F b)                        { return vmin_f32(a,b);          }
-    static F   max(F a, F b)                        { return vmax_f32(a,b);          }
-    static F   fma(F f, F m, F a)                   { return vfma_f32(a,f,m);        }
-    static F   rcp  (F v)  { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
-    static F   rsqrt(F v)  { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
-    static F   if_then_else(I32 c, F t, F e)        { return vbsl_f32((U32)c,t,e);   }
-    static U32 round(F v, F scale)                  { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+    static F   fma(F f, F m, F a)                  { return vfma_f32(a,f,m);        }
+    static F   min(F a, F b)                       { return vmin_f32(a,b);          }
+    static F   max(F a, F b)                       { return vmax_f32(a,b);          }
+    static F   rcp  (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
+    static F   rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+    static U32 round(F v, F scale)                 { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+
+    static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
  
      static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
-#else
-    #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
-        #error On x86, compile with -mavx2 -mfma -mf16c.
-    #endif
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
      #include <immintrin.h>
  
      // These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -67,15 +66,40 @@ using K = const SkSplicer_constants;
      using U32 = uint32_t __attribute__((ext_vector_type(8)));
      using U8  = uint8_t  __attribute__((ext_vector_type(8)));
  
-    static F   min(F a, F b)                 { return _mm256_min_ps  (a,b);  }
-    static F   max(F a, F b)                 { return _mm256_max_ps  (a,b);  }
-    static F   fma(F f, F m, F a)            { return _mm256_fmadd_ps(f,m,a);}
-    static F   rcp  (F v)                    { return _mm256_rcp_ps     (v); }
-    static F   rsqrt(F v)                    { return _mm256_rsqrt_ps   (v); }
-    static F   if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
-    static U32 round(F v, F scale)           { return _mm256_cvtps_epi32(v*scale); }
+    static F   fma(F f, F m, F a)  { return _mm256_fmadd_ps(f,m,a);}
+    static F   min(F a, F b)       { return _mm256_min_ps(a,b);    }
+    static F   max(F a, F b)       { return _mm256_max_ps(a,b);    }
+    static F   rcp  (F v)          { return _mm256_rcp_ps  (v);    }
+    static F   rsqrt(F v)          { return _mm256_rsqrt_ps(v);    }
+    static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
  
      static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
+#elif defined(__SSE2__)
+    #include <immintrin.h>
+
+    using F   = float    __attribute__((ext_vector_type(4)));
+    using I32 =  int32_t __attribute__((ext_vector_type(4)));
+    using U32 = uint32_t __attribute__((ext_vector_type(4)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(4)));
+
+    static F   fma(F f, F m, F a)  { return f*m+a;           }
+    static F   min(F a, F b)       { return _mm_min_ps(a,b); }
+    static F   max(F a, F b)       { return _mm_max_ps(a,b); }
+    static F   rcp  (F v)          { return _mm_rcp_ps  (v); }
+    static F   rsqrt(F v)          { return _mm_rsqrt_ps(v); }
+    static U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) {
+    #if defined(__SSE4_1__)
+        return _mm_blendv_ps(e,t,c);
+    #else
+        return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
+    #endif
+    }
+
+    static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
  #endif
  
  static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F);   }
@@ -310,7 +334,7 @@ STAGE(load_f16) {
      g = {ga[0], ga[2]};
      b = {rb[1], rb[3]};
      a = {ga[1], ga[3]};
-#else
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
      auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
           _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
           _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
@@ -330,6 +354,25 @@ STAGE(load_f16) {
      g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
      b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
      a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#elif defined(__SSE2__)
+    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+
+    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
+         _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3
+
+    auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
+         ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
+
+    auto half_to_float = [&](U32 h) {
+        return (F)(h << 13)             // Line up the mantissa,
+             * (F)U32(k->_0x77800000);  // then fix up the exponent.
+    };
+
+    r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
+    g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128()));
+    b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128()));
+    a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128()));
  #endif
  }
  
@@ -350,7 +393,7 @@ STAGE(store_f16) {
          vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
      }};
      vst2_f16((float16_t*)ptr, rb_ga);
-#else
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
      auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
           G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
           B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
@@ -365,6 +408,19 @@ STAGE(store_f16) {
      _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
      _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
      _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#elif defined(__SSE2__)
+    auto float_to_half = [&](F f) {
+        return (U32)(f * (F)U32(k->_0x07800000)) // Fix up the exponent,
+            >> 13;                               // then line up the mantissa.
+    };
+    U32 R = float_to_half(r),
+        G = float_to_half(g),
+        B = float_to_half(b),
+        A = float_to_half(a);
+    U32 rg = R | _mm_slli_si128(G,2),
+        ba = B | _mm_slli_si128(A,2);
+    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
+    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
  #endif
  }
  
diff --git a/src/splicer/build_stages.py b/src/splicer/build_stages.py

index 900b47f..a4fd97d 100755 (executable)
--- a/src/splicer/build_stages.py
+++ b/src/splicer/build_stages.py
@@ -17,6 +17,17 @@ objdump = 'gobjdump'
  
  cflags = '-std=c++11 -Os -fomit-frame-pointer'.split()
  
+sse2 = '-msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
+subprocess.check_call(['clang++'] + cflags + sse2 +
+                      ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+                      ['-o', 'sse2.o'])
+
+sse41 = '-msse4.1'.split()
+subprocess.check_call(['clang++'] + cflags + sse41 +
+                      ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
+                      ['-o', 'sse41.o'])
+
+
  hsw = '-mavx2 -mfma -mf16c'.split()
  subprocess.check_call(['clang++'] + cflags + hsw +
                        ['-c', 'src/splicer/SkSplicer_stages.cpp'] +
author	Mike Klein <mtklein@chromium.org>
	Wed, 8 Feb 2017 17:50:17 +0000 (12:50 -0500)
committer	Skia Commit-Bot <skia-commit-bot@chromium.org>
	Wed, 8 Feb 2017 21:07:05 +0000 (21:07 +0000)
src/splicer/SkSplicer.cpp		patch \| blob \| history
src/splicer/SkSplicer_shared.h		patch \| blob \| history
src/splicer/SkSplicer_stages.cpp		patch \| blob \| history
src/splicer/build_stages.py		patch \| blob \| history