Replace _mm_cvtps_epi32(x) with _mm_cvttps_epi32(_mm_add_ps(0.5f), x).

author mtklein <mtklein@chromium.org>

Mon, 23 Mar 2015 19:01:45 +0000 (12:01 -0700)

committer Commit bot <commit-bot@chromium.org>

Mon, 23 Mar 2015 19:01:46 +0000 (12:01 -0700)
author mtklein <mtklein@chromium.org>
Mon, 23 Mar 2015 19:01:45 +0000 (12:01 -0700)
committer Commit bot <commit-bot@chromium.org>
Mon, 23 Mar 2015 19:01:46 +0000 (12:01 -0700)
diff --git a/src/opts/Sk4x_sse.h b/src/opts/Sk4x_sse.h

index ab6876dfd7c812d1fb56f5eb5b9585558b917b5d..a923a7404d34f7e1d19607db700ce5e7e958ca39 100644 (file)
--- a/src/opts/Sk4x_sse.h
+++ b/src/opts/Sk4x_sse.h
@@ -86,11 +86,10 @@ M(Sk4f) LoadAligned(const float fs[4]) { return _mm_load_ps (fs); }
  M(void) store       (float fs[4]) const { _mm_storeu_ps(fs, fVec); }
  M(void) storeAligned(float fs[4]) const { _mm_store_ps (fs, fVec); }
  
-template <>
-M(Sk4i) reinterpret<Sk4i>() const { return as_4i(fVec); }
+template <> M(Sk4i) reinterpret<Sk4i>() const { return as_4i(fVec); }
  
-template <>
-M(Sk4i) cast<Sk4i>() const { return _mm_cvtps_epi32(fVec); }
+// cvttps truncates, same as (int) when positive.
+template <> M(Sk4i) cast<Sk4i>() const { return _mm_cvttps_epi32(fVec); }
  
  // We're going to try a little experiment here and skip allTrue(), anyTrue(), and bit-manipulators
  // for Sk4f.  Code that calls them probably does so accidentally.
diff --git a/src/opts/SkPMFloat_SSE2.h b/src/opts/SkPMFloat_SSE2.h

index 9c1295b56555e5f6e5b32fd5ff7f16e6066d2843..2a85b1a74fcf82be61d4dd66706711c62e08405d 100644 (file)
--- a/src/opts/SkPMFloat_SSE2.h
+++ b/src/opts/SkPMFloat_SSE2.h
@@ -25,7 +25,8 @@ inline SkPMColor SkPMFloat::get() const {
  }
  
  inline SkPMColor SkPMFloat::clamped() const {
-    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
+    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)),
              fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
              fix8    = _mm_packus_epi16(fix8_16, fix8_16);
      SkPMColor c = _mm_cvtsi128_si32(fix8);
@@ -47,10 +48,11 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
  
  inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
      // Same as _SSSE3.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    __m128i c0 = _mm_cvtps_epi32(floats[0].fColors),  // _mm_cvtps_epi32 rounds for us!
-            c1 = _mm_cvtps_epi32(floats[1].fColors),
-            c2 = _mm_cvtps_epi32(floats[2].fColors),
-            c3 = _mm_cvtps_epi32(floats[3].fColors);
+    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)),
+            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)),
+            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)),
+            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors));
      __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                       _mm_packus_epi16(c2, c3));
      _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/src/opts/SkPMFloat_SSSE3.h b/src/opts/SkPMFloat_SSSE3.h

index b2c2b23429ef68da52b090e5e86e1c263577679b..ab54caf3d41acd80e64daf16ef83952a7d78567c 100644 (file)
--- a/src/opts/SkPMFloat_SSSE3.h
+++ b/src/opts/SkPMFloat_SSSE3.h
@@ -23,7 +23,8 @@ inline SkPMFloat::SkPMFloat(SkPMColor c) {
  inline SkPMColor SkPMFloat::get() const {
      SkASSERT(this->isValid());
      const int _ = 255;  // _ means to zero that byte.
-    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
+    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)),
              fix8    = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 12,8,4,0));
      SkPMColor c = _mm_cvtsi128_si32(fix8);
      SkPMColorAssert(c);
@@ -31,7 +32,8 @@ inline SkPMColor SkPMFloat::get() const {
  }
  
  inline SkPMColor SkPMFloat::clamped() const {
-    __m128i fix8_32 = _mm_cvtps_epi32(fColors),  // _mm_cvtps_epi32 rounds for us!
+    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+    __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)),
              fix8_16 = _mm_packus_epi16(fix8_32, fix8_32),
              fix8    = _mm_packus_epi16(fix8_16, fix8_16);
      SkPMColor c = _mm_cvtsi128_si32(fix8);
@@ -51,10 +53,11 @@ inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
  
  inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat floats[4]) {
      // Same as _SSE2.h's.  We use 3 _mm_packus_epi16() where the naive loop uses 8.
-    __m128i c0 = _mm_cvtps_epi32(floats[0].fColors),  // _mm_cvtps_epi32 rounds for us!
-            c1 = _mm_cvtps_epi32(floats[1].fColors),
-            c2 = _mm_cvtps_epi32(floats[2].fColors),
-            c3 = _mm_cvtps_epi32(floats[3].fColors);
+    // We don't use _mm_cvtps_epi32, because we want precise control over how 0.5 rounds (up).
+    __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColors)),
+            c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColors)),
+            c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColors)),
+            c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColors));
      __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1),
                                       _mm_packus_epi16(c2, c3));
      _mm_storeu_si128((__m128i*)colors, c3210);
diff --git a/tests/PMFloatTest.cpp b/tests/PMFloatTest.cpp

index c3d5e48b4dd6e5eec107d208947d442043b11e45..5b53fc659f6f937958a066b3d31b91ea3c7631e2 100644 (file)
--- a/tests/PMFloatTest.cpp
+++ b/tests/PMFloatTest.cpp
@@ -11,8 +11,8 @@ DEF_TEST(SkPMFloat, r) {
      REPORTER_ASSERT(r, SkScalarNearlyEqual( 51.0f, pmf.b()));
      REPORTER_ASSERT(r, c == pmf.get());
  
-    // Test rounding.  (Don't bother testing .5... we don't care which way it goes.)
-    pmf = SkPMFloat(254.6f, 204.3f, 153.1f, 50.8f);
+    // Test rounding.
+    pmf = SkPMFloat(254.5f, 203.5f, 153.1f, 50.8f);
      REPORTER_ASSERT(r, c == pmf.get());
  
      // Test clamping.
diff --git a/tests/Sk4xTest.cpp b/tests/Sk4xTest.cpp

index d7a016c3c2406a500579303916863e0551385e46..8c3b977b5bf5f609f5df4fe4335ab71a0ef02ec6 100644 (file)
--- a/tests/Sk4xTest.cpp
+++ b/tests/Sk4xTest.cpp
@@ -59,6 +59,9 @@ DEF_TEST(Sk4x_Conversions, r) {
      ASSERT_NE(twoi, twof.reinterpret<Sk4i>());
      ASSERT_EQ(twof, twoi.cast<Sk4f>());
      ASSERT_NE(twof, twoi.reinterpret<Sk4f>());
+
+    ASSERT_EQ(Sk4i(0,0,0,0), Sk4f(0.5f, 0.49f, 0.51f, 0.99f).cast<Sk4i>());
+    ASSERT_EQ(Sk4i(1,1,1,1), Sk4f(1.5f, 1.49f, 1.51f, 1.99f).cast<Sk4i>());
  }
  
  DEF_TEST(Sk4x_Bits, r) {
author	mtklein <mtklein@chromium.org>
	Mon, 23 Mar 2015 19:01:45 +0000 (12:01 -0700)
committer	Commit bot <commit-bot@chromium.org>
	Mon, 23 Mar 2015 19:01:46 +0000 (12:01 -0700)
src/opts/Sk4x_sse.h		patch \| blob \| history
src/opts/SkPMFloat_SSE2.h		patch \| blob \| history
src/opts/SkPMFloat_SSSE3.h		patch \| blob \| history
tests/PMFloatTest.cpp		patch \| blob \| history
tests/Sk4xTest.cpp		patch \| blob \| history