From b5acf6e702e318b405a04d38bfdac602dc3ed773 Mon Sep 17 00:00:00 2001
From: mtklein <mtklein@chromium.org>
Date: Mon, 25 Jul 2016 06:13:47 -0700
Subject: [PATCH] Add a clamp stage to SkRasterPipelineBlitter.

This clamps to [0,1] premul just before every store to memory.

By making the clamp a stage itself, this design makes it easy to move the clamp
around, to replace it with a debug-only assert-we're-clamped stage for certain
formats, clamp in more places, programatically not clamp, etc. etc.

Before this change, clamping was a little haphazard: store_srgb clamped
R, G and B to [0,1], but not A, and didn't clamp the colors to A.  565
didn't clamp at all.

6 GMs draw subtly differently in sRGB, I think because we've started clamping
colors to alpha to enforce premultiplication better.  No changes for 565.

My hope is that now no other stage need ever concern itself with clamping.

So we don't double-clamp, I've added a _noclamp version of sk_linear_to_srgb()
that simply asserts a clamp isn't necessary.  This happens to expose the Sk4f
_needs_trunc version that might be useful for power users (*cough* Matt *cough*).

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2178793002

Review-Url: https://codereview.chromium.org/2178793002
---
 src/core/SkRasterPipelineBlitter.cpp | 33 +++++++++++++++++++++++++++------
 src/core/SkSRGB.h                    | 27 ++++++++++++++++++++-------
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/src/core/SkRasterPipelineBlitter.cpp b/src/core/SkRasterPipelineBlitter.cpp
index 169bb9f..9d89b4d 100644
--- a/src/core/SkRasterPipelineBlitter.cpp
+++ b/src/core/SkRasterPipelineBlitter.cpp
@@ -9,7 +9,6 @@
 #include "SkColor.h"
 #include "SkColorFilter.h"
 #include "SkPM4f.h"
-#include "SkPM4fPriv.h"
 #include "SkRasterPipeline.h"
 #include "SkShader.h"
 #include "SkSRGB.h"
@@ -57,6 +56,22 @@ SkBlitter* SkCreateRasterPipelineBlitter(const SkPixmap& dst,
     return SkRasterPipelineBlitter::Create(dst, paint, alloc);
 }
 
+// Clamp colors into [0,1] premul (e.g. just before storing back to memory).
+static void SK_VECTORCALL clamp_01_premul(SkRasterPipeline::Stage* st, size_t x,
+                                          Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
+                                          Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
+    a = Sk4f::Max(a, 0.0f);
+    r = Sk4f::Max(r, 0.0f);
+    g = Sk4f::Max(g, 0.0f);
+    b = Sk4f::Max(b, 0.0f);
+
+    a = Sk4f::Min(a, 1.0f);
+    r = Sk4f::Min(r, a);
+    g = Sk4f::Min(g, a);
+    b = Sk4f::Min(b, a);
+
+    st->next(x, r,g,b,a, dr,dg,db,da);
+}
 
 // The default shader produces a constant color (from the SkPaint).
 static void SK_VECTORCALL constant_color(SkRasterPipeline::Stage* st, size_t x,
@@ -255,10 +270,10 @@ static void SK_VECTORCALL store_srgb(SkRasterPipeline::Stage* st, size_t x,
                                      Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                      Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     auto dst = st->ctx<uint32_t*>() + x;
-    ( sk_linear_to_srgb(r) << SK_R32_SHIFT
-    | sk_linear_to_srgb(g) << SK_G32_SHIFT
-    | sk_linear_to_srgb(b) << SK_B32_SHIFT
-    | Sk4f_round(255.0f*a) << SK_A32_SHIFT).store(dst);
+    ( sk_linear_to_srgb_noclamp(r) << SK_R32_SHIFT
+    | sk_linear_to_srgb_noclamp(g) << SK_G32_SHIFT
+    | sk_linear_to_srgb_noclamp(b) << SK_B32_SHIFT
+    |       Sk4f_round(255.0f * a) << SK_A32_SHIFT).store(dst);
 }
 
 // Tail variant of store_srgb() handling 1 pixel at a time.
@@ -266,7 +281,12 @@ static void SK_VECTORCALL store_srgb_1(SkRasterPipeline::Stage* st, size_t x,
                                        Sk4f  r, Sk4f  g, Sk4f  b, Sk4f  a,
                                        Sk4f dr, Sk4f dg, Sk4f db, Sk4f da) {
     auto dst = st->ctx<uint32_t*>() + x;
-    *dst = Sk4f_toS32(swizzle_rb_if_bgra({ r[0], g[0], b[0], a[0] }));
+    Sk4i rgb = sk_linear_to_srgb_noclamp(swizzle_rb_if_bgra({ r[0], g[0], b[0], 0.0f }));
+
+    uint32_t rgba;
+    SkNx_cast<uint8_t>(rgb).store(&rgba);
+    rgba |= (uint32_t)(255.0f * a[0] + 0.5f) << 24;
+    *dst = rgba;
 }
 
 static bool supported(const SkImageInfo& info) {
@@ -343,6 +363,7 @@ void SkRasterPipelineBlitter::append_load_d(SkRasterPipeline* p, const void* dst
 void SkRasterPipelineBlitter::append_store(SkRasterPipeline* p, void* dst) const {
     SkASSERT(supported(fDst.info()));
 
+    p->append(clamp_01_premul);
     switch (fDst.info().colorType()) {
         case kN32_SkColorType:
             if (fDst.info().gammaCloseToSRGB()) {
diff --git a/src/core/SkSRGB.h b/src/core/SkSRGB.h
index 08ba860..31fd4ae 100644
--- a/src/core/SkSRGB.h
+++ b/src/core/SkSRGB.h
@@ -28,14 +28,15 @@ static inline Sk4f sk_clamp_0_255(const Sk4f& x) {
     return Sk4f::Min(Sk4f::Max(x, 0.0f), 255.0f);
 }
 
-static inline Sk4i sk_linear_to_srgb(const Sk4f& x) {
+// This should probably only be called from sk_linear_to_srgb() or sk_linear_to_srgb_noclamp().
+// It generally doesn't make sense to work with sRGB floats.
+static inline Sk4f sk_linear_to_srgb_needs_trunc(const Sk4f& x) {
     // Approximation of the sRGB gamma curve (within 1 when scaled to 8-bit pixels).
     //
-    // Tuned by brute force to minimize the number of bytes that fail to round trip,
-    // here 0 (of 256), and then to minimize the number of points halfway between bytes
-    // (in linear space) that fail to hit the right byte, here 131 (of 255), and to
-    // minimize the number of monotonicity regressions over the range [0,1], here 0.
-
+    // Constants tuned by brute force to minimize (in order of importance) after truncation:
+    //    1) the number of bytes that fail to round trip (0 of 256);
+    //    2) the number of points in [FLT_MIN, 1.0f] that are non-monotonic (0 of ~1 billion);
+    //    3) the number of points halfway between bytes that hit the wrong byte (131 of 255).
     auto rsqrt = x.rsqrt(),
          sqrt  = rsqrt.invert(),
          ftrt  = rsqrt.rsqrt();
@@ -45,8 +46,20 @@ static inline Sk4i sk_linear_to_srgb(const Sk4f& x) {
     auto hi = (-0.0974983f * 255.0f)
             + (+0.687999f  * 255.0f) * sqrt
             + (+0.412999f  * 255.0f) * ftrt;
+    return (x < 0.0048f).thenElse(lo, hi);
+}
+
+static inline Sk4i sk_linear_to_srgb(const Sk4f& x) {
+    Sk4f f = sk_linear_to_srgb_needs_trunc(x);
+    return SkNx_cast<int>(sk_clamp_0_255(f));
+}
 
-    return SkNx_cast<int>(sk_clamp_0_255((x < 0.0048f).thenElse(lo, hi)));
+static inline Sk4i sk_linear_to_srgb_noclamp(const Sk4f& x) {
+    Sk4f f = sk_linear_to_srgb_needs_trunc(x);
+    for (int i = 0; i < 4; i++) {
+        SkASSERTF(0.0f <= f[i] && f[i] < 256.0f, "f[%d] was %g, outside [0,256)\n", i, f[i]);
+    }
+    return SkNx_cast<int>(f);
 }
 
 #endif//SkSRGB_DEFINED
-- 
2.7.4