arithmetic mode with Sk4f
author mtklein <mtklein@chromium.org>
Sun, 10 Apr 2016 13:23:28 +0000 (06:23 -0700)
committer Commit bot <commit-bot@chromium.org>
Sun, 10 Apr 2016 13:23:28 +0000 (06:23 -0700)
After reading the SSE version, I figured I'd show off the new hotness a little.  This will get us SSE, NEON, and portable implementations all in one easy-to-read package.

Since we've been talking about it, it's worth noting the several ways this implementation is still not constant time:
  - short circuits on 0x00 and 0xff coverage;
  - floating-point multiplication with untrusted k1-k4: if someone figures out a clever way to create denormal floats in some cases and not in others, there's a gigantic performance difference.

I would hazard a guess that the pin is constant time now, though.

I've also fixed the lerp to interpolate between dst and r instead of src and r.  The old behavior can't have been right.

curr/maxrss loops min median mean max stddev samples    config bench
   9/9   MB 1 25.5ms 25.5ms 25.5ms 25.5ms 0% ▃▁▁▃▂▇▅▆▇█ 8888 Xfermode_arithmetic_enforce_pm_aa
   9/9   MB 1 24.1ms 24.2ms 24.2ms 24.3ms 0% ▄▃▁▄█▆▆█▃█ 8888 Xfermode_arithmetic_aa
   9/9   MB 1 102ms 102ms 102ms 103ms 0% ▁▅▂▆▂█▂█▁▂ 8888 Xfermode_arithmetic_enforce_pm
   9/9   MB 1 94.8ms 95.4ms 95.2ms 95.8ms 0% ▅▅▁▁▁▁▄▇█▇ 8888 Xfermode_arithmetic

~~~~>

curr/maxrss loops min median mean max stddev samples    config bench
   9/9   MB 1 9.71ms 9.74ms 9.73ms 9.78ms 0% █▅▄▄▁▂▂▂▄▄ 8888 Xfermode_arithmetic_enforce_pm_aa
   9/9   MB 1 9.5ms 9.57ms 9.58ms 9.7ms 1% ▂▁█▅▂▂▆▃▄▄ 8888 Xfermode_arithmetic_aa
   9/9   MB 1 21.8ms 21.8ms 21.8ms 21.9ms 0% █▂▂▂▂▂▂▁▄▂ 8888 Xfermode_arithmetic_enforce_pm
   9/9   MB 1 16.5ms 16.6ms 16.6ms 16.6ms 0% ▃█▁▁▄▄▁▁▆▅ 8888 Xfermode_arithmetic

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1873963003
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review URL: https://codereview.chromium.org/1873963003

src/effects/SkArithmeticMode.cpp

index fbe6358ae4ece2550e28d09f76843da41f87dbca..e926f1bc9ad860d98999f732914cf0be03be2225 100644 (file)
@@ -7,10 +7,11 @@
 
 #include "SkArithmeticMode.h"
 #include "SkColorPriv.h"
+#include "SkNx.h"
 #include "SkReadBuffer.h"
-#include "SkWriteBuffer.h"
 #include "SkString.h"
 #include "SkUnPreMultiply.h"
+#include "SkWriteBuffer.h"
 #if SK_SUPPORT_GPU
 #include "SkArithmeticMode_gpu.h"
 #endif
@@ -63,64 +64,37 @@ sk_sp<SkFlattenable> SkArithmeticMode_scalar::CreateProc(SkReadBuffer& buffer) {
     return SkArithmeticMode::Make(k1, k2, k3, k4, enforcePMColor);
 }
 
-static int pinToByte(int value) {
-    if (value < 0) {
-        value = 0;
-    } else if (value > 255) {
-        value = 255;
-    }
-    return value;
-}
+void SkArithmeticMode_scalar::xfer32(SkPMColor dst[], const SkPMColor src[],
+                                 int count, const SkAlpha aaCoverage[]) const {
+    const Sk4f k1 = fK[0] * (1/255.0f),
+               k2 = fK[1],
+               k3 = fK[2],
+               k4 = fK[3] * 255.0f + 0.5f;
+
+    auto pin = [](float min, const Sk4f& val, float max) {
+        return Sk4f::Max(min, Sk4f::Min(val, max));
+    };
+
+    for (int i = 0; i < count; i++) {
+        if (aaCoverage && aaCoverage[i] == 0) {
+            continue;
+        }
 
-static int arith(SkScalar k1, SkScalar k2, SkScalar k3, SkScalar k4,
-                 int src, int dst) {
-    SkScalar result = SkScalarMul(k1, src * dst) +
-                      SkScalarMul(k2, src) +
-                      SkScalarMul(k3, dst) +
-                      k4;
-    int res = SkScalarRoundToInt(result);
-    return pinToByte(res);
-}
+        Sk4f s = SkNx_cast<float>(Sk4b::Load(src+i)),
+             d = SkNx_cast<float>(Sk4b::Load(dst+i)),
+             r = pin(0, k1*s*d + k2*s + k3*d + k4, 255);
 
-static int blend(int src, int dst, int scale) {
-    return dst + ((src - dst) * scale >> 8);
-}
+        if (fEnforcePMColor) {
+            Sk4f a = SkNx_shuffle<3,3,3,3>(r);
+            r = Sk4f::Min(a, r);
+        }
 
-void SkArithmeticMode_scalar::xfer32(SkPMColor dst[], const SkPMColor src[],
-                                 int count, const SkAlpha aaCoverage[]) const {
-    SkScalar k1 = fK[0] / 255;
-    SkScalar k2 = fK[1];
-    SkScalar k3 = fK[2];
-    SkScalar k4 = fK[3] * 255;
-
-    for (int i = 0; i < count; ++i) {
-        if ((nullptr == aaCoverage) || aaCoverage[i]) {
-            SkPMColor sc = src[i];
-            SkPMColor dc = dst[i];
-
-            int a, r, g, b;
-
-            a = arith(k1, k2, k3, k4, SkGetPackedA32(sc), SkGetPackedA32(dc));
-            r = arith(k1, k2, k3, k4, SkGetPackedR32(sc), SkGetPackedR32(dc));
-            g = arith(k1, k2, k3, k4, SkGetPackedG32(sc), SkGetPackedG32(dc));
-            b = arith(k1, k2, k3, k4, SkGetPackedB32(sc), SkGetPackedB32(dc));
-            if (fEnforcePMColor) {
-                r = SkMin32(r, a);
-                g = SkMin32(g, a);
-                b = SkMin32(b, a);
-            }
-
-            // apply antialias coverage if necessary
-            if (aaCoverage && 0xFF != aaCoverage[i]) {
-                int scale = aaCoverage[i] + (aaCoverage[i] >> 7);
-                a = blend(a, SkGetPackedA32(sc), scale);
-                r = blend(r, SkGetPackedR32(sc), scale);
-                g = blend(g, SkGetPackedG32(sc), scale);
-                b = blend(b, SkGetPackedB32(sc), scale);
-            }
-
-            dst[i] = fEnforcePMColor ? SkPackARGB32(a, r, g, b) : SkPackARGB32NoCheck(a, r, g, b);
+        if (aaCoverage && aaCoverage[i] != 255) {
+            Sk4f c = aaCoverage[i] * (1/255.0f);
+            r = d + (r-d)*c;
         }
+
+        SkNx_cast<uint8_t>(r).store(dst+i);
     }
 }