unroll srcover_1 for blending a single color
author reed <reed@google.com>
Tue, 2 Feb 2016 19:00:55 +0000 (11:00 -0800)
committer Commit bot <commit-bot@chromium.org>
Tue, 2 Feb 2016 19:00:55 +0000 (11:00 -0800)
Before:
curr/maxrss loops min median mean max stddev samples    config bench
   8/8   MB 1 1.59ms 1.82ms 1.89ms 2.59ms 14% ▁█▃▃▃▃▃▃▃▃ nonrendering xfer4f_srcover_1_alpha_linear
   8/8   MB 1 3.25ms 4.25ms 4.16ms 5.87ms 21% ▁▅▂▁▁▄█▄▅▂ nonrendering xfer4f_srcover_1_alpha_srgb

After:
curr/maxrss loops min median mean max stddev samples    config bench
   8/8   MB 1 915µs 915µs 946µs 1.02ms 4% █▄▇▁▁▁▆▁▁▁ nonrendering xfer4f_srcover_1_alpha_linear
   8/8   MB 1 2.69ms 3.08ms 3.03ms 3.63ms 10% ▁▃▂▁▁█▄▄▄▆ nonrendering xfer4f_srcover_1_alpha_srgb

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search2?unt=true&query=source_type%3Dgm&master=false&issue=1653943002

Review URL: https://codereview.chromium.org/1653943002

src/core/SkXfermode4f.cpp

index 0485a5e..1bf66a2 100644 (file)
@@ -31,10 +31,22 @@ template <DstType D> Sk4f load_dst(SkPMColor dstC) {
     return (D == kSRGB_Dst) ? Sk4f_fromS32(dstC) : Sk4f_fromL32(dstC);
 }
 
+// Unpacks a 32-bit destination pixel with Sk4f_fromS32 -- the same decode that
+// load_dst<kSRGB_Dst> performs -- and hands back the resulting four-float vector.
+static Sk4f srgb_4b_to_linear_unit(SkPMColor dstC) {
+    const Sk4f unpacked = Sk4f_fromS32(dstC);
+    return unpacked;
+}
+
 template <DstType D> uint32_t store_dst(const Sk4f& x4) {
     return (D == kSRGB_Dst) ? Sk4f_toS32(x4) : Sk4f_toL32(x4);
 }
 
+// Packs a four-float color into a 32-bit pixel for an sRGB destination.
+//
+// This is the store-side counterpart of srgb_4b_to_linear_unit(), which decodes with
+// Sk4f_fromS32, so it has to encode with Sk4f_toS32 -- the same call that
+// store_dst<kSRGB_Dst> makes. Packing with Sk4f_toL32 here would skip the sRGB encode
+// and make the coverage (aa) path disagree with the non-aa path of srcover_srgb_dst_1.
+static uint32_t linear_unit_to_srgb_32(const Sk4f& l4) {
+    return Sk4f_toS32(l4);
+}
+
+// Runs l4 through linear_to_srgb(), scales the result by 255 and adds 0.5.
+// The +0.5 is a rounding pre-bias for the float->byte conversion done by the caller
+// (presumably Sk4f_ToBytes / to_4b truncate -- confirm against their definitions).
+static Sk4f linear_unit_to_srgb_255f(const Sk4f& l4) {
+    const Sk4f encoded = linear_to_srgb(l4);
+    const Sk4f scaled = encoded * Sk4f(255);
+    return scaled + Sk4f(0.5f);
+}
+
 ///////////////////////////////////////////////////////////////////////////////////////////////////
 
 static Sk4f scale_255_round(const SkPM4f& pm4) {
@@ -254,10 +266,53 @@ template <DstType D> void srcover_n(const SkXfermode::PM4fState& state, uint32_t
     }
 }
 
-template <DstType D> void srcover_1(const SkXfermode::PM4fState& state, uint32_t dst[],
-                                    const SkPM4f& src, int count, const SkAlpha aa[]) {
+static void srcover_linear_dst_1(const SkXfermode::PM4fState& state, uint32_t dst[],
+                                 const SkPM4f& src, int count, const SkAlpha aa[]) {
+    Sk4f s4 = Sk4f::Load(src.fVec);
+    Sk4f dst_scale = Sk4f(1 - get_alpha(s4));
+    
+    if (aa) {
+        for (int i = 0; i < count; ++i) {
+            unsigned a = aa[i];
+            if (0 == a) {
+                continue;
+            }
+            Sk4f d4 = Sk4f_fromL32(dst[i]);
+            Sk4f r4;
+            if (a != 0xFF) {
+                s4 = scale_by_coverage(s4, a);
+                r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
+            } else {
+                r4 = s4 + d4 * dst_scale;
+            }
+            dst[i] = Sk4f_toL32(r4);
+        }
+    } else {
+        s4 = s4 * Sk4f(255) + Sk4f(0.5f);   // +0.5 to pre-bias for rounding
+        while (count >= 4) {
+            Sk4f d0 = to_4f(dst[0]);
+            Sk4f d1 = to_4f(dst[1]);
+            Sk4f d2 = to_4f(dst[2]);
+            Sk4f d3 = to_4f(dst[3]);
+            Sk4f_ToBytes((uint8_t*)dst,
+                         s4 + d0 * dst_scale,
+                         s4 + d1 * dst_scale,
+                         s4 + d2 * dst_scale,
+                         s4 + d3 * dst_scale);
+            dst += 4;
+            count -= 4;
+        }
+        for (int i = 0; i < count; ++i) {
+            Sk4f d4 = to_4f(dst[i]);
+            dst[i] = to_4b(s4 + d4 * dst_scale);
+        }
+    }
+}
+
+static void srcover_srgb_dst_1(const SkXfermode::PM4fState& state, uint32_t dst[],
+                               const SkPM4f& src, int count, const SkAlpha aa[]) {
     Sk4f s4 = Sk4f::Load(src.fVec);
-    Sk4f scale = Sk4f(1 - get_alpha(s4));
+    Sk4f dst_scale = Sk4f(1 - get_alpha(s4));
 
     if (aa) {
         for (int i = 0; i < count; ++i) {
@@ -265,30 +320,42 @@ template <DstType D> void srcover_1(const SkXfermode::PM4fState& state, uint32_t
             if (0 == a) {
                 continue;
             }
-            Sk4f d4 = load_dst<D>(dst[i]);
+            Sk4f d4 = srgb_4b_to_linear_unit(dst[i]);
             Sk4f r4;
             if (a != 0xFF) {
                 s4 = scale_by_coverage(s4, a);
                 r4 = s4 + d4 * Sk4f(1 - get_alpha(s4));
             } else {
-                r4 = s4 + d4 * scale;
+                r4 = s4 + d4 * dst_scale;
             }
-            dst[i] = store_dst<D>(r4);
+            dst[i] = linear_unit_to_srgb_32(r4);
         }
     } else {
+        while (count >= 4) {
+            Sk4f d0 = srgb_4b_to_linear_unit(dst[0]);
+            Sk4f d1 = srgb_4b_to_linear_unit(dst[1]);
+            Sk4f d2 = srgb_4b_to_linear_unit(dst[2]);
+            Sk4f d3 = srgb_4b_to_linear_unit(dst[3]);
+            Sk4f_ToBytes((uint8_t*)dst,
+                         linear_unit_to_srgb_255f(s4 + d0 * dst_scale),
+                         linear_unit_to_srgb_255f(s4 + d1 * dst_scale),
+                         linear_unit_to_srgb_255f(s4 + d2 * dst_scale),
+                         linear_unit_to_srgb_255f(s4 + d3 * dst_scale));
+            dst += 4;
+            count -= 4;
+        }
         for (int i = 0; i < count; ++i) {
-            Sk4f d4 = load_dst<D>(dst[i]);
-            Sk4f r4 = s4 + d4 * scale;
-            dst[i] = store_dst<D>(r4);
+            Sk4f d4 = srgb_4b_to_linear_unit(dst[i]);
+            dst[i] = to_4b(linear_unit_to_srgb_255f(s4 + d4 * dst_scale));
         }
     }
 }
 
 const XferProcPair gProcs_SrcOver[] = {
-    { srcover_1<kLinear_Dst>,   srcover_n<kLinear_Dst> },   // linear   alpha
-    { src_1<kLinear_Dst>,       src_n<kLinear_Dst>     },   // linear   opaque [ we are src-mode ]
-    { srcover_1<kSRGB_Dst>,     srcover_n<kSRGB_Dst>   },   // srgb     alpha
-    { src_1<kSRGB_Dst>,         src_n<kSRGB_Dst>       },   // srgb     opaque [ we are src-mode ]
+    { srcover_linear_dst_1, srcover_n<kLinear_Dst> },   // linear   alpha
+    { src_1<kLinear_Dst>,   src_n<kLinear_Dst>     },   // linear   opaque [ we are src-mode ]
+    { srcover_srgb_dst_1,   srcover_n<kSRGB_Dst>   },   // srgb     alpha
+    { src_1<kSRGB_Dst>,     src_n<kSRGB_Dst>       },   // srgb     opaque [ we are src-mode ]
 };
 
 ///////////////////////////////////////////////////////////////////////////////////////////////////