Do loads and math in parallel in SkColorXform_opts
authormsarett <msarett@google.com>
Wed, 22 Jun 2016 21:55:51 +0000 (14:55 -0700)
committerCommit bot <commit-bot@chromium.org>
Wed, 22 Jun 2016 21:55:51 +0000 (14:55 -0700)
Note that baselines have changed a little since I
recently started using clang.

201295.jpg on HP z620 (300x280)

Skia Xform sRGB Dst Before    0.378 ms
Skia Xform sRGB Dst After     0.322 ms
                              1.17x

Skia Xform 2.2  Dst Before    0.428 ms
Skia Xform 2.2  Dst After     0.395 ms
                              1.08x

QCMS Xform                    0.418 ms

sRGB Dst vs QCMS              1.30x
2.2  Dst vs QCMS              1.06x

--------------------------------------------

Nexus 6P:
Skia Xform sRGB Dst Before    1.58 ms
Skia Xform sRGB Dst After     1.43 ms
Skia Xform 2.2  Dst Before    2.69 ms
Skia Xform 2.2  Dst After     2.62 ms

Dell Venue 8:
Skia Xform sRGB Dst Before    2.78 ms
Skia Xform sRGB Dst After     2.74 ms
Skia Xform 2.2  Dst Before    3.73 ms
Skia Xform 2.2  Dst After     3.64 ms

BUG=skia:
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2081933005
CQ_EXTRA_TRYBOTS=client.skia:Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Release-SKNX_NO_SIMD-Trybot

Review-Url: https://codereview.chromium.org/2081933005

src/opts/SkColorXform_opts.h

index b179311..74aa53c 100644 (file)
@@ -55,51 +55,64 @@ static Sk4f clamp_0_to_255(const Sk4f& x) {
 template <const float (&linear_from_curve)[256], Sk4f (*linear_to_curve)(const Sk4f&)>
 static void color_xform_RGB1(uint32_t* dst, const uint32_t* src, int len,
                              const float matrix[16]) {
-    // Load transformation matrix.
-    auto rXgXbX = Sk4f::Load(matrix + 0),
+    Sk4f rXgXbX = Sk4f::Load(matrix + 0),
          rYgYbY = Sk4f::Load(matrix + 4),
          rZgZbZ = Sk4f::Load(matrix + 8);
 
-    while (len >= 4) {
-        // Convert to linear.  The look-up table has perfect accuracy.
-        auto reds   = Sk4f{linear_from_curve[(src[0] >>  0) & 0xFF],
-                           linear_from_curve[(src[1] >>  0) & 0xFF],
-                           linear_from_curve[(src[2] >>  0) & 0xFF],
-                           linear_from_curve[(src[3] >>  0) & 0xFF]};
-        auto greens = Sk4f{linear_from_curve[(src[0] >>  8) & 0xFF],
-                           linear_from_curve[(src[1] >>  8) & 0xFF],
-                           linear_from_curve[(src[2] >>  8) & 0xFF],
-                           linear_from_curve[(src[3] >>  8) & 0xFF]};
-        auto blues  = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF],
-                           linear_from_curve[(src[1] >> 16) & 0xFF],
-                           linear_from_curve[(src[2] >> 16) & 0xFF],
-                           linear_from_curve[(src[3] >> 16) & 0xFF]};
-
-        // Apply the transformation matrix to dst gamut.
-        auto dstReds   = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues,
-             dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues,
-             dstBlues  = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues;
-
-        // Convert to dst gamma.
-        dstReds   = linear_to_curve(dstReds);
-        dstGreens = linear_to_curve(dstGreens);
-        dstBlues  = linear_to_curve(dstBlues);
-
-        // Clamp floats to byte range.
-        dstReds   = clamp_0_to_255(dstReds);
-        dstGreens = clamp_0_to_255(dstGreens);
-        dstBlues  = clamp_0_to_255(dstBlues);
-
-        // Convert to bytes and store to memory.
-        auto rgba = (Sk4i{(int)0xFF000000}          )
-                  | (SkNx_cast<int>(dstReds)        )
-                  | (SkNx_cast<int>(dstGreens) <<  8)
-                  | (SkNx_cast<int>(dstBlues)  << 16);
-        rgba.store(dst);
-
-        dst += 4;
-        src += 4;
-        len -= 4;
+    if (len >= 4) {
+        Sk4f reds, greens, blues;
+        auto load_next_4 = [&reds, &greens, &blues, &src, &len] {
+            reds   = Sk4f{linear_from_curve[(src[0] >>  0) & 0xFF],
+                          linear_from_curve[(src[1] >>  0) & 0xFF],
+                          linear_from_curve[(src[2] >>  0) & 0xFF],
+                          linear_from_curve[(src[3] >>  0) & 0xFF]};
+            greens = Sk4f{linear_from_curve[(src[0] >>  8) & 0xFF],
+                          linear_from_curve[(src[1] >>  8) & 0xFF],
+                          linear_from_curve[(src[2] >>  8) & 0xFF],
+                          linear_from_curve[(src[3] >>  8) & 0xFF]};
+            blues  = Sk4f{linear_from_curve[(src[0] >> 16) & 0xFF],
+                          linear_from_curve[(src[1] >> 16) & 0xFF],
+                          linear_from_curve[(src[2] >> 16) & 0xFF],
+                          linear_from_curve[(src[3] >> 16) & 0xFF]};
+            src += 4;
+            len -= 4;
+        };
+
+        Sk4f dstReds, dstGreens, dstBlues;
+        auto transform_4 = [&reds, &greens, &blues, &dstReds, &dstGreens, &dstBlues, &rXgXbX,
+                            &rYgYbY, &rZgZbZ] {
+            dstReds   = rXgXbX[0]*reds + rYgYbY[0]*greens + rZgZbZ[0]*blues;
+            dstGreens = rXgXbX[1]*reds + rYgYbY[1]*greens + rZgZbZ[1]*blues;
+            dstBlues  = rXgXbX[2]*reds + rYgYbY[2]*greens + rZgZbZ[2]*blues;
+        };
+
+        auto store_4 = [&dstReds, &dstGreens, &dstBlues, &dst] {
+            dstReds   = linear_to_curve(dstReds);
+            dstGreens = linear_to_curve(dstGreens);
+            dstBlues  = linear_to_curve(dstBlues);
+
+            dstReds   = clamp_0_to_255(dstReds);
+            dstGreens = clamp_0_to_255(dstGreens);
+            dstBlues  = clamp_0_to_255(dstBlues);
+
+            auto rgba = (Sk4i{(int)0xFF000000}          )
+                      | (SkNx_cast<int>(dstReds)        )
+                      | (SkNx_cast<int>(dstGreens) <<  8)
+                      | (SkNx_cast<int>(dstBlues)  << 16);
+            rgba.store(dst);
+            dst += 4;
+        };
+
+        load_next_4();
+
+        while (len >= 4) {
+            transform_4();
+            load_next_4();
+            store_4();
+        }
+
+        transform_4();
+        store_4();
     }
 
     while (len > 0) {