jumper, implement 2.2 stages with approx_powf
author Mike Klein <mtklein@chromium.org>
Fri, 21 Apr 2017 16:05:01 +0000 (12:05 -0400)
committer Skia Commit-Bot <skia-commit-bot@chromium.org>
Fri, 21 Apr 2017 17:16:07 +0000 (17:16 +0000)
My main interest is getting rid of weird code, but it's also faster.
The new bench drops from 667 to 412.

Change-Id: Ibf889601284cf925780320c828394f79937dc705
Reviewed-on: https://skia-review.googlesource.com/14035
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

bench/SkRasterPipelineBench.cpp
src/jumper/SkJumper_generated.S
src/jumper/SkJumper_generated_win.S
src/jumper/SkJumper_stages.cpp
tests/ParametricStageTest.cpp

index 0ae48ee..fa2df4b 100644 (file)
@@ -90,3 +90,24 @@ public:
     }
 };
 DEF_BENCH( return (new SkRasterPipelineLegacyBench); )
+
+class SkRasterPipeline_2dot2 : public Benchmark {  // Benchmark that repeatedly runs a constant_color -> from_2dot2 -> to_2dot2 pipeline.
+public:
+    bool isSuitableFor(Backend backend) override { return backend == kNonRendering_Backend; }  // Accepts only the non-rendering backend.
+    const char* onGetName() override {  // Returns the fixed name of this benchmark.
+        return "SkRasterPipeline_2dot2";
+    }
+
+    void onDraw(int loops, SkCanvas*) override {  // Builds the pipeline once, then runs it `loops` times; the canvas argument is unnamed and unused.
+        SkColor4f c = { 1.0f, 1.0f, 1.0f, 1.0f };  // All four components are 1.0f (presumably opaque white; confirm against SkColor4f's field order).
+        SkRasterPipeline p;
+        p.append(SkRasterPipeline::constant_color, &c);  // First stage is given a pointer to c, so c must stay alive while p runs.
+        p.append(SkRasterPipeline::from_2dot2);  // Second stage: the from_2dot2 stage this commit reimplements.
+        p.append(SkRasterPipeline::to_2dot2);  // Third stage: the to_2dot2 stage this commit reimplements.
+
+        while (loops --> 0) {  // Parses as (loops--) > 0: the body executes `loops` times when loops is positive.
+            p.run(0,N);  // N is declared outside this hunk; NOTE(review): presumably the number of pixels per run - confirm.
+        }
+    }
+};
+DEF_BENCH( return (new SkRasterPipeline_2dot2); )  // Same macro form as the DEF_BENCH line above; presumably registers this benchmark - confirm against the macro definition.
index 6b73406..d9540e9 100644 (file)
@@ -978,205 +978,186 @@ HIDDEN _sk_from_2dot2_aarch64
 .globl _sk_from_2dot2_aarch64
 FUNCTION(_sk_from_2dot2_aarch64)
 _sk_from_2dot2_aarch64:
-  .long  0x6ea1d810                          // frsqrte       v16.4s, v0.4s
-  .long  0x6ea1d832                          // frsqrte       v18.4s, v1.4s
-  .long  0x6e30de15                          // fmul          v21.4s, v16.4s, v16.4s
-  .long  0x6e20dc11                          // fmul          v17.4s, v0.4s, v0.4s
-  .long  0x6ea1d854                          // frsqrte       v20.4s, v2.4s
-  .long  0x6e32de56                          // fmul          v22.4s, v18.4s, v18.4s
-  .long  0x4eb5fc00                          // frsqrts       v0.4s, v0.4s, v21.4s
-  .long  0x6e21dc33                          // fmul          v19.4s, v1.4s, v1.4s
-  .long  0x6e34de97                          // fmul          v23.4s, v20.4s, v20.4s
-  .long  0x4eb6fc21                          // frsqrts       v1.4s, v1.4s, v22.4s
-  .long  0x6e20de00                          // fmul          v0.4s, v16.4s, v0.4s
-  .long  0x4eb7fc55                          // frsqrts       v21.4s, v2.4s, v23.4s
-  .long  0x6e21de41                          // fmul          v1.4s, v18.4s, v1.4s
-  .long  0x6ea1d812                          // frsqrte       v18.4s, v0.4s
-  .long  0x6e35de90                          // fmul          v16.4s, v20.4s, v21.4s
-  .long  0x6ea1d834                          // frsqrte       v20.4s, v1.4s
-  .long  0x6e32de56                          // fmul          v22.4s, v18.4s, v18.4s
-  .long  0x6ea1da15                          // frsqrte       v21.4s, v16.4s
-  .long  0x6e34de97                          // fmul          v23.4s, v20.4s, v20.4s
-  .long  0x4eb6fc00                          // frsqrts       v0.4s, v0.4s, v22.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x4eb7fc21                          // frsqrts       v1.4s, v1.4s, v23.4s
-  .long  0x6e20de40                          // fmul          v0.4s, v18.4s, v0.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6e21de81                          // fmul          v1.4s, v20.4s, v1.4s
-  .long  0x6ea1d812                          // frsqrte       v18.4s, v0.4s
-  .long  0x6e30deb0                          // fmul          v16.4s, v21.4s, v16.4s
-  .long  0x6ea1d834                          // frsqrte       v20.4s, v1.4s
-  .long  0x6e32de56                          // fmul          v22.4s, v18.4s, v18.4s
-  .long  0x6ea1da15                          // frsqrte       v21.4s, v16.4s
-  .long  0x6e34de97                          // fmul          v23.4s, v20.4s, v20.4s
-  .long  0x4eb6fc00                          // frsqrts       v0.4s, v0.4s, v22.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x4eb7fc21                          // frsqrts       v1.4s, v1.4s, v23.4s
-  .long  0x6e20de40                          // fmul          v0.4s, v18.4s, v0.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6e21de81                          // fmul          v1.4s, v20.4s, v1.4s
-  .long  0x6ea1d812                          // frsqrte       v18.4s, v0.4s
-  .long  0x6e30deb0                          // fmul          v16.4s, v21.4s, v16.4s
-  .long  0x6ea1d834                          // frsqrte       v20.4s, v1.4s
-  .long  0x6e32de56                          // fmul          v22.4s, v18.4s, v18.4s
-  .long  0x6ea1da15                          // frsqrte       v21.4s, v16.4s
-  .long  0x6e34de97                          // fmul          v23.4s, v20.4s, v20.4s
-  .long  0x4eb6fc00                          // frsqrts       v0.4s, v0.4s, v22.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x4eb7fc21                          // frsqrts       v1.4s, v1.4s, v23.4s
-  .long  0x6e20de40                          // fmul          v0.4s, v18.4s, v0.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6e21de81                          // fmul          v1.4s, v20.4s, v1.4s
-  .long  0x6ea1d812                          // frsqrte       v18.4s, v0.4s
-  .long  0x6e20dc14                          // fmul          v20.4s, v0.4s, v0.4s
-  .long  0x6e30deb0                          // fmul          v16.4s, v21.4s, v16.4s
-  .long  0x6ea1d835                          // frsqrte       v21.4s, v1.4s
-  .long  0x6e21dc36                          // fmul          v22.4s, v1.4s, v1.4s
-  .long  0x6e32de57                          // fmul          v23.4s, v18.4s, v18.4s
-  .long  0x6e34dc14                          // fmul          v20.4s, v0.4s, v20.4s
-  .long  0x4eb7fc00                          // frsqrts       v0.4s, v0.4s, v23.4s
-  .long  0x6ea1da17                          // frsqrte       v23.4s, v16.4s
-  .long  0x6e34de31                          // fmul          v17.4s, v17.4s, v20.4s
-  .long  0x6e35deb4                          // fmul          v20.4s, v21.4s, v21.4s
-  .long  0x6e36dc36                          // fmul          v22.4s, v1.4s, v22.4s
-  .long  0x4eb4fc21                          // frsqrts       v1.4s, v1.4s, v20.4s
-  .long  0x6e30de14                          // fmul          v20.4s, v16.4s, v16.4s
-  .long  0x6e36de73                          // fmul          v19.4s, v19.4s, v22.4s
-  .long  0x6e37def6                          // fmul          v22.4s, v23.4s, v23.4s
-  .long  0x6e20de40                          // fmul          v0.4s, v18.4s, v0.4s
-  .long  0x6e34de14                          // fmul          v20.4s, v16.4s, v20.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6e22dc42                          // fmul          v2.4s, v2.4s, v2.4s
-  .long  0x6e21dea1                          // fmul          v1.4s, v21.4s, v1.4s
-  .long  0x6ea1d812                          // frsqrte       v18.4s, v0.4s
-  .long  0x6e34dc42                          // fmul          v2.4s, v2.4s, v20.4s
-  .long  0x6e30def0                          // fmul          v16.4s, v23.4s, v16.4s
-  .long  0x6ea1d834                          // frsqrte       v20.4s, v1.4s
-  .long  0x6e32de56                          // fmul          v22.4s, v18.4s, v18.4s
-  .long  0x6ea1da15                          // frsqrte       v21.4s, v16.4s
-  .long  0x4eb6fc00                          // frsqrts       v0.4s, v0.4s, v22.4s
-  .long  0x6e34de96                          // fmul          v22.4s, v20.4s, v20.4s
-  .long  0x4eb6fc21                          // frsqrts       v1.4s, v1.4s, v22.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
+  .long  0x52b85f08                          // mov           w8, #0xc2f80000
+  .long  0x728e6ee8                          // movk          w8, #0x7377
+  .long  0x4e040d11                          // dup           v17.4s, w8
+  .long  0x52a7f7e8                          // mov           w8, #0x3fbf0000
+  .long  0x7297eea8                          // movk          w8, #0xbf75
+  .long  0x4e040d12                          // dup           v18.4s, w8
+  .long  0x52a7d688                          // mov           w8, #0x3eb40000
+  .long  0x72889f28                          // movk          w8, #0x44f9
+  .long  0x4e040d13                          // dup           v19.4s, w8
+  .long  0x52a7fb88                          // mov           w8, #0x3fdc0000
+  .long  0x729d3468                          // movk          w8, #0xe9a3
+  .long  0x4e040d14                          // dup           v20.4s, w8
+  .long  0x52a80188                          // mov           w8, #0x400c0000
+  .long  0x4f03d7fa                          // movi          v26.4s, #0x7f, msl #16
+  .long  0x729999a8                          // movk          w8, #0xcccd
+  .long  0x4e21d818                          // scvtf         v24.4s, v0.4s
+  .long  0x4f016690                          // movi          v16.4s, #0x34, lsl #24
+  .long  0x4e040d15                          // dup           v21.4s, w8
+  .long  0x52a85e48                          // mov           w8, #0x42f20000
+  .long  0x4e21d85b                          // scvtf         v27.4s, v2.4s
+  .long  0x4e3a1c00                          // and           v0.16b, v0.16b, v26.16b
+  .long  0x4e3a1c42                          // and           v2.16b, v2.16b, v26.16b
+  .long  0x4e3a1c3a                          // and           v26.16b, v1.16b, v26.16b
+  .long  0x72918a28                          // movk          w8, #0x8c51
+  .long  0x4eb11e3c                          // mov           v28.16b, v17.16b
+  .long  0x4eb11e3d                          // mov           v29.16b, v17.16b
+  .long  0x4e3bce11                          // fmla          v17.4s, v16.4s, v27.4s
+  .long  0x4e21d821                          // scvtf         v1.4s, v1.4s
+  .long  0x4f0177e0                          // orr           v0.4s, #0x3f, lsl #24
+  .long  0x4f0177fa                          // orr           v26.4s, #0x3f, lsl #24
+  .long  0x4f0177e2                          // orr           v2.4s, #0x3f, lsl #24
+  .long  0x4e040d17                          // dup           v23.4s, w8
+  .long  0x52a7f7c8                          // mov           w8, #0x3fbe0000
+  .long  0x4e38ce1c                          // fmla          v28.4s, v16.4s, v24.4s
+  .long  0x4e21ce1d                          // fmla          v29.4s, v16.4s, v1.4s
+  .long  0x4e33d401                          // fadd          v1.4s, v0.4s, v19.4s
+  .long  0x4e33d750                          // fadd          v16.4s, v26.4s, v19.4s
+  .long  0x4eb2cc51                          // fmls          v17.4s, v2.4s, v18.4s
+  .long  0x4e33d442                          // fadd          v2.4s, v2.4s, v19.4s
+  .long  0x729791a8                          // movk          w8, #0xbc8d
+  .long  0x4eb2cc1c                          // fmls          v28.4s, v0.4s, v18.4s
+  .long  0x6e21fe80                          // fdiv          v0.4s, v20.4s, v1.4s
+  .long  0x4eb2cf5d                          // fmls          v29.4s, v26.4s, v18.4s
+  .long  0x6e30fe81                          // fdiv          v1.4s, v20.4s, v16.4s
+  .long  0x6e22fe82                          // fdiv          v2.4s, v20.4s, v2.4s
+  .long  0x4e040d16                          // dup           v22.4s, w8
+  .long  0x52a81348                          // mov           w8, #0x409a0000
+  .long  0x4ea0d780                          // fsub          v0.4s, v28.4s, v0.4s
+  .long  0x4ea1d7a1                          // fsub          v1.4s, v29.4s, v1.4s
+  .long  0x4ea2d622                          // fsub          v2.4s, v17.4s, v2.4s
+  .long  0x729ebf08                          // movk          w8, #0xf5f8
+  .long  0x6e35dc00                          // fmul          v0.4s, v0.4s, v21.4s
+  .long  0x6e35dc21                          // fmul          v1.4s, v1.4s, v21.4s
+  .long  0x6e35dc42                          // fmul          v2.4s, v2.4s, v21.4s
+  .long  0x4e040d19                          // dup           v25.4s, w8
+  .long  0x52a83ba8                          // mov           w8, #0x41dd0000
+  .long  0x4e219810                          // frintm        v16.4s, v0.4s
+  .long  0x4e219832                          // frintm        v18.4s, v1.4s
+  .long  0x4e219854                          // frintm        v20.4s, v2.4s
+  .long  0x729a5fc8                          // movk          w8, #0xd2fe
+  .long  0x4e37d411                          // fadd          v17.4s, v0.4s, v23.4s
+  .long  0x4e37d433                          // fadd          v19.4s, v1.4s, v23.4s
+  .long  0x4e37d455                          // fadd          v21.4s, v2.4s, v23.4s
+  .long  0x4eb0d400                          // fsub          v0.4s, v0.4s, v16.4s
+  .long  0x4eb2d421                          // fsub          v1.4s, v1.4s, v18.4s
+  .long  0x4eb4d442                          // fsub          v2.4s, v2.4s, v20.4s
+  .long  0x4e040d18                          // dup           v24.4s, w8
+  .long  0x4eb6cc11                          // fmls          v17.4s, v0.4s, v22.4s
+  .long  0x4ea0d720                          // fsub          v0.4s, v25.4s, v0.4s
+  .long  0x4eb6cc33                          // fmls          v19.4s, v1.4s, v22.4s
+  .long  0x4ea1d721                          // fsub          v1.4s, v25.4s, v1.4s
+  .long  0x4eb6cc55                          // fmls          v21.4s, v2.4s, v22.4s
+  .long  0x4ea2d722                          // fsub          v2.4s, v25.4s, v2.4s
   .long  0xf8408423                          // ldr           x3, [x1], #8
-  .long  0x6e20de40                          // fmul          v0.4s, v18.4s, v0.4s
-  .long  0x6e21de81                          // fmul          v1.4s, v20.4s, v1.4s
-  .long  0x6e30deb0                          // fmul          v16.4s, v21.4s, v16.4s
-  .long  0x6f00e412                          // movi          v18.2d, #0x0
-  .long  0x6e20de20                          // fmul          v0.4s, v17.4s, v0.4s
-  .long  0x6e21de61                          // fmul          v1.4s, v19.4s, v1.4s
-  .long  0x6e30dc42                          // fmul          v2.4s, v2.4s, v16.4s
-  .long  0x4e32f400                          // fmax          v0.4s, v0.4s, v18.4s
-  .long  0x4e32f421                          // fmax          v1.4s, v1.4s, v18.4s
-  .long  0x4e32f442                          // fmax          v2.4s, v2.4s, v18.4s
+  .long  0x6e20ff00                          // fdiv          v0.4s, v24.4s, v0.4s
+  .long  0x6e21ff01                          // fdiv          v1.4s, v24.4s, v1.4s
+  .long  0x6e22ff02                          // fdiv          v2.4s, v24.4s, v2.4s
+  .long  0x4f02657b                          // movi          v27.4s, #0x4b, lsl #24
+  .long  0x4e20d620                          // fadd          v0.4s, v17.4s, v0.4s
+  .long  0x4e21d661                          // fadd          v1.4s, v19.4s, v1.4s
+  .long  0x4e22d6a2                          // fadd          v2.4s, v21.4s, v2.4s
+  .long  0x6e3bdc00                          // fmul          v0.4s, v0.4s, v27.4s
+  .long  0x6e3bdc21                          // fmul          v1.4s, v1.4s, v27.4s
+  .long  0x6e3bdc42                          // fmul          v2.4s, v2.4s, v27.4s
+  .long  0x6e21a800                          // fcvtnu        v0.4s, v0.4s
+  .long  0x6e21a821                          // fcvtnu        v1.4s, v1.4s
+  .long  0x6e21a842                          // fcvtnu        v2.4s, v2.4s
   .long  0xd61f0060                          // br            x3
 
 HIDDEN _sk_to_2dot2_aarch64
 .globl _sk_to_2dot2_aarch64
 FUNCTION(_sk_to_2dot2_aarch64)
 _sk_to_2dot2_aarch64:
-  .long  0x6ea1d810                          // frsqrte       v16.4s, v0.4s
-  .long  0x6e30de13                          // fmul          v19.4s, v16.4s, v16.4s
-  .long  0x6ea1d831                          // frsqrte       v17.4s, v1.4s
-  .long  0x4eb3fc00                          // frsqrts       v0.4s, v0.4s, v19.4s
-  .long  0x6ea1d852                          // frsqrte       v18.4s, v2.4s
-  .long  0x6e31de34                          // fmul          v20.4s, v17.4s, v17.4s
-  .long  0x6e20de00                          // fmul          v0.4s, v16.4s, v0.4s
-  .long  0x6e32de55                          // fmul          v21.4s, v18.4s, v18.4s
-  .long  0x4eb4fc21                          // frsqrts       v1.4s, v1.4s, v20.4s
-  .long  0x6ea1d810                          // frsqrte       v16.4s, v0.4s
-  .long  0x4eb5fc42                          // frsqrts       v2.4s, v2.4s, v21.4s
-  .long  0x6e21de21                          // fmul          v1.4s, v17.4s, v1.4s
-  .long  0x4ea1d811                          // frecpe        v17.4s, v0.4s
-  .long  0x6e30de16                          // fmul          v22.4s, v16.4s, v16.4s
-  .long  0x6e22de42                          // fmul          v2.4s, v18.4s, v2.4s
-  .long  0x6ea1d832                          // frsqrte       v18.4s, v1.4s
-  .long  0x4eb6fc16                          // frsqrts       v22.4s, v0.4s, v22.4s
-  .long  0x4e31fc00                          // frecps        v0.4s, v0.4s, v17.4s
-  .long  0x4ea1d833                          // frecpe        v19.4s, v1.4s
-  .long  0x6e20de20                          // fmul          v0.4s, v17.4s, v0.4s
-  .long  0x6e32de51                          // fmul          v17.4s, v18.4s, v18.4s
-  .long  0x6ea1d854                          // frsqrte       v20.4s, v2.4s
-  .long  0x4eb1fc31                          // frsqrts       v17.4s, v1.4s, v17.4s
-  .long  0x4e33fc21                          // frecps        v1.4s, v1.4s, v19.4s
-  .long  0x6e21de61                          // fmul          v1.4s, v19.4s, v1.4s
-  .long  0x6e34de93                          // fmul          v19.4s, v20.4s, v20.4s
-  .long  0x4eb3fc53                          // frsqrts       v19.4s, v2.4s, v19.4s
-  .long  0x6e36de10                          // fmul          v16.4s, v16.4s, v22.4s
-  .long  0x6e31de51                          // fmul          v17.4s, v18.4s, v17.4s
-  .long  0x6e33de92                          // fmul          v18.4s, v20.4s, v19.4s
-  .long  0x6ea1da13                          // frsqrte       v19.4s, v16.4s
-  .long  0x4ea1d855                          // frecpe        v21.4s, v2.4s
-  .long  0x6e33de76                          // fmul          v22.4s, v19.4s, v19.4s
-  .long  0x4e35fc42                          // frecps        v2.4s, v2.4s, v21.4s
-  .long  0x6ea1da34                          // frsqrte       v20.4s, v17.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6e22dea2                          // fmul          v2.4s, v21.4s, v2.4s
-  .long  0x6ea1da55                          // frsqrte       v21.4s, v18.4s
-  .long  0x6e34de96                          // fmul          v22.4s, v20.4s, v20.4s
-  .long  0x6e30de70                          // fmul          v16.4s, v19.4s, v16.4s
-  .long  0x4eb6fe31                          // frsqrts       v17.4s, v17.4s, v22.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x6ea1da13                          // frsqrte       v19.4s, v16.4s
-  .long  0x4eb6fe52                          // frsqrts       v18.4s, v18.4s, v22.4s
-  .long  0x6e31de91                          // fmul          v17.4s, v20.4s, v17.4s
-  .long  0x6e33de76                          // fmul          v22.4s, v19.4s, v19.4s
-  .long  0x6e32deb2                          // fmul          v18.4s, v21.4s, v18.4s
-  .long  0x6ea1da34                          // frsqrte       v20.4s, v17.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6ea1da55                          // frsqrte       v21.4s, v18.4s
-  .long  0x6e34de96                          // fmul          v22.4s, v20.4s, v20.4s
-  .long  0x6e30de70                          // fmul          v16.4s, v19.4s, v16.4s
-  .long  0x4eb6fe31                          // frsqrts       v17.4s, v17.4s, v22.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x6ea1da13                          // frsqrte       v19.4s, v16.4s
-  .long  0x4eb6fe52                          // frsqrts       v18.4s, v18.4s, v22.4s
-  .long  0x6e31de91                          // fmul          v17.4s, v20.4s, v17.4s
-  .long  0x6e33de76                          // fmul          v22.4s, v19.4s, v19.4s
-  .long  0x6e32deb2                          // fmul          v18.4s, v21.4s, v18.4s
-  .long  0x6ea1da34                          // frsqrte       v20.4s, v17.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6ea1da55                          // frsqrte       v21.4s, v18.4s
-  .long  0x6e34de96                          // fmul          v22.4s, v20.4s, v20.4s
-  .long  0x6e30de70                          // fmul          v16.4s, v19.4s, v16.4s
-  .long  0x4eb6fe31                          // frsqrts       v17.4s, v17.4s, v22.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x6ea1da13                          // frsqrte       v19.4s, v16.4s
-  .long  0x4eb6fe52                          // frsqrts       v18.4s, v18.4s, v22.4s
-  .long  0x6e31de91                          // fmul          v17.4s, v20.4s, v17.4s
-  .long  0x6e33de76                          // fmul          v22.4s, v19.4s, v19.4s
-  .long  0x6e20de00                          // fmul          v0.4s, v16.4s, v0.4s
-  .long  0x6ea1da34                          // frsqrte       v20.4s, v17.4s
-  .long  0x4eb6fe10                          // frsqrts       v16.4s, v16.4s, v22.4s
-  .long  0x6e32deb2                          // fmul          v18.4s, v21.4s, v18.4s
-  .long  0x6e34de96                          // fmul          v22.4s, v20.4s, v20.4s
-  .long  0x6e30de70                          // fmul          v16.4s, v19.4s, v16.4s
-  .long  0x6e21de21                          // fmul          v1.4s, v17.4s, v1.4s
-  .long  0x6ea1da55                          // frsqrte       v21.4s, v18.4s
-  .long  0x4eb6fe31                          // frsqrts       v17.4s, v17.4s, v22.4s
-  .long  0x4ea1da13                          // frecpe        v19.4s, v16.4s
-  .long  0x6e35deb6                          // fmul          v22.4s, v21.4s, v21.4s
-  .long  0x6e31de91                          // fmul          v17.4s, v20.4s, v17.4s
-  .long  0x4e33fe10                          // frecps        v16.4s, v16.4s, v19.4s
-  .long  0x6e22de42                          // fmul          v2.4s, v18.4s, v2.4s
-  .long  0x4eb6fe52                          // frsqrts       v18.4s, v18.4s, v22.4s
-  .long  0x6e30de70                          // fmul          v16.4s, v19.4s, v16.4s
-  .long  0x4ea1da33                          // frecpe        v19.4s, v17.4s
-  .long  0x6e32deb2                          // fmul          v18.4s, v21.4s, v18.4s
-  .long  0x4e33fe31                          // frecps        v17.4s, v17.4s, v19.4s
-  .long  0x6e31de71                          // fmul          v17.4s, v19.4s, v17.4s
-  .long  0x4ea1da53                          // frecpe        v19.4s, v18.4s
-  .long  0x4e33fe52                          // frecps        v18.4s, v18.4s, v19.4s
+  .long  0x52b85f08                          // mov           w8, #0xc2f80000
+  .long  0x728e6ee8                          // movk          w8, #0x7377
+  .long  0x4e040d11                          // dup           v17.4s, w8
+  .long  0x52a7f7e8                          // mov           w8, #0x3fbf0000
+  .long  0x7297eea8                          // movk          w8, #0xbf75
+  .long  0x4e040d12                          // dup           v18.4s, w8
+  .long  0x52a7d688                          // mov           w8, #0x3eb40000
+  .long  0x72889f28                          // movk          w8, #0x44f9
+  .long  0x4e040d13                          // dup           v19.4s, w8
+  .long  0x52a7fb88                          // mov           w8, #0x3fdc0000
+  .long  0x729d3468                          // movk          w8, #0xe9a3
+  .long  0x4e040d14                          // dup           v20.4s, w8
+  .long  0x52a7dd08                          // mov           w8, #0x3ee80000
+  .long  0x4f03d7fa                          // movi          v26.4s, #0x7f, msl #16
+  .long  0x729745c8                          // movk          w8, #0xba2e
+  .long  0x4e21d818                          // scvtf         v24.4s, v0.4s
+  .long  0x4f016690                          // movi          v16.4s, #0x34, lsl #24
+  .long  0x4e040d15                          // dup           v21.4s, w8
+  .long  0x52a85e48                          // mov           w8, #0x42f20000
+  .long  0x4e21d85b                          // scvtf         v27.4s, v2.4s
+  .long  0x4e3a1c00                          // and           v0.16b, v0.16b, v26.16b
+  .long  0x4e3a1c42                          // and           v2.16b, v2.16b, v26.16b
+  .long  0x4e3a1c3a                          // and           v26.16b, v1.16b, v26.16b
+  .long  0x72918a28                          // movk          w8, #0x8c51
+  .long  0x4eb11e3c                          // mov           v28.16b, v17.16b
+  .long  0x4eb11e3d                          // mov           v29.16b, v17.16b
+  .long  0x4e3bce11                          // fmla          v17.4s, v16.4s, v27.4s
+  .long  0x4e21d821                          // scvtf         v1.4s, v1.4s
+  .long  0x4f0177e0                          // orr           v0.4s, #0x3f, lsl #24
+  .long  0x4f0177fa                          // orr           v26.4s, #0x3f, lsl #24
+  .long  0x4f0177e2                          // orr           v2.4s, #0x3f, lsl #24
+  .long  0x4e040d17                          // dup           v23.4s, w8
+  .long  0x52a7f7c8                          // mov           w8, #0x3fbe0000
+  .long  0x4e38ce1c                          // fmla          v28.4s, v16.4s, v24.4s
+  .long  0x4e21ce1d                          // fmla          v29.4s, v16.4s, v1.4s
+  .long  0x4e33d401                          // fadd          v1.4s, v0.4s, v19.4s
+  .long  0x4e33d750                          // fadd          v16.4s, v26.4s, v19.4s
+  .long  0x4eb2cc51                          // fmls          v17.4s, v2.4s, v18.4s
+  .long  0x4e33d442                          // fadd          v2.4s, v2.4s, v19.4s
+  .long  0x729791a8                          // movk          w8, #0xbc8d
+  .long  0x4eb2cc1c                          // fmls          v28.4s, v0.4s, v18.4s
+  .long  0x6e21fe80                          // fdiv          v0.4s, v20.4s, v1.4s
+  .long  0x4eb2cf5d                          // fmls          v29.4s, v26.4s, v18.4s
+  .long  0x6e30fe81                          // fdiv          v1.4s, v20.4s, v16.4s
+  .long  0x6e22fe82                          // fdiv          v2.4s, v20.4s, v2.4s
+  .long  0x4e040d16                          // dup           v22.4s, w8
+  .long  0x52a81348                          // mov           w8, #0x409a0000
+  .long  0x4ea0d780                          // fsub          v0.4s, v28.4s, v0.4s
+  .long  0x4ea1d7a1                          // fsub          v1.4s, v29.4s, v1.4s
+  .long  0x4ea2d622                          // fsub          v2.4s, v17.4s, v2.4s
+  .long  0x729ebf08                          // movk          w8, #0xf5f8
+  .long  0x6e35dc00                          // fmul          v0.4s, v0.4s, v21.4s
+  .long  0x6e35dc21                          // fmul          v1.4s, v1.4s, v21.4s
+  .long  0x6e35dc42                          // fmul          v2.4s, v2.4s, v21.4s
+  .long  0x4e040d19                          // dup           v25.4s, w8
+  .long  0x52a83ba8                          // mov           w8, #0x41dd0000
+  .long  0x4e219810                          // frintm        v16.4s, v0.4s
+  .long  0x4e219832                          // frintm        v18.4s, v1.4s
+  .long  0x4e219854                          // frintm        v20.4s, v2.4s
+  .long  0x729a5fc8                          // movk          w8, #0xd2fe
+  .long  0x4e37d411                          // fadd          v17.4s, v0.4s, v23.4s
+  .long  0x4e37d433                          // fadd          v19.4s, v1.4s, v23.4s
+  .long  0x4e37d455                          // fadd          v21.4s, v2.4s, v23.4s
+  .long  0x4eb0d400                          // fsub          v0.4s, v0.4s, v16.4s
+  .long  0x4eb2d421                          // fsub          v1.4s, v1.4s, v18.4s
+  .long  0x4eb4d442                          // fsub          v2.4s, v2.4s, v20.4s
+  .long  0x4e040d18                          // dup           v24.4s, w8
+  .long  0x4eb6cc11                          // fmls          v17.4s, v0.4s, v22.4s
+  .long  0x4ea0d720                          // fsub          v0.4s, v25.4s, v0.4s
+  .long  0x4eb6cc33                          // fmls          v19.4s, v1.4s, v22.4s
+  .long  0x4ea1d721                          // fsub          v1.4s, v25.4s, v1.4s
+  .long  0x4eb6cc55                          // fmls          v21.4s, v2.4s, v22.4s
+  .long  0x4ea2d722                          // fsub          v2.4s, v25.4s, v2.4s
   .long  0xf8408423                          // ldr           x3, [x1], #8
-  .long  0x6e32de72                          // fmul          v18.4s, v19.4s, v18.4s
-  .long  0x6f00e413                          // movi          v19.2d, #0x0
-  .long  0x6e30dc00                          // fmul          v0.4s, v0.4s, v16.4s
-  .long  0x6e31dc21                          // fmul          v1.4s, v1.4s, v17.4s
-  .long  0x6e32dc42                          // fmul          v2.4s, v2.4s, v18.4s
-  .long  0x4e33f400                          // fmax          v0.4s, v0.4s, v19.4s
-  .long  0x4e33f421                          // fmax          v1.4s, v1.4s, v19.4s
-  .long  0x4e33f442                          // fmax          v2.4s, v2.4s, v19.4s
+  .long  0x6e20ff00                          // fdiv          v0.4s, v24.4s, v0.4s
+  .long  0x6e21ff01                          // fdiv          v1.4s, v24.4s, v1.4s
+  .long  0x6e22ff02                          // fdiv          v2.4s, v24.4s, v2.4s
+  .long  0x4f02657b                          // movi          v27.4s, #0x4b, lsl #24
+  .long  0x4e20d620                          // fadd          v0.4s, v17.4s, v0.4s
+  .long  0x4e21d661                          // fadd          v1.4s, v19.4s, v1.4s
+  .long  0x4e22d6a2                          // fadd          v2.4s, v21.4s, v2.4s
+  .long  0x6e3bdc00                          // fmul          v0.4s, v0.4s, v27.4s
+  .long  0x6e3bdc21                          // fmul          v1.4s, v1.4s, v27.4s
+  .long  0x6e3bdc42                          // fmul          v2.4s, v2.4s, v27.4s
+  .long  0x6e21a800                          // fcvtnu        v0.4s, v0.4s
+  .long  0x6e21a821                          // fcvtnu        v1.4s, v1.4s
+  .long  0x6e21a842                          // fcvtnu        v2.4s, v2.4s
   .long  0xd61f0060                          // br            x3
 
 HIDDEN _sk_rgb_to_hsl_aarch64
@@ -2415,9 +2396,9 @@ FUNCTION(_sk_gather_i8_aarch64)
 _sk_gather_i8_aarch64:
   .long  0xaa0103e8                          // mov           x8, x1
   .long  0xf8408429                          // ldr           x9, [x1], #8
-  .long  0xb4000069                          // cbz           x9, 2034 <sk_gather_i8_aarch64+0x14>
+  .long  0xb4000069                          // cbz           x9, 1fe8 <sk_gather_i8_aarch64+0x14>
   .long  0xaa0903ea                          // mov           x10, x9
-  .long  0x14000003                          // b             203c <sk_gather_i8_aarch64+0x1c>
+  .long  0x14000003                          // b             1ff0 <sk_gather_i8_aarch64+0x1c>
   .long  0xf940050a                          // ldr           x10, [x8, #8]
   .long  0x91004101                          // add           x1, x8, #0x10
   .long  0xf8410548                          // ldr           x8, [x10], #16
@@ -3266,7 +3247,7 @@ _sk_linear_gradient_aarch64:
   .long  0x4d40c902                          // ld1r          {v2.4s}, [x8]
   .long  0xf9400128                          // ldr           x8, [x9]
   .long  0x4d40c943                          // ld1r          {v3.4s}, [x10]
-  .long  0xb40006c8                          // cbz           x8, 2c08 <sk_linear_gradient_aarch64+0x100>
+  .long  0xb40006c8                          // cbz           x8, 2bbc <sk_linear_gradient_aarch64+0x100>
   .long  0x6dbf23e9                          // stp           d9, d8, [sp, #-16]!
   .long  0xf9400529                          // ldr           x9, [x9, #8]
   .long  0x6f00e413                          // movi          v19.2d, #0x0
@@ -3317,9 +3298,9 @@ _sk_linear_gradient_aarch64:
   .long  0xd1000508                          // sub           x8, x8, #0x1
   .long  0x6e771fd0                          // bsl           v16.16b, v30.16b, v23.16b
   .long  0x91009129                          // add           x9, x9, #0x24
-  .long  0xb5fffaa8                          // cbnz          x8, 2b50 <sk_linear_gradient_aarch64+0x48>
+  .long  0xb5fffaa8                          // cbnz          x8, 2b04 <sk_linear_gradient_aarch64+0x48>
   .long  0x6cc123e9                          // ldp           d9, d8, [sp], #16
-  .long  0x14000005                          // b             2c18 <sk_linear_gradient_aarch64+0x110>
+  .long  0x14000005                          // b             2bcc <sk_linear_gradient_aarch64+0x110>
   .long  0x6f00e414                          // movi          v20.2d, #0x0
   .long  0x6f00e412                          // movi          v18.2d, #0x0
   .long  0x6f00e411                          // movi          v17.2d, #0x0
@@ -4722,207 +4703,251 @@ HIDDEN _sk_from_2dot2_vfp4
 .globl _sk_from_2dot2_vfp4
 FUNCTION(_sk_from_2dot2_vfp4)
 _sk_from_2dot2_vfp4:
-  .long  0xf3fb0580                          // vrsqrte.f32   d16, d0
+  .long  0xed2d8b0a                          // vpush         {d8-d12}
+  .long  0xf2c70d1f                          // vmov.i32      d16, #8388607
+  .long  0xeddf3b62                          // vldr          d19, [pc, #392]
+  .long  0xed9faa71                          // vldr          s20, [pc, #452]
+  .long  0xf2c34614                          // vmov.i32      d20, #872415232
+  .long  0xf2402130                          // vand          d18, d0, d16
+  .long  0xeddf7b60                          // vldr          d23, [pc, #384]
+  .long  0xf2c3271f                          // vorr.i32      d18, #1056964608
+  .long  0xeddfab64                          // vldr          d26, [pc, #400]
+  .long  0xf2411130                          // vand          d17, d1, d16
   .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf3fb1581                          // vrsqrte.f32   d17, d1
-  .long  0xf3fb2582                          // vrsqrte.f32   d18, d2
-  .long  0xf3403db0                          // vmul.f32      d19, d16, d16
-  .long  0xf3414db1                          // vmul.f32      d20, d17, d17
-  .long  0xf3425db2                          // vmul.f32      d21, d18, d18
-  .long  0xf2603f33                          // vrsqrts.f32   d19, d0, d19
-  .long  0xf2614f34                          // vrsqrts.f32   d20, d1, d20
-  .long  0xf2625f35                          // vrsqrts.f32   d21, d2, d21
+  .long  0xf2029da3                          // vadd.f32      d9, d18, d19
+  .long  0xf2420130                          // vand          d16, d2, d16
+  .long  0xf2c3171f                          // vorr.i32      d17, #1056964608
+  .long  0xf2c3071f                          // vorr.i32      d16, #1056964608
+  .long  0xf201bda3                          // vadd.f32      d11, d17, d19
+  .long  0xeeca8a29                          // vdiv.f32      s17, s20, s19
+  .long  0xf200cda3                          // vadd.f32      d12, d16, d19
+  .long  0xf3fb3600                          // vcvt.f32.s32  d19, d0
+  .long  0xee8a8a09                          // vdiv.f32      s16, s20, s18
+  .long  0xeeca9a2b                          // vdiv.f32      s19, s20, s23
+  .long  0xeeca0a2c                          // vdiv.f32      s1, s20, s25
+  .long  0xee8a9a0b                          // vdiv.f32      s18, s20, s22
+  .long  0xee8a0a0c                          // vdiv.f32      s0, s20, s24
+  .long  0xf3fb5602                          // vcvt.f32.s32  d21, d2
+  .long  0xf3fb6601                          // vcvt.f32.s32  d22, d1
+  .long  0xed9f1a5b                          // vldr          s2, [pc, #364]
+  .long  0xf3433db4                          // vmul.f32      d19, d19, d20
+  .long  0xf3422db7                          // vmul.f32      d18, d18, d23
+  .long  0xf3455db4                          // vmul.f32      d21, d21, d20
+  .long  0xf3464db4                          // vmul.f32      d20, d22, d20
+  .long  0xeddf6b49                          // vldr          d22, [pc, #292]
+  .long  0xf2433da6                          // vadd.f32      d19, d19, d22
+  .long  0xf3411db7                          // vmul.f32      d17, d17, d23
+  .long  0xf3400db7                          // vmul.f32      d16, d16, d23
+  .long  0xf2444da6                          // vadd.f32      d20, d20, d22
+  .long  0xf2632da2                          // vsub.f32      d18, d19, d18
+  .long  0xeddf3b45                          // vldr          d19, [pc, #276]
+  .long  0xf2455da6                          // vadd.f32      d21, d21, d22
+  .long  0xf2c09010                          // vmov.i32      d25, #0
+  .long  0xf2641da1                          // vsub.f32      d17, d20, d17
+  .long  0xf2622d88                          // vsub.f32      d18, d18, d8
+  .long  0xf2650da0                          // vsub.f32      d16, d21, d16
+  .long  0xf2611d89                          // vsub.f32      d17, d17, d9
+  .long  0xf3422db3                          // vmul.f32      d18, d18, d19
+  .long  0xf2600d80                          // vsub.f32      d16, d16, d0
+  .long  0xf3411db3                          // vmul.f32      d17, d17, d19
   .long  0xf3400db3                          // vmul.f32      d16, d16, d19
-  .long  0xf3411db4                          // vmul.f32      d17, d17, d20
-  .long  0xf3422db5                          // vmul.f32      d18, d18, d21
-  .long  0xf3fb35a0                          // vrsqrte.f32   d19, d16
-  .long  0xf3fb45a1                          // vrsqrte.f32   d20, d17
-  .long  0xf3fb55a2                          // vrsqrte.f32   d21, d18
-  .long  0xf3436db3                          // vmul.f32      d22, d19, d19
-  .long  0xf3447db4                          // vmul.f32      d23, d20, d20
-  .long  0xf3458db5                          // vmul.f32      d24, d21, d21
-  .long  0xf2600fb6                          // vrsqrts.f32   d16, d16, d22
-  .long  0xf2611fb7                          // vrsqrts.f32   d17, d17, d23
-  .long  0xf2622fb8                          // vrsqrts.f32   d18, d18, d24
-  .long  0xf3430db0                          // vmul.f32      d16, d19, d16
-  .long  0xf3441db1                          // vmul.f32      d17, d20, d17
-  .long  0xf3452db2                          // vmul.f32      d18, d21, d18
-  .long  0xf3fb35a0                          // vrsqrte.f32   d19, d16
-  .long  0xf3fb45a1                          // vrsqrte.f32   d20, d17
-  .long  0xf3fb55a2                          // vrsqrte.f32   d21, d18
-  .long  0xf3436db3                          // vmul.f32      d22, d19, d19
-  .long  0xf3447db4                          // vmul.f32      d23, d20, d20
-  .long  0xf3458db5                          // vmul.f32      d24, d21, d21
-  .long  0xf2600fb6                          // vrsqrts.f32   d16, d16, d22
-  .long  0xf2611fb7                          // vrsqrts.f32   d17, d17, d23
-  .long  0xf2622fb8                          // vrsqrts.f32   d18, d18, d24
-  .long  0xf3430db0                          // vmul.f32      d16, d19, d16
-  .long  0xf3441db1                          // vmul.f32      d17, d20, d17
-  .long  0xf3452db2                          // vmul.f32      d18, d21, d18
-  .long  0xf3fb35a0                          // vrsqrte.f32   d19, d16
-  .long  0xf3fb45a1                          // vrsqrte.f32   d20, d17
-  .long  0xf3fb55a2                          // vrsqrte.f32   d21, d18
-  .long  0xf3436db3                          // vmul.f32      d22, d19, d19
-  .long  0xf3447db4                          // vmul.f32      d23, d20, d20
-  .long  0xf3458db5                          // vmul.f32      d24, d21, d21
-  .long  0xf2600fb6                          // vrsqrts.f32   d16, d16, d22
-  .long  0xf2611fb7                          // vrsqrts.f32   d17, d17, d23
-  .long  0xf2622fb8                          // vrsqrts.f32   d18, d18, d24
-  .long  0xf3430db0                          // vmul.f32      d16, d19, d16
-  .long  0xf3441db1                          // vmul.f32      d17, d20, d17
-  .long  0xf3452db2                          // vmul.f32      d18, d21, d18
-  .long  0xf3fb35a0                          // vrsqrte.f32   d19, d16
-  .long  0xf3fb45a1                          // vrsqrte.f32   d20, d17
-  .long  0xf3fb55a2                          // vrsqrte.f32   d21, d18
-  .long  0xf340bdb0                          // vmul.f32      d27, d16, d16
-  .long  0xf341ddb1                          // vmul.f32      d29, d17, d17
-  .long  0xf3436db3                          // vmul.f32      d22, d19, d19
-  .long  0xf3447db4                          // vmul.f32      d23, d20, d20
-  .long  0xf3458db5                          // vmul.f32      d24, d21, d21
-  .long  0xf2606fb6                          // vrsqrts.f32   d22, d16, d22
-  .long  0xf2617fb7                          // vrsqrts.f32   d23, d17, d23
-  .long  0xf2628fb8                          // vrsqrts.f32   d24, d18, d24
-  .long  0xf3400dbb                          // vmul.f32      d16, d16, d27
-  .long  0xf3411dbd                          // vmul.f32      d17, d17, d29
-  .long  0xf341bd11                          // vmul.f32      d27, d1, d1
+  .long  0xf3fb3722                          // vcvt.s32.f32  d19, d18
+  .long  0xf3fb4721                          // vcvt.s32.f32  d20, d17
+  .long  0xf3fb5720                          // vcvt.s32.f32  d21, d16
+  .long  0xf3fb3623                          // vcvt.f32.s32  d19, d19
+  .long  0xf3fb4624                          // vcvt.f32.s32  d20, d20
+  .long  0xf3fb5625                          // vcvt.f32.s32  d21, d21
+  .long  0xf3636ea2                          // vcgt.f32      d22, d19, d18
+  .long  0xf3647ea1                          // vcgt.f32      d23, d20, d17
+  .long  0xf3658ea0                          // vcgt.f32      d24, d21, d16
+  .long  0xf35a61b9                          // vbsl          d22, d26, d25
+  .long  0xf35a71b9                          // vbsl          d23, d26, d25
+  .long  0xf2633da6                          // vsub.f32      d19, d19, d22
+  .long  0xeddf6b32                          // vldr          d22, [pc, #200]
+  .long  0xf2644da7                          // vsub.f32      d20, d20, d23
+  .long  0xeddf7b34                          // vldr          d23, [pc, #208]
+  .long  0xf35a81b9                          // vbsl          d24, d26, d25
+  .long  0xf2623da3                          // vsub.f32      d19, d18, d19
+  .long  0xf2614da4                          // vsub.f32      d20, d17, d20
+  .long  0xf2655da8                          // vsub.f32      d21, d21, d24
+  .long  0xf2422da7                          // vadd.f32      d18, d18, d23
+  .long  0xf2260da3                          // vsub.f32      d0, d22, d19
+  .long  0xf2262da4                          // vsub.f32      d2, d22, d20
+  .long  0xf2605da5                          // vsub.f32      d21, d16, d21
+  .long  0xf2411da7                          // vadd.f32      d17, d17, d23
+  .long  0xf2400da7                          // vadd.f32      d16, d16, d23
+  .long  0xeec19a20                          // vdiv.f32      s19, s2, s1
+  .long  0xee819a00                          // vdiv.f32      s18, s2, s0
+  .long  0xeec10a22                          // vdiv.f32      s1, s2, s5
+  .long  0xf2268da5                          // vsub.f32      d8, d22, d21
+  .long  0xeddf6b23                          // vldr          d22, [pc, #140]
+  .long  0xee810a02                          // vdiv.f32      s0, s2, s4
+  .long  0xeec12a28                          // vdiv.f32      s5, s2, s17
+  .long  0xee812a08                          // vdiv.f32      s4, s2, s16
   .long  0xf3433db6                          // vmul.f32      d19, d19, d22
-  .long  0xf3444db7                          // vmul.f32      d20, d20, d23
-  .long  0xf3455db8                          // vmul.f32      d21, d21, d24
-  .long  0xf34b1db1                          // vmul.f32      d17, d27, d17
-  .long  0xf3fb65a3                          // vrsqrte.f32   d22, d19
-  .long  0xf3fb75a4                          // vrsqrte.f32   d23, d20
-  .long  0xf3fb85a5                          // vrsqrte.f32   d24, d21
-  .long  0xf3469db6                          // vmul.f32      d25, d22, d22
-  .long  0xf347adb7                          // vmul.f32      d26, d23, d23
-  .long  0xf348cdb8                          // vmul.f32      d28, d24, d24
-  .long  0xf2633fb9                          // vrsqrts.f32   d19, d19, d25
-  .long  0xf2644fba                          // vrsqrts.f32   d20, d20, d26
-  .long  0xf3429db2                          // vmul.f32      d25, d18, d18
-  .long  0xf2655fbc                          // vrsqrts.f32   d21, d21, d28
-  .long  0xf340ad10                          // vmul.f32      d26, d0, d0
-  .long  0xf3422db9                          // vmul.f32      d18, d18, d25
-  .long  0xf3429d12                          // vmul.f32      d25, d2, d2
-  .long  0xf3463db3                          // vmul.f32      d19, d22, d19
-  .long  0xf3474db4                          // vmul.f32      d20, d23, d20
-  .long  0xf34a0db0                          // vmul.f32      d16, d26, d16
-  .long  0xf3485db5                          // vmul.f32      d21, d24, d21
-  .long  0xf3492db2                          // vmul.f32      d18, d25, d18
-  .long  0xf3400db3                          // vmul.f32      d16, d16, d19
-  .long  0xf3411db4                          // vmul.f32      d17, d17, d20
-  .long  0xf3422db5                          // vmul.f32      d18, d18, d21
-  .long  0xf2c03010                          // vmov.i32      d19, #0
-  .long  0xf2000fa3                          // vmax.f32      d0, d16, d19
-  .long  0xf2011fa3                          // vmax.f32      d1, d17, d19
-  .long  0xf2022fa3                          // vmax.f32      d2, d18, d19
+  .long  0xf3444db6                          // vmul.f32      d20, d20, d22
+  .long  0xf3455db6                          // vmul.f32      d21, d21, d22
+  .long  0xf2622da3                          // vsub.f32      d18, d18, d19
+  .long  0xf2611da4                          // vsub.f32      d17, d17, d20
+  .long  0xf2600da5                          // vsub.f32      d16, d16, d21
+  .long  0xf2c4361b                          // vmov.i32      d19, #1258291200
+  .long  0xf2422d89                          // vadd.f32      d18, d18, d9
+  .long  0xf2411d80                          // vadd.f32      d17, d17, d0
+  .long  0xf2400d82                          // vadd.f32      d16, d16, d2
+  .long  0xf2c3561f                          // vmov.i32      d21, #1056964608
+  .long  0xf2c3461f                          // vmov.i32      d20, #1056964608
+  .long  0xf2425cb3                          // vfma.f32      d21, d18, d19
+  .long  0xf2c3261f                          // vmov.i32      d18, #1056964608
+  .long  0xf2412cb3                          // vfma.f32      d18, d17, d19
+  .long  0xf2404cb3                          // vfma.f32      d20, d16, d19
+  .long  0xf3bb07a5                          // vcvt.u32.f32  d0, d21
+  .long  0xf3bb17a2                          // vcvt.u32.f32  d1, d18
+  .long  0xf3bb27a4                          // vcvt.u32.f32  d2, d20
+  .long  0xecbd8b0a                          // vpop          {d8-d12}
   .long  0xe12fff13                          // bx            r3
+  .long  0x3eb444f9                          // .word         0x3eb444f9
+  .long  0x3eb444f9                          // .word         0x3eb444f9
+  .long  0x3fbfbf75                          // .word         0x3fbfbf75
+  .long  0x3fbfbf75                          // .word         0x3fbfbf75
+  .long  0xc2f87377                          // .word         0xc2f87377
+  .long  0xc2f87377                          // .word         0xc2f87377
+  .long  0x400ccccd                          // .word         0x400ccccd
+  .long  0x400ccccd                          // .word         0x400ccccd
+  .long  0x3f800000                          // .word         0x3f800000
+  .long  0x3f800000                          // .word         0x3f800000
+  .long  0x409af5f8                          // .word         0x409af5f8
+  .long  0x409af5f8                          // .word         0x409af5f8
+  .long  0x3fbebc8d                          // .word         0x3fbebc8d
+  .long  0x3fbebc8d                          // .word         0x3fbebc8d
+  .long  0x42f28c51                          // .word         0x42f28c51
+  .long  0x42f28c51                          // .word         0x42f28c51
+  .long  0x3fdce9a3                          // .word         0x3fdce9a3
+  .long  0x41ddd2fe                          // .word         0x41ddd2fe
 
 HIDDEN _sk_to_2dot2_vfp4
 .globl _sk_to_2dot2_vfp4
 FUNCTION(_sk_to_2dot2_vfp4)
 _sk_to_2dot2_vfp4:
-  .long  0xf3fb0580                          // vrsqrte.f32   d16, d0
+  .long  0xed2d8b0a                          // vpush         {d8-d12}
+  .long  0xf2c70d1f                          // vmov.i32      d16, #8388607
+  .long  0xeddf3b62                          // vldr          d19, [pc, #392]
+  .long  0xed9faa71                          // vldr          s20, [pc, #452]
+  .long  0xf2c34614                          // vmov.i32      d20, #872415232
+  .long  0xf2402130                          // vand          d18, d0, d16
+  .long  0xeddf7b60                          // vldr          d23, [pc, #384]
+  .long  0xf2c3271f                          // vorr.i32      d18, #1056964608
+  .long  0xeddfab64                          // vldr          d26, [pc, #400]
+  .long  0xf2411130                          // vand          d17, d1, d16
   .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf3fb1581                          // vrsqrte.f32   d17, d1
-  .long  0xf3fb3582                          // vrsqrte.f32   d19, d2
-  .long  0xf3402db0                          // vmul.f32      d18, d16, d16
-  .long  0xf3414db1                          // vmul.f32      d20, d17, d17
-  .long  0xf3435db3                          // vmul.f32      d21, d19, d19
-  .long  0xf2602f32                          // vrsqrts.f32   d18, d0, d18
-  .long  0xf2614f34                          // vrsqrts.f32   d20, d1, d20
-  .long  0xf2625f35                          // vrsqrts.f32   d21, d2, d21
-  .long  0xf3402db2                          // vmul.f32      d18, d16, d18
-  .long  0xf3411db4                          // vmul.f32      d17, d17, d20
-  .long  0xf3430db5                          // vmul.f32      d16, d19, d21
-  .long  0xf3fb35a2                          // vrsqrte.f32   d19, d18
-  .long  0xf3fb45a1                          // vrsqrte.f32   d20, d17
-  .long  0xf3fb55a0                          // vrsqrte.f32   d21, d16
-  .long  0xf3fbc522                          // vrecpe.f32    d28, d18
-  .long  0xf3436db3                          // vmul.f32      d22, d19, d19
-  .long  0xf3447db4                          // vmul.f32      d23, d20, d20
-  .long  0xf3458db5                          // vmul.f32      d24, d21, d21
-  .long  0xf2626fb6                          // vrsqrts.f32   d22, d18, d22
-  .long  0xf2617fb7                          // vrsqrts.f32   d23, d17, d23
-  .long  0xf2608fb8                          // vrsqrts.f32   d24, d16, d24
-  .long  0xf2422fbc                          // vrecps.f32    d18, d18, d28
-  .long  0xf3433db6                          // vmul.f32      d19, d19, d22
-  .long  0xf3444db7                          // vmul.f32      d20, d20, d23
-  .long  0xf3455db8                          // vmul.f32      d21, d21, d24
-  .long  0xf34c2db2                          // vmul.f32      d18, d28, d18
-  .long  0xf3fb65a3                          // vrsqrte.f32   d22, d19
-  .long  0xf3fb75a4                          // vrsqrte.f32   d23, d20
-  .long  0xf3fb85a5                          // vrsqrte.f32   d24, d21
-  .long  0xf3469db6                          // vmul.f32      d25, d22, d22
-  .long  0xf347adb7                          // vmul.f32      d26, d23, d23
-  .long  0xf348bdb8                          // vmul.f32      d27, d24, d24
-  .long  0xf2633fb9                          // vrsqrts.f32   d19, d19, d25
-  .long  0xf2644fba                          // vrsqrts.f32   d20, d20, d26
-  .long  0xf2655fbb                          // vrsqrts.f32   d21, d21, d27
-  .long  0xf3463db3                          // vmul.f32      d19, d22, d19
-  .long  0xf3474db4                          // vmul.f32      d20, d23, d20
-  .long  0xf3485db5                          // vmul.f32      d21, d24, d21
-  .long  0xf3fb65a3                          // vrsqrte.f32   d22, d19
-  .long  0xf3fb75a4                          // vrsqrte.f32   d23, d20
-  .long  0xf3fb85a5                          // vrsqrte.f32   d24, d21
-  .long  0xf3469db6                          // vmul.f32      d25, d22, d22
-  .long  0xf347adb7                          // vmul.f32      d26, d23, d23
-  .long  0xf348bdb8                          // vmul.f32      d27, d24, d24
-  .long  0xf2633fb9                          // vrsqrts.f32   d19, d19, d25
-  .long  0xf2644fba                          // vrsqrts.f32   d20, d20, d26
-  .long  0xf2655fbb                          // vrsqrts.f32   d21, d21, d27
-  .long  0xf3463db3                          // vmul.f32      d19, d22, d19
-  .long  0xf3474db4                          // vmul.f32      d20, d23, d20
-  .long  0xf3485db5                          // vmul.f32      d21, d24, d21
-  .long  0xf3fb65a3                          // vrsqrte.f32   d22, d19
-  .long  0xf3fb75a4                          // vrsqrte.f32   d23, d20
-  .long  0xf3fb85a5                          // vrsqrte.f32   d24, d21
-  .long  0xf3469db6                          // vmul.f32      d25, d22, d22
-  .long  0xf347adb7                          // vmul.f32      d26, d23, d23
-  .long  0xf348bdb8                          // vmul.f32      d27, d24, d24
-  .long  0xf2633fb9                          // vrsqrts.f32   d19, d19, d25
-  .long  0xf2644fba                          // vrsqrts.f32   d20, d20, d26
-  .long  0xf2655fbb                          // vrsqrts.f32   d21, d21, d27
-  .long  0xf3463db3                          // vmul.f32      d19, d22, d19
-  .long  0xf3474db4                          // vmul.f32      d20, d23, d20
-  .long  0xf3485db5                          // vmul.f32      d21, d24, d21
-  .long  0xf3fb65a3                          // vrsqrte.f32   d22, d19
-  .long  0xf3fb75a4                          // vrsqrte.f32   d23, d20
-  .long  0xf3fb85a5                          // vrsqrte.f32   d24, d21
-  .long  0xf3432db2                          // vmul.f32      d18, d19, d18
-  .long  0xf3469db6                          // vmul.f32      d25, d22, d22
-  .long  0xf347adb7                          // vmul.f32      d26, d23, d23
-  .long  0xf348bdb8                          // vmul.f32      d27, d24, d24
-  .long  0xf2639fb9                          // vrsqrts.f32   d25, d19, d25
-  .long  0xf264afba                          // vrsqrts.f32   d26, d20, d26
-  .long  0xf265bfbb                          // vrsqrts.f32   d27, d21, d27
-  .long  0xf3466db9                          // vmul.f32      d22, d22, d25
-  .long  0xf3fb9521                          // vrecpe.f32    d25, d17
-  .long  0xf3477dba                          // vmul.f32      d23, d23, d26
-  .long  0xf3fba520                          // vrecpe.f32    d26, d16
-  .long  0xf3488dbb                          // vmul.f32      d24, d24, d27
-  .long  0xf2411fb9                          // vrecps.f32    d17, d17, d25
-  .long  0xf3fbb526                          // vrecpe.f32    d27, d22
-  .long  0xf3fbd527                          // vrecpe.f32    d29, d23
-  .long  0xf2400fba                          // vrecps.f32    d16, d16, d26
-  .long  0xf3fbe528                          // vrecpe.f32    d30, d24
-  .long  0xf2466fbb                          // vrecps.f32    d22, d22, d27
-  .long  0xf2477fbd                          // vrecps.f32    d23, d23, d29
-  .long  0xf2488fbe                          // vrecps.f32    d24, d24, d30
-  .long  0xf3491db1                          // vmul.f32      d17, d25, d17
-  .long  0xf34a0db0                          // vmul.f32      d16, d26, d16
-  .long  0xf34b6db6                          // vmul.f32      d22, d27, d22
-  .long  0xf3441db1                          // vmul.f32      d17, d20, d17
-  .long  0xf34d3db7                          // vmul.f32      d19, d29, d23
-  .long  0xf34e4db8                          // vmul.f32      d20, d30, d24
-  .long  0xf3450db0                          // vmul.f32      d16, d21, d16
-  .long  0xf3422db6                          // vmul.f32      d18, d18, d22
+  .long  0xf2029da3                          // vadd.f32      d9, d18, d19
+  .long  0xf2420130                          // vand          d16, d2, d16
+  .long  0xf2c3171f                          // vorr.i32      d17, #1056964608
+  .long  0xf2c3071f                          // vorr.i32      d16, #1056964608
+  .long  0xf201bda3                          // vadd.f32      d11, d17, d19
+  .long  0xeeca8a29                          // vdiv.f32      s17, s20, s19
+  .long  0xf200cda3                          // vadd.f32      d12, d16, d19
+  .long  0xf3fb3600                          // vcvt.f32.s32  d19, d0
+  .long  0xee8a8a09                          // vdiv.f32      s16, s20, s18
+  .long  0xeeca9a2b                          // vdiv.f32      s19, s20, s23
+  .long  0xeeca0a2c                          // vdiv.f32      s1, s20, s25
+  .long  0xee8a9a0b                          // vdiv.f32      s18, s20, s22
+  .long  0xee8a0a0c                          // vdiv.f32      s0, s20, s24
+  .long  0xf3fb5602                          // vcvt.f32.s32  d21, d2
+  .long  0xf3fb6601                          // vcvt.f32.s32  d22, d1
+  .long  0xed9f1a5b                          // vldr          s2, [pc, #364]
+  .long  0xf3433db4                          // vmul.f32      d19, d19, d20
+  .long  0xf3422db7                          // vmul.f32      d18, d18, d23
+  .long  0xf3455db4                          // vmul.f32      d21, d21, d20
+  .long  0xf3464db4                          // vmul.f32      d20, d22, d20
+  .long  0xeddf6b49                          // vldr          d22, [pc, #292]
+  .long  0xf2433da6                          // vadd.f32      d19, d19, d22
+  .long  0xf3411db7                          // vmul.f32      d17, d17, d23
+  .long  0xf3400db7                          // vmul.f32      d16, d16, d23
+  .long  0xf2444da6                          // vadd.f32      d20, d20, d22
+  .long  0xf2632da2                          // vsub.f32      d18, d19, d18
+  .long  0xeddf3b45                          // vldr          d19, [pc, #276]
+  .long  0xf2455da6                          // vadd.f32      d21, d21, d22
+  .long  0xf2c09010                          // vmov.i32      d25, #0
+  .long  0xf2641da1                          // vsub.f32      d17, d20, d17
+  .long  0xf2622d88                          // vsub.f32      d18, d18, d8
+  .long  0xf2650da0                          // vsub.f32      d16, d21, d16
+  .long  0xf2611d89                          // vsub.f32      d17, d17, d9
+  .long  0xf3422db3                          // vmul.f32      d18, d18, d19
+  .long  0xf2600d80                          // vsub.f32      d16, d16, d0
   .long  0xf3411db3                          // vmul.f32      d17, d17, d19
-  .long  0xf3400db4                          // vmul.f32      d16, d16, d20
-  .long  0xf2c03010                          // vmov.i32      d19, #0
-  .long  0xf2020fa3                          // vmax.f32      d0, d18, d19
-  .long  0xf2011fa3                          // vmax.f32      d1, d17, d19
-  .long  0xf2002fa3                          // vmax.f32      d2, d16, d19
+  .long  0xf3400db3                          // vmul.f32      d16, d16, d19
+  .long  0xf3fb3722                          // vcvt.s32.f32  d19, d18
+  .long  0xf3fb4721                          // vcvt.s32.f32  d20, d17
+  .long  0xf3fb5720                          // vcvt.s32.f32  d21, d16
+  .long  0xf3fb3623                          // vcvt.f32.s32  d19, d19
+  .long  0xf3fb4624                          // vcvt.f32.s32  d20, d20
+  .long  0xf3fb5625                          // vcvt.f32.s32  d21, d21
+  .long  0xf3636ea2                          // vcgt.f32      d22, d19, d18
+  .long  0xf3647ea1                          // vcgt.f32      d23, d20, d17
+  .long  0xf3658ea0                          // vcgt.f32      d24, d21, d16
+  .long  0xf35a61b9                          // vbsl          d22, d26, d25
+  .long  0xf35a71b9                          // vbsl          d23, d26, d25
+  .long  0xf2633da6                          // vsub.f32      d19, d19, d22
+  .long  0xeddf6b32                          // vldr          d22, [pc, #200]
+  .long  0xf2644da7                          // vsub.f32      d20, d20, d23
+  .long  0xeddf7b34                          // vldr          d23, [pc, #208]
+  .long  0xf35a81b9                          // vbsl          d24, d26, d25
+  .long  0xf2623da3                          // vsub.f32      d19, d18, d19
+  .long  0xf2614da4                          // vsub.f32      d20, d17, d20
+  .long  0xf2655da8                          // vsub.f32      d21, d21, d24
+  .long  0xf2422da7                          // vadd.f32      d18, d18, d23
+  .long  0xf2260da3                          // vsub.f32      d0, d22, d19
+  .long  0xf2262da4                          // vsub.f32      d2, d22, d20
+  .long  0xf2605da5                          // vsub.f32      d21, d16, d21
+  .long  0xf2411da7                          // vadd.f32      d17, d17, d23
+  .long  0xf2400da7                          // vadd.f32      d16, d16, d23
+  .long  0xeec19a20                          // vdiv.f32      s19, s2, s1
+  .long  0xee819a00                          // vdiv.f32      s18, s2, s0
+  .long  0xeec10a22                          // vdiv.f32      s1, s2, s5
+  .long  0xf2268da5                          // vsub.f32      d8, d22, d21
+  .long  0xeddf6b23                          // vldr          d22, [pc, #140]
+  .long  0xee810a02                          // vdiv.f32      s0, s2, s4
+  .long  0xeec12a28                          // vdiv.f32      s5, s2, s17
+  .long  0xee812a08                          // vdiv.f32      s4, s2, s16
+  .long  0xf3433db6                          // vmul.f32      d19, d19, d22
+  .long  0xf3444db6                          // vmul.f32      d20, d20, d22
+  .long  0xf3455db6                          // vmul.f32      d21, d21, d22
+  .long  0xf2622da3                          // vsub.f32      d18, d18, d19
+  .long  0xf2611da4                          // vsub.f32      d17, d17, d20
+  .long  0xf2600da5                          // vsub.f32      d16, d16, d21
+  .long  0xf2c4361b                          // vmov.i32      d19, #1258291200
+  .long  0xf2422d89                          // vadd.f32      d18, d18, d9
+  .long  0xf2411d80                          // vadd.f32      d17, d17, d0
+  .long  0xf2400d82                          // vadd.f32      d16, d16, d2
+  .long  0xf2c3561f                          // vmov.i32      d21, #1056964608
+  .long  0xf2c3461f                          // vmov.i32      d20, #1056964608
+  .long  0xf2425cb3                          // vfma.f32      d21, d18, d19
+  .long  0xf2c3261f                          // vmov.i32      d18, #1056964608
+  .long  0xf2412cb3                          // vfma.f32      d18, d17, d19
+  .long  0xf2404cb3                          // vfma.f32      d20, d16, d19
+  .long  0xf3bb07a5                          // vcvt.u32.f32  d0, d21
+  .long  0xf3bb17a2                          // vcvt.u32.f32  d1, d18
+  .long  0xf3bb27a4                          // vcvt.u32.f32  d2, d20
+  .long  0xecbd8b0a                          // vpop          {d8-d12}
   .long  0xe12fff13                          // bx            r3
-  .long  0xe320f000                          // nop           {0}
+  .long  0x3eb444f9                          // .word         0x3eb444f9
+  .long  0x3eb444f9                          // .word         0x3eb444f9
+  .long  0x3fbfbf75                          // .word         0x3fbfbf75
+  .long  0x3fbfbf75                          // .word         0x3fbfbf75
+  .long  0xc2f87377                          // .word         0xc2f87377
+  .long  0xc2f87377                          // .word         0xc2f87377
+  .long  0x3ee8ba2e                          // .word         0x3ee8ba2e
+  .long  0x3ee8ba2e                          // .word         0x3ee8ba2e
+  .long  0x3f800000                          // .word         0x3f800000
+  .long  0x3f800000                          // .word         0x3f800000
+  .long  0x409af5f8                          // .word         0x409af5f8
+  .long  0x409af5f8                          // .word         0x409af5f8
+  .long  0x3fbebc8d                          // .word         0x3fbebc8d
+  .long  0x3fbebc8d                          // .word         0x3fbebc8d
+  .long  0x42f28c51                          // .word         0x42f28c51
+  .long  0x42f28c51                          // .word         0x42f28c51
+  .long  0x3fdce9a3                          // .word         0x3fdce9a3
+  .long  0x41ddd2fe                          // .word         0x41ddd2fe
 
 HIDDEN _sk_rgb_to_hsl_vfp4
 .globl _sk_rgb_to_hsl_vfp4
@@ -7217,7 +7242,7 @@ _sk_linear_gradient_vfp4:
   .long  0xe494c00c                          // ldr           ip, [r4], #12
   .long  0xf4a41c9f                          // vld1.32       {d1[]}, [r4 :32]
   .long  0xe35c0000                          // cmp           ip, #0
-  .long  0x0a000036                          // beq           3080 <sk_linear_gradient_vfp4+0x110>
+  .long  0x0a000036                          // beq           3130 <sk_linear_gradient_vfp4+0x110>
   .long  0xe59e3004                          // ldr           r3, [lr, #4]
   .long  0xf2c01010                          // vmov.i32      d17, #0
   .long  0xf2c07010                          // vmov.i32      d23, #0
@@ -7267,12 +7292,12 @@ _sk_linear_gradient_vfp4:
   .long  0xf26371b3                          // vorr          d23, d19, d19
   .long  0xf26481b4                          // vorr          d24, d20, d20
   .long  0xf26561b5                          // vorr          d22, d21, d21
-  .long  0x1affffd3                          // bne           2fbc <sk_linear_gradient_vfp4+0x4c>
+  .long  0x1affffd3                          // bne           306c <sk_linear_gradient_vfp4+0x4c>
   .long  0xf26c01bc                          // vorr          d16, d28, d28
   .long  0xf22b11bb                          // vorr          d1, d27, d27
   .long  0xf22a21ba                          // vorr          d2, d26, d26
   .long  0xf22931b9                          // vorr          d3, d25, d25
-  .long  0xea000003                          // b             3090 <sk_linear_gradient_vfp4+0x120>
+  .long  0xea000003                          // b             3140 <sk_linear_gradient_vfp4+0x120>
   .long  0xf2c05010                          // vmov.i32      d21, #0
   .long  0xf2c04010                          // vmov.i32      d20, #0
   .long  0xf2c03010                          // vmov.i32      d19, #0
@@ -8656,85 +8681,232 @@ HIDDEN _sk_from_2dot2_hsw
 .globl _sk_from_2dot2_hsw
 FUNCTION(_sk_from_2dot2_hsw)
 _sk_from_2dot2_hsw:
-  .byte  197,124,82,192                      // vrsqrtps      %ymm0,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,200                   // vrsqrtps      %ymm8,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  197,252,89,192                      // vmulps        %ymm0,%ymm0,%ymm0
-  .byte  196,65,60,89,208                    // vmulps        %ymm8,%ymm8,%ymm10
-  .byte  196,65,60,89,194                    // vmulps        %ymm10,%ymm8,%ymm8
-  .byte  196,193,124,89,192                  // vmulps        %ymm8,%ymm0,%ymm0
-  .byte  197,180,89,192                      // vmulps        %ymm0,%ymm9,%ymm0
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,193,124,95,192                  // vmaxps        %ymm8,%ymm0,%ymm0
-  .byte  197,124,82,201                      // vrsqrtps      %ymm1,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  196,65,124,82,210                   // vrsqrtps      %ymm10,%ymm10
-  .byte  197,244,89,201                      // vmulps        %ymm1,%ymm1,%ymm1
-  .byte  196,65,52,89,217                    // vmulps        %ymm9,%ymm9,%ymm11
-  .byte  196,65,52,89,203                    // vmulps        %ymm11,%ymm9,%ymm9
-  .byte  196,193,116,89,201                  // vmulps        %ymm9,%ymm1,%ymm1
-  .byte  197,172,89,201                      // vmulps        %ymm1,%ymm10,%ymm1
-  .byte  196,193,116,95,200                  // vmaxps        %ymm8,%ymm1,%ymm1
-  .byte  197,124,82,202                      // vrsqrtps      %ymm2,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  196,65,124,82,210                   // vrsqrtps      %ymm10,%ymm10
-  .byte  197,236,89,210                      // vmulps        %ymm2,%ymm2,%ymm2
-  .byte  196,65,52,89,217                    // vmulps        %ymm9,%ymm9,%ymm11
-  .byte  196,65,52,89,203                    // vmulps        %ymm11,%ymm9,%ymm9
-  .byte  196,193,108,89,209                  // vmulps        %ymm9,%ymm2,%ymm2
-  .byte  197,172,89,210                      // vmulps        %ymm2,%ymm10,%ymm2
-  .byte  196,193,108,95,208                  // vmaxps        %ymm8,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,131,236,88                       // sub           $0x58,%rsp
+  .byte  197,252,17,124,36,32                // vmovups       %ymm7,0x20(%rsp)
+  .byte  197,252,17,52,36                    // vmovups       %ymm6,(%rsp)
+  .byte  197,252,17,108,36,224               // vmovups       %ymm5,-0x20(%rsp)
+  .byte  197,252,17,100,36,192               // vmovups       %ymm4,-0x40(%rsp)
+  .byte  197,252,17,92,36,160                // vmovups       %ymm3,-0x60(%rsp)
+  .byte  197,124,40,225                      // vmovaps       %ymm1,%ymm12
+  .byte  65,184,205,204,12,64                // mov           $0x400ccccd,%r8d
+  .byte  197,124,91,208                      // vcvtdq2ps     %ymm0,%ymm10
+  .byte  184,0,0,0,52                        // mov           $0x34000000,%eax
+  .byte  197,121,110,192                     // vmovd         %eax,%xmm8
+  .byte  196,66,125,88,216                   // vpbroadcastd  %xmm8,%ymm11
+  .byte  184,255,255,127,0                   // mov           $0x7fffff,%eax
+  .byte  197,121,110,192                     // vmovd         %eax,%xmm8
+  .byte  196,194,125,88,216                  // vpbroadcastd  %xmm8,%ymm3
+  .byte  197,254,127,92,36,128               // vmovdqu       %ymm3,-0x80(%rsp)
+  .byte  197,101,219,200                     // vpand         %ymm0,%ymm3,%ymm9
+  .byte  184,0,0,0,63                        // mov           $0x3f000000,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,98,125,88,248                   // vpbroadcastd  %xmm0,%ymm15
+  .byte  196,193,53,235,223                  // vpor          %ymm15,%ymm9,%ymm3
+  .byte  184,119,115,248,66                  // mov           $0x42f87377,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,98,125,88,232                   // vpbroadcastd  %xmm0,%ymm13
+  .byte  196,66,37,170,213                   // vfmsub213ps   %ymm13,%ymm11,%ymm10
+  .byte  184,117,191,191,63                  // mov           $0x3fbfbf75,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,98,125,88,200                   // vpbroadcastd  %xmm0,%ymm9
+  .byte  196,66,101,188,209                  // vfnmadd231ps  %ymm9,%ymm3,%ymm10
+  .byte  184,163,233,220,63                  // mov           $0x3fdce9a3,%eax
+  .byte  196,65,124,91,244                   // vcvtdq2ps     %ymm12,%ymm14
+  .byte  196,66,37,170,245                   // vfmsub213ps   %ymm13,%ymm11,%ymm14
+  .byte  197,252,91,202                      // vcvtdq2ps     %ymm2,%ymm1
+  .byte  197,124,40,194                      // vmovaps       %ymm2,%ymm8
+  .byte  196,194,37,170,205                  // vfmsub213ps   %ymm13,%ymm11,%ymm1
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,226,125,88,192                  // vpbroadcastd  %xmm0,%ymm0
+  .byte  184,249,68,180,62                   // mov           $0x3eb444f9,%eax
+  .byte  197,249,110,248                     // vmovd         %eax,%xmm7
+  .byte  196,226,125,88,255                  // vpbroadcastd  %xmm7,%ymm7
+  .byte  197,100,88,223                      // vaddps        %ymm7,%ymm3,%ymm11
+  .byte  196,65,124,94,219                   // vdivps        %ymm11,%ymm0,%ymm11
+  .byte  196,65,44,92,211                    // vsubps        %ymm11,%ymm10,%ymm10
+  .byte  196,193,121,110,240                 // vmovd         %r8d,%xmm6
+  .byte  196,226,125,88,246                  // vpbroadcastd  %xmm6,%ymm6
+  .byte  196,65,76,89,210                    // vmulps        %ymm10,%ymm6,%ymm10
+  .byte  196,67,125,8,218,1                  // vroundps      $0x1,%ymm10,%ymm11
+  .byte  196,65,44,92,219                    // vsubps        %ymm11,%ymm10,%ymm11
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d
+  .byte  184,81,140,242,66                   // mov           $0x42f28c51,%eax
+  .byte  197,249,110,232                     // vmovd         %eax,%xmm5
+  .byte  196,226,125,88,237                  // vpbroadcastd  %xmm5,%ymm5
+  .byte  196,65,84,88,210                    // vaddps        %ymm10,%ymm5,%ymm10
+  .byte  184,141,188,190,63                  // mov           $0x3fbebc8d,%eax
+  .byte  197,249,110,224                     // vmovd         %eax,%xmm4
+  .byte  196,226,125,88,228                  // vpbroadcastd  %xmm4,%ymm4
+  .byte  196,66,93,188,211                   // vfnmadd231ps  %ymm11,%ymm4,%ymm10
+  .byte  184,254,210,221,65                  // mov           $0x41ddd2fe,%eax
+  .byte  197,249,110,216                     // vmovd         %eax,%xmm3
+  .byte  196,226,125,88,219                  // vpbroadcastd  %xmm3,%ymm3
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,226,125,88,210                  // vpbroadcastd  %xmm2,%ymm2
+  .byte  196,65,108,92,219                   // vsubps        %ymm11,%ymm2,%ymm11
+  .byte  196,65,100,94,219                   // vdivps        %ymm11,%ymm3,%ymm11
+  .byte  196,65,44,88,211                    // vaddps        %ymm11,%ymm10,%ymm10
+  .byte  197,124,16,108,36,128               // vmovups       -0x80(%rsp),%ymm13
+  .byte  196,65,20,84,220                    // vandps        %ymm12,%ymm13,%ymm11
+  .byte  196,65,36,86,223                    // vorps         %ymm15,%ymm11,%ymm11
+  .byte  196,66,37,188,241                   // vfnmadd231ps  %ymm9,%ymm11,%ymm14
+  .byte  197,36,88,223                       // vaddps        %ymm7,%ymm11,%ymm11
+  .byte  196,65,124,94,219                   // vdivps        %ymm11,%ymm0,%ymm11
+  .byte  196,65,12,92,219                    // vsubps        %ymm11,%ymm14,%ymm11
+  .byte  196,65,76,89,219                    // vmulps        %ymm11,%ymm6,%ymm11
+  .byte  196,67,125,8,227,1                  // vroundps      $0x1,%ymm11,%ymm12
+  .byte  196,65,36,92,228                    // vsubps        %ymm12,%ymm11,%ymm12
+  .byte  196,65,84,88,219                    // vaddps        %ymm11,%ymm5,%ymm11
+  .byte  196,66,93,188,220                   // vfnmadd231ps  %ymm12,%ymm4,%ymm11
+  .byte  196,65,108,92,228                   // vsubps        %ymm12,%ymm2,%ymm12
+  .byte  196,65,100,94,228                   // vdivps        %ymm12,%ymm3,%ymm12
+  .byte  196,65,36,88,220                    // vaddps        %ymm12,%ymm11,%ymm11
+  .byte  196,65,20,84,192                    // vandps        %ymm8,%ymm13,%ymm8
+  .byte  196,65,60,86,199                    // vorps         %ymm15,%ymm8,%ymm8
+  .byte  196,194,61,188,201                  // vfnmadd231ps  %ymm9,%ymm8,%ymm1
+  .byte  197,188,88,255                      // vaddps        %ymm7,%ymm8,%ymm7
+  .byte  197,252,94,199                      // vdivps        %ymm7,%ymm0,%ymm0
+  .byte  197,244,92,192                      // vsubps        %ymm0,%ymm1,%ymm0
+  .byte  197,204,89,192                      // vmulps        %ymm0,%ymm6,%ymm0
+  .byte  196,227,125,8,200,1                 // vroundps      $0x1,%ymm0,%ymm1
+  .byte  197,252,92,201                      // vsubps        %ymm1,%ymm0,%ymm1
+  .byte  197,212,88,192                      // vaddps        %ymm0,%ymm5,%ymm0
+  .byte  196,226,117,172,224                 // vfnmadd213ps  %ymm0,%ymm1,%ymm4
+  .byte  197,236,92,193                      // vsubps        %ymm1,%ymm2,%ymm0
+  .byte  197,228,94,192                      // vdivps        %ymm0,%ymm3,%ymm0
+  .byte  197,220,88,192                      // vaddps        %ymm0,%ymm4,%ymm0
+  .byte  196,193,121,110,200                 // vmovd         %r8d,%xmm1
+  .byte  196,226,125,88,201                  // vpbroadcastd  %xmm1,%ymm1
+  .byte  196,193,116,89,210                  // vmulps        %ymm10,%ymm1,%ymm2
+  .byte  196,193,116,89,219                  // vmulps        %ymm11,%ymm1,%ymm3
+  .byte  197,244,89,224                      // vmulps        %ymm0,%ymm1,%ymm4
+  .byte  197,253,91,194                      // vcvtps2dq     %ymm2,%ymm0
+  .byte  197,253,91,203                      // vcvtps2dq     %ymm3,%ymm1
+  .byte  197,253,91,212                      // vcvtps2dq     %ymm4,%ymm2
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  197,252,16,92,36,160                // vmovups       -0x60(%rsp),%ymm3
+  .byte  197,252,16,100,36,192               // vmovups       -0x40(%rsp),%ymm4
+  .byte  197,252,16,108,36,224               // vmovups       -0x20(%rsp),%ymm5
+  .byte  197,252,16,52,36                    // vmovups       (%rsp),%ymm6
+  .byte  197,252,16,124,36,32                // vmovups       0x20(%rsp),%ymm7
+  .byte  72,131,196,88                       // add           $0x58,%rsp
   .byte  255,224                             // jmpq          *%rax
 
 HIDDEN _sk_to_2dot2_hsw
 .globl _sk_to_2dot2_hsw
 FUNCTION(_sk_to_2dot2_hsw)
 _sk_to_2dot2_hsw:
-  .byte  197,252,82,192                      // vrsqrtps      %ymm0,%ymm0
-  .byte  197,124,82,192                      // vrsqrtps      %ymm0,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,200                   // vrsqrtps      %ymm8,%ymm9
-  .byte  197,252,83,192                      // vrcpps        %ymm0,%ymm0
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  196,65,124,83,193                   // vrcpps        %ymm9,%ymm8
-  .byte  196,193,124,89,192                  // vmulps        %ymm8,%ymm0,%ymm0
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,193,124,95,192                  // vmaxps        %ymm8,%ymm0,%ymm0
-  .byte  197,252,82,201                      // vrsqrtps      %ymm1,%ymm1
-  .byte  197,124,82,201                      // vrsqrtps      %ymm1,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  197,252,83,201                      // vrcpps        %ymm1,%ymm1
-  .byte  197,180,89,201                      // vmulps        %ymm1,%ymm9,%ymm1
-  .byte  196,65,124,83,202                   // vrcpps        %ymm10,%ymm9
-  .byte  196,193,116,89,201                  // vmulps        %ymm9,%ymm1,%ymm1
-  .byte  196,193,116,95,200                  // vmaxps        %ymm8,%ymm1,%ymm1
-  .byte  197,252,82,210                      // vrsqrtps      %ymm2,%ymm2
-  .byte  197,124,82,202                      // vrsqrtps      %ymm2,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  197,252,83,210                      // vrcpps        %ymm2,%ymm2
-  .byte  197,180,89,210                      // vmulps        %ymm2,%ymm9,%ymm2
-  .byte  196,65,124,83,202                   // vrcpps        %ymm10,%ymm9
-  .byte  196,193,108,89,209                  // vmulps        %ymm9,%ymm2,%ymm2
-  .byte  196,193,108,95,208                  // vmaxps        %ymm8,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,131,236,88                       // sub           $0x58,%rsp
+  .byte  197,252,17,124,36,32                // vmovups       %ymm7,0x20(%rsp)
+  .byte  197,252,17,52,36                    // vmovups       %ymm6,(%rsp)
+  .byte  197,252,17,108,36,224               // vmovups       %ymm5,-0x20(%rsp)
+  .byte  197,252,17,100,36,192               // vmovups       %ymm4,-0x40(%rsp)
+  .byte  197,252,17,92,36,160                // vmovups       %ymm3,-0x60(%rsp)
+  .byte  197,124,40,225                      // vmovaps       %ymm1,%ymm12
+  .byte  65,184,46,186,232,62                // mov           $0x3ee8ba2e,%r8d
+  .byte  197,124,91,208                      // vcvtdq2ps     %ymm0,%ymm10
+  .byte  184,0,0,0,52                        // mov           $0x34000000,%eax
+  .byte  197,121,110,192                     // vmovd         %eax,%xmm8
+  .byte  196,66,125,88,216                   // vpbroadcastd  %xmm8,%ymm11
+  .byte  184,255,255,127,0                   // mov           $0x7fffff,%eax
+  .byte  197,121,110,192                     // vmovd         %eax,%xmm8
+  .byte  196,194,125,88,216                  // vpbroadcastd  %xmm8,%ymm3
+  .byte  197,254,127,92,36,128               // vmovdqu       %ymm3,-0x80(%rsp)
+  .byte  197,101,219,200                     // vpand         %ymm0,%ymm3,%ymm9
+  .byte  184,0,0,0,63                        // mov           $0x3f000000,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,98,125,88,248                   // vpbroadcastd  %xmm0,%ymm15
+  .byte  196,193,53,235,223                  // vpor          %ymm15,%ymm9,%ymm3
+  .byte  184,119,115,248,66                  // mov           $0x42f87377,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,98,125,88,232                   // vpbroadcastd  %xmm0,%ymm13
+  .byte  196,66,37,170,213                   // vfmsub213ps   %ymm13,%ymm11,%ymm10
+  .byte  184,117,191,191,63                  // mov           $0x3fbfbf75,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,98,125,88,200                   // vpbroadcastd  %xmm0,%ymm9
+  .byte  196,66,101,188,209                  // vfnmadd231ps  %ymm9,%ymm3,%ymm10
+  .byte  184,163,233,220,63                  // mov           $0x3fdce9a3,%eax
+  .byte  196,65,124,91,244                   // vcvtdq2ps     %ymm12,%ymm14
+  .byte  196,66,37,170,245                   // vfmsub213ps   %ymm13,%ymm11,%ymm14
+  .byte  197,252,91,202                      // vcvtdq2ps     %ymm2,%ymm1
+  .byte  197,124,40,194                      // vmovaps       %ymm2,%ymm8
+  .byte  196,194,37,170,205                  // vfmsub213ps   %ymm13,%ymm11,%ymm1
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,226,125,88,192                  // vpbroadcastd  %xmm0,%ymm0
+  .byte  184,249,68,180,62                   // mov           $0x3eb444f9,%eax
+  .byte  197,249,110,248                     // vmovd         %eax,%xmm7
+  .byte  196,226,125,88,255                  // vpbroadcastd  %xmm7,%ymm7
+  .byte  197,100,88,223                      // vaddps        %ymm7,%ymm3,%ymm11
+  .byte  196,65,124,94,219                   // vdivps        %ymm11,%ymm0,%ymm11
+  .byte  196,65,44,92,211                    // vsubps        %ymm11,%ymm10,%ymm10
+  .byte  196,193,121,110,240                 // vmovd         %r8d,%xmm6
+  .byte  196,226,125,88,246                  // vpbroadcastd  %xmm6,%ymm6
+  .byte  196,65,76,89,210                    // vmulps        %ymm10,%ymm6,%ymm10
+  .byte  196,67,125,8,218,1                  // vroundps      $0x1,%ymm10,%ymm11
+  .byte  196,65,44,92,219                    // vsubps        %ymm11,%ymm10,%ymm11
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d
+  .byte  184,81,140,242,66                   // mov           $0x42f28c51,%eax
+  .byte  197,249,110,232                     // vmovd         %eax,%xmm5
+  .byte  196,226,125,88,237                  // vpbroadcastd  %xmm5,%ymm5
+  .byte  196,65,84,88,210                    // vaddps        %ymm10,%ymm5,%ymm10
+  .byte  184,141,188,190,63                  // mov           $0x3fbebc8d,%eax
+  .byte  197,249,110,224                     // vmovd         %eax,%xmm4
+  .byte  196,226,125,88,228                  // vpbroadcastd  %xmm4,%ymm4
+  .byte  196,66,93,188,211                   // vfnmadd231ps  %ymm11,%ymm4,%ymm10
+  .byte  184,254,210,221,65                  // mov           $0x41ddd2fe,%eax
+  .byte  197,249,110,216                     // vmovd         %eax,%xmm3
+  .byte  196,226,125,88,219                  // vpbroadcastd  %xmm3,%ymm3
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,226,125,88,210                  // vpbroadcastd  %xmm2,%ymm2
+  .byte  196,65,108,92,219                   // vsubps        %ymm11,%ymm2,%ymm11
+  .byte  196,65,100,94,219                   // vdivps        %ymm11,%ymm3,%ymm11
+  .byte  196,65,44,88,211                    // vaddps        %ymm11,%ymm10,%ymm10
+  .byte  197,124,16,108,36,128               // vmovups       -0x80(%rsp),%ymm13
+  .byte  196,65,20,84,220                    // vandps        %ymm12,%ymm13,%ymm11
+  .byte  196,65,36,86,223                    // vorps         %ymm15,%ymm11,%ymm11
+  .byte  196,66,37,188,241                   // vfnmadd231ps  %ymm9,%ymm11,%ymm14
+  .byte  197,36,88,223                       // vaddps        %ymm7,%ymm11,%ymm11
+  .byte  196,65,124,94,219                   // vdivps        %ymm11,%ymm0,%ymm11
+  .byte  196,65,12,92,219                    // vsubps        %ymm11,%ymm14,%ymm11
+  .byte  196,65,76,89,219                    // vmulps        %ymm11,%ymm6,%ymm11
+  .byte  196,67,125,8,227,1                  // vroundps      $0x1,%ymm11,%ymm12
+  .byte  196,65,36,92,228                    // vsubps        %ymm12,%ymm11,%ymm12
+  .byte  196,65,84,88,219                    // vaddps        %ymm11,%ymm5,%ymm11
+  .byte  196,66,93,188,220                   // vfnmadd231ps  %ymm12,%ymm4,%ymm11
+  .byte  196,65,108,92,228                   // vsubps        %ymm12,%ymm2,%ymm12
+  .byte  196,65,100,94,228                   // vdivps        %ymm12,%ymm3,%ymm12
+  .byte  196,65,36,88,220                    // vaddps        %ymm12,%ymm11,%ymm11
+  .byte  196,65,20,84,192                    // vandps        %ymm8,%ymm13,%ymm8
+  .byte  196,65,60,86,199                    // vorps         %ymm15,%ymm8,%ymm8
+  .byte  196,194,61,188,201                  // vfnmadd231ps  %ymm9,%ymm8,%ymm1
+  .byte  197,188,88,255                      // vaddps        %ymm7,%ymm8,%ymm7
+  .byte  197,252,94,199                      // vdivps        %ymm7,%ymm0,%ymm0
+  .byte  197,244,92,192                      // vsubps        %ymm0,%ymm1,%ymm0
+  .byte  197,204,89,192                      // vmulps        %ymm0,%ymm6,%ymm0
+  .byte  196,227,125,8,200,1                 // vroundps      $0x1,%ymm0,%ymm1
+  .byte  197,252,92,201                      // vsubps        %ymm1,%ymm0,%ymm1
+  .byte  197,212,88,192                      // vaddps        %ymm0,%ymm5,%ymm0
+  .byte  196,226,117,172,224                 // vfnmadd213ps  %ymm0,%ymm1,%ymm4
+  .byte  197,236,92,193                      // vsubps        %ymm1,%ymm2,%ymm0
+  .byte  197,228,94,192                      // vdivps        %ymm0,%ymm3,%ymm0
+  .byte  197,220,88,192                      // vaddps        %ymm0,%ymm4,%ymm0
+  .byte  196,193,121,110,200                 // vmovd         %r8d,%xmm1
+  .byte  196,226,125,88,201                  // vpbroadcastd  %xmm1,%ymm1
+  .byte  196,193,116,89,210                  // vmulps        %ymm10,%ymm1,%ymm2
+  .byte  196,193,116,89,219                  // vmulps        %ymm11,%ymm1,%ymm3
+  .byte  197,244,89,224                      // vmulps        %ymm0,%ymm1,%ymm4
+  .byte  197,253,91,194                      // vcvtps2dq     %ymm2,%ymm0
+  .byte  197,253,91,203                      // vcvtps2dq     %ymm3,%ymm1
+  .byte  197,253,91,212                      // vcvtps2dq     %ymm4,%ymm2
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  197,252,16,92,36,160                // vmovups       -0x60(%rsp),%ymm3
+  .byte  197,252,16,100,36,192               // vmovups       -0x40(%rsp),%ymm4
+  .byte  197,252,16,108,36,224               // vmovups       -0x20(%rsp),%ymm5
+  .byte  197,252,16,52,36                    // vmovups       (%rsp),%ymm6
+  .byte  197,252,16,124,36,32                // vmovups       0x20(%rsp),%ymm7
+  .byte  72,131,196,88                       // add           $0x58,%rsp
   .byte  255,224                             // jmpq          *%rax
 
 HIDDEN _sk_rgb_to_hsl_hsw
@@ -8930,7 +9102,7 @@ _sk_scale_u8_hsw:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,56                              // jne           11bc <_sk_scale_u8_hsw+0x48>
+  .byte  117,56                              // jne           148c <_sk_scale_u8_hsw+0x48>
   .byte  197,122,126,0                       // vmovq         (%rax),%xmm8
   .byte  196,66,125,49,192                   // vpmovzxbd     %xmm8,%ymm8
   .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
@@ -8954,9 +9126,9 @@ _sk_scale_u8_hsw:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           11c4 <_sk_scale_u8_hsw+0x50>
+  .byte  117,234                             // jne           1494 <_sk_scale_u8_hsw+0x50>
   .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  235,167                             // jmp           1188 <_sk_scale_u8_hsw+0x14>
+  .byte  235,167                             // jmp           1458 <_sk_scale_u8_hsw+0x14>
 
 HIDDEN _sk_lerp_1_float_hsw
 .globl _sk_lerp_1_float_hsw
@@ -8984,7 +9156,7 @@ _sk_lerp_u8_hsw:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,76                              // jne           126c <_sk_lerp_u8_hsw+0x5c>
+  .byte  117,76                              // jne           153c <_sk_lerp_u8_hsw+0x5c>
   .byte  197,122,126,0                       // vmovq         (%rax),%xmm8
   .byte  196,66,125,49,192                   // vpmovzxbd     %xmm8,%ymm8
   .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
@@ -9012,9 +9184,9 @@ _sk_lerp_u8_hsw:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           1274 <_sk_lerp_u8_hsw+0x64>
+  .byte  117,234                             // jne           1544 <_sk_lerp_u8_hsw+0x64>
   .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  235,147                             // jmp           1224 <_sk_lerp_u8_hsw+0x14>
+  .byte  235,147                             // jmp           14f4 <_sk_lerp_u8_hsw+0x14>
 
 HIDDEN _sk_lerp_565_hsw
 .globl _sk_lerp_565_hsw
@@ -9023,7 +9195,7 @@ _sk_lerp_565_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,179,0,0,0                    // jne           1352 <_sk_lerp_565_hsw+0xc1>
+  .byte  15,133,179,0,0,0                    // jne           1622 <_sk_lerp_565_hsw+0xc1>
   .byte  196,193,122,111,28,122              // vmovdqu       (%r10,%rdi,2),%xmm3
   .byte  196,98,125,51,195                   // vpmovzxwd     %xmm3,%ymm8
   .byte  184,0,248,0,0                       // mov           $0xf800,%eax
@@ -9069,9 +9241,9 @@ _sk_lerp_565_hsw:
   .byte  197,225,239,219                     // vpxor         %xmm3,%xmm3,%xmm3
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,59,255,255,255               // ja            12a5 <_sk_lerp_565_hsw+0x14>
+  .byte  15,135,59,255,255,255               // ja            1575 <_sk_lerp_565_hsw+0x14>
   .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  76,141,13,75,0,0,0                  // lea           0x4b(%rip),%r9        # 13c0 <_sk_lerp_565_hsw+0x12f>
+  .byte  76,141,13,75,0,0,0                  // lea           0x4b(%rip),%r9        # 1690 <_sk_lerp_565_hsw+0x12f>
   .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
   .byte  76,1,200                            // add           %r9,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -9083,7 +9255,7 @@ _sk_lerp_565_hsw:
   .byte  196,193,97,196,92,122,4,2           // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
   .byte  196,193,97,196,92,122,2,1           // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
   .byte  196,193,97,196,28,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  233,231,254,255,255                 // jmpq          12a5 <_sk_lerp_565_hsw+0x14>
+  .byte  233,231,254,255,255                 // jmpq          1575 <_sk_lerp_565_hsw+0x14>
   .byte  102,144                             // xchg          %ax,%ax
   .byte  242,255                             // repnz         (bad)
   .byte  255                                 // (bad)
@@ -9118,7 +9290,7 @@ _sk_load_tables_hsw:
   .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
   .byte  76,3,8                              // add           (%rax),%r9
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,121                             // jne           146a <_sk_load_tables_hsw+0x8e>
+  .byte  117,121                             // jne           173a <_sk_load_tables_hsw+0x8e>
   .byte  196,193,126,111,25                  // vmovdqu       (%r9),%ymm3
   .byte  185,255,0,0,0                       // mov           $0xff,%ecx
   .byte  197,249,110,193                     // vmovd         %ecx,%xmm0
@@ -9154,7 +9326,7 @@ _sk_load_tables_hsw:
   .byte  196,193,249,110,194                 // vmovq         %r10,%xmm0
   .byte  196,226,125,33,192                  // vpmovsxbd     %xmm0,%ymm0
   .byte  196,194,125,140,25                  // vpmaskmovd    (%r9),%ymm0,%ymm3
-  .byte  233,99,255,255,255                  // jmpq          13f6 <_sk_load_tables_hsw+0x1a>
+  .byte  233,99,255,255,255                  // jmpq          16c6 <_sk_load_tables_hsw+0x1a>
 
 HIDDEN _sk_load_tables_u16_be_hsw
 .globl _sk_load_tables_u16_be_hsw
@@ -9164,7 +9336,7 @@ _sk_load_tables_u16_be_hsw:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,216,0,0,0                    // jne           1581 <_sk_load_tables_u16_be_hsw+0xee>
+  .byte  15,133,216,0,0,0                    // jne           1851 <_sk_load_tables_u16_be_hsw+0xee>
   .byte  196,1,121,16,4,72                   // vmovupd       (%r8,%r9,2),%xmm8
   .byte  196,129,121,16,84,72,16             // vmovupd       0x10(%r8,%r9,2),%xmm2
   .byte  196,129,121,16,92,72,32             // vmovupd       0x20(%r8,%r9,2),%xmm3
@@ -9213,29 +9385,29 @@ _sk_load_tables_u16_be_hsw:
   .byte  196,1,123,16,4,72                   // vmovsd        (%r8,%r9,2),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,85                              // je            15e7 <_sk_load_tables_u16_be_hsw+0x154>
+  .byte  116,85                              // je            18b7 <_sk_load_tables_u16_be_hsw+0x154>
   .byte  196,1,57,22,68,72,8                 // vmovhpd       0x8(%r8,%r9,2),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,72                              // jb            15e7 <_sk_load_tables_u16_be_hsw+0x154>
+  .byte  114,72                              // jb            18b7 <_sk_load_tables_u16_be_hsw+0x154>
   .byte  196,129,123,16,84,72,16             // vmovsd        0x10(%r8,%r9,2),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,72                              // je            15f4 <_sk_load_tables_u16_be_hsw+0x161>
+  .byte  116,72                              // je            18c4 <_sk_load_tables_u16_be_hsw+0x161>
   .byte  196,129,105,22,84,72,24             // vmovhpd       0x18(%r8,%r9,2),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,59                              // jb            15f4 <_sk_load_tables_u16_be_hsw+0x161>
+  .byte  114,59                              // jb            18c4 <_sk_load_tables_u16_be_hsw+0x161>
   .byte  196,129,123,16,92,72,32             // vmovsd        0x20(%r8,%r9,2),%xmm3
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,250,254,255,255              // je            14c4 <_sk_load_tables_u16_be_hsw+0x31>
+  .byte  15,132,250,254,255,255              // je            1794 <_sk_load_tables_u16_be_hsw+0x31>
   .byte  196,129,97,22,92,72,40              // vmovhpd       0x28(%r8,%r9,2),%xmm3,%xmm3
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,233,254,255,255              // jb            14c4 <_sk_load_tables_u16_be_hsw+0x31>
+  .byte  15,130,233,254,255,255              // jb            1794 <_sk_load_tables_u16_be_hsw+0x31>
   .byte  196,1,122,126,76,72,48              // vmovq         0x30(%r8,%r9,2),%xmm9
-  .byte  233,221,254,255,255                 // jmpq          14c4 <_sk_load_tables_u16_be_hsw+0x31>
+  .byte  233,221,254,255,255                 // jmpq          1794 <_sk_load_tables_u16_be_hsw+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,208,254,255,255                 // jmpq          14c4 <_sk_load_tables_u16_be_hsw+0x31>
+  .byte  233,208,254,255,255                 // jmpq          1794 <_sk_load_tables_u16_be_hsw+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,199,254,255,255                 // jmpq          14c4 <_sk_load_tables_u16_be_hsw+0x31>
+  .byte  233,199,254,255,255                 // jmpq          1794 <_sk_load_tables_u16_be_hsw+0x31>
 
 HIDDEN _sk_load_tables_rgb_u16_be_hsw
 .globl _sk_load_tables_rgb_u16_be_hsw
@@ -9245,7 +9417,7 @@ _sk_load_tables_rgb_u16_be_hsw:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  76,141,12,127                       // lea           (%rdi,%rdi,2),%r9
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,207,0,0,0                    // jne           16de <_sk_load_tables_rgb_u16_be_hsw+0xe1>
+  .byte  15,133,207,0,0,0                    // jne           19ae <_sk_load_tables_rgb_u16_be_hsw+0xe1>
   .byte  196,129,122,111,4,72                // vmovdqu       (%r8,%r9,2),%xmm0
   .byte  196,129,122,111,84,72,12            // vmovdqu       0xc(%r8,%r9,2),%xmm2
   .byte  196,129,122,111,76,72,24            // vmovdqu       0x18(%r8,%r9,2),%xmm1
@@ -9292,36 +9464,36 @@ _sk_load_tables_rgb_u16_be_hsw:
   .byte  196,129,121,110,4,72                // vmovd         (%r8,%r9,2),%xmm0
   .byte  196,129,121,196,68,72,4,2           // vpinsrw       $0x2,0x4(%r8,%r9,2),%xmm0,%xmm0
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  117,5                               // jne           16f7 <_sk_load_tables_rgb_u16_be_hsw+0xfa>
-  .byte  233,76,255,255,255                  // jmpq          1643 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  .byte  117,5                               // jne           19c7 <_sk_load_tables_rgb_u16_be_hsw+0xfa>
+  .byte  233,76,255,255,255                  // jmpq          1913 <_sk_load_tables_rgb_u16_be_hsw+0x46>
   .byte  196,129,121,110,76,72,6             // vmovd         0x6(%r8,%r9,2),%xmm1
   .byte  196,1,113,196,68,72,10,2            // vpinsrw       $0x2,0xa(%r8,%r9,2),%xmm1,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,26                              // jb            1726 <_sk_load_tables_rgb_u16_be_hsw+0x129>
+  .byte  114,26                              // jb            19f6 <_sk_load_tables_rgb_u16_be_hsw+0x129>
   .byte  196,129,121,110,76,72,12            // vmovd         0xc(%r8,%r9,2),%xmm1
   .byte  196,129,113,196,84,72,16,2          // vpinsrw       $0x2,0x10(%r8,%r9,2),%xmm1,%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  117,10                              // jne           172b <_sk_load_tables_rgb_u16_be_hsw+0x12e>
-  .byte  233,29,255,255,255                  // jmpq          1643 <_sk_load_tables_rgb_u16_be_hsw+0x46>
-  .byte  233,24,255,255,255                  // jmpq          1643 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  .byte  117,10                              // jne           19fb <_sk_load_tables_rgb_u16_be_hsw+0x12e>
+  .byte  233,29,255,255,255                  // jmpq          1913 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  .byte  233,24,255,255,255                  // jmpq          1913 <_sk_load_tables_rgb_u16_be_hsw+0x46>
   .byte  196,129,121,110,76,72,18            // vmovd         0x12(%r8,%r9,2),%xmm1
   .byte  196,1,113,196,76,72,22,2            // vpinsrw       $0x2,0x16(%r8,%r9,2),%xmm1,%xmm9
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,26                              // jb            175a <_sk_load_tables_rgb_u16_be_hsw+0x15d>
+  .byte  114,26                              // jb            1a2a <_sk_load_tables_rgb_u16_be_hsw+0x15d>
   .byte  196,129,121,110,76,72,24            // vmovd         0x18(%r8,%r9,2),%xmm1
   .byte  196,129,113,196,76,72,28,2          // vpinsrw       $0x2,0x1c(%r8,%r9,2),%xmm1,%xmm1
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  117,10                              // jne           175f <_sk_load_tables_rgb_u16_be_hsw+0x162>
-  .byte  233,233,254,255,255                 // jmpq          1643 <_sk_load_tables_rgb_u16_be_hsw+0x46>
-  .byte  233,228,254,255,255                 // jmpq          1643 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  .byte  117,10                              // jne           1a2f <_sk_load_tables_rgb_u16_be_hsw+0x162>
+  .byte  233,233,254,255,255                 // jmpq          1913 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  .byte  233,228,254,255,255                 // jmpq          1913 <_sk_load_tables_rgb_u16_be_hsw+0x46>
   .byte  196,129,121,110,92,72,30            // vmovd         0x1e(%r8,%r9,2),%xmm3
   .byte  196,1,97,196,92,72,34,2             // vpinsrw       $0x2,0x22(%r8,%r9,2),%xmm3,%xmm11
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,20                              // jb            1788 <_sk_load_tables_rgb_u16_be_hsw+0x18b>
+  .byte  114,20                              // jb            1a58 <_sk_load_tables_rgb_u16_be_hsw+0x18b>
   .byte  196,129,121,110,92,72,36            // vmovd         0x24(%r8,%r9,2),%xmm3
   .byte  196,129,97,196,92,72,40,2           // vpinsrw       $0x2,0x28(%r8,%r9,2),%xmm3,%xmm3
-  .byte  233,187,254,255,255                 // jmpq          1643 <_sk_load_tables_rgb_u16_be_hsw+0x46>
-  .byte  233,182,254,255,255                 // jmpq          1643 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  .byte  233,187,254,255,255                 // jmpq          1913 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  .byte  233,182,254,255,255                 // jmpq          1913 <_sk_load_tables_rgb_u16_be_hsw+0x46>
 
 HIDDEN _sk_byte_tables_hsw
 .globl _sk_byte_tables_hsw
@@ -10084,7 +10256,7 @@ _sk_load_a8_hsw:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,50                              // jne           2485 <_sk_load_a8_hsw+0x42>
+  .byte  117,50                              // jne           2755 <_sk_load_a8_hsw+0x42>
   .byte  197,250,126,0                       // vmovq         (%rax),%xmm0
   .byte  196,226,125,49,192                  // vpmovzxbd     %xmm0,%ymm0
   .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
@@ -10107,9 +10279,9 @@ _sk_load_a8_hsw:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           248d <_sk_load_a8_hsw+0x4a>
+  .byte  117,234                             // jne           275d <_sk_load_a8_hsw+0x4a>
   .byte  196,193,249,110,193                 // vmovq         %r9,%xmm0
-  .byte  235,173                             // jmp           2457 <_sk_load_a8_hsw+0x14>
+  .byte  235,173                             // jmp           2727 <_sk_load_a8_hsw+0x14>
 
 HIDDEN _sk_gather_a8_hsw
 .globl _sk_gather_a8_hsw
@@ -10184,7 +10356,7 @@ _sk_store_a8_hsw:
   .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
   .byte  196,65,57,103,192                   // vpackuswb     %xmm8,%xmm8,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           25c2 <_sk_store_a8_hsw+0x3b>
+  .byte  117,10                              // jne           2892 <_sk_store_a8_hsw+0x3b>
   .byte  196,65,123,17,4,57                  // vmovsd        %xmm8,(%r9,%rdi,1)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10192,10 +10364,10 @@ _sk_store_a8_hsw:
   .byte  65,128,224,7                        // and           $0x7,%r8b
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            25be <_sk_store_a8_hsw+0x37>
+  .byte  119,236                             // ja            288e <_sk_store_a8_hsw+0x37>
   .byte  196,66,121,48,192                   // vpmovzxbw     %xmm8,%xmm8
   .byte  65,15,182,192                       // movzbl        %r8b,%eax
-  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 2624 <_sk_store_a8_hsw+0x9d>
+  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 28f4 <_sk_store_a8_hsw+0x9d>
   .byte  73,99,4,128                         // movslq        (%r8,%rax,4),%rax
   .byte  76,1,192                            // add           %r8,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10206,7 +10378,7 @@ _sk_store_a8_hsw:
   .byte  196,67,121,20,68,57,2,4             // vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
   .byte  196,67,121,20,68,57,1,2             // vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
   .byte  196,67,121,20,4,57,0                // vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  .byte  235,154                             // jmp           25be <_sk_store_a8_hsw+0x37>
+  .byte  235,154                             // jmp           288e <_sk_store_a8_hsw+0x37>
   .byte  247,255                             // idiv          %edi
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -10239,7 +10411,7 @@ _sk_load_g8_hsw:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,60                              // jne           268c <_sk_load_g8_hsw+0x4c>
+  .byte  117,60                              // jne           295c <_sk_load_g8_hsw+0x4c>
   .byte  197,250,126,0                       // vmovq         (%rax),%xmm0
   .byte  196,226,125,49,192                  // vpmovzxbd     %xmm0,%ymm0
   .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
@@ -10264,9 +10436,9 @@ _sk_load_g8_hsw:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           2694 <_sk_load_g8_hsw+0x54>
+  .byte  117,234                             // jne           2964 <_sk_load_g8_hsw+0x54>
   .byte  196,193,249,110,193                 // vmovq         %r9,%xmm0
-  .byte  235,163                             // jmp           2654 <_sk_load_g8_hsw+0x14>
+  .byte  235,163                             // jmp           2924 <_sk_load_g8_hsw+0x14>
 
 HIDDEN _sk_gather_g8_hsw
 .globl _sk_gather_g8_hsw
@@ -10335,9 +10507,9 @@ _sk_gather_i8_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  73,137,192                          // mov           %rax,%r8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  116,5                               // je            27a7 <_sk_gather_i8_hsw+0xf>
+  .byte  116,5                               // je            2a77 <_sk_gather_i8_hsw+0xf>
   .byte  76,137,192                          // mov           %r8,%rax
-  .byte  235,2                               // jmp           27a9 <_sk_gather_i8_hsw+0x11>
+  .byte  235,2                               // jmp           2a79 <_sk_gather_i8_hsw+0x11>
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  65,87                               // push          %r15
   .byte  65,86                               // push          %r14
@@ -10410,7 +10582,7 @@ _sk_load_565_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,149,0,0,0                    // jne           295b <_sk_load_565_hsw+0xa3>
+  .byte  15,133,149,0,0,0                    // jne           2c2b <_sk_load_565_hsw+0xa3>
   .byte  196,193,122,111,4,122               // vmovdqu       (%r10,%rdi,2),%xmm0
   .byte  196,226,125,51,208                  // vpmovzxwd     %xmm0,%ymm2
   .byte  184,0,248,0,0                       // mov           $0xf800,%eax
@@ -10450,9 +10622,9 @@ _sk_load_565_hsw:
   .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,89,255,255,255               // ja            28cc <_sk_load_565_hsw+0x14>
+  .byte  15,135,89,255,255,255               // ja            2b9c <_sk_load_565_hsw+0x14>
   .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  76,141,13,74,0,0,0                  // lea           0x4a(%rip),%r9        # 29c8 <_sk_load_565_hsw+0x110>
+  .byte  76,141,13,74,0,0,0                  // lea           0x4a(%rip),%r9        # 2c98 <_sk_load_565_hsw+0x110>
   .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
   .byte  76,1,200                            // add           %r9,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10464,12 +10636,12 @@ _sk_load_565_hsw:
   .byte  196,193,121,196,68,122,4,2          // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,68,122,2,1          // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,4,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  233,5,255,255,255                   // jmpq          28cc <_sk_load_565_hsw+0x14>
+  .byte  233,5,255,255,255                   // jmpq          2b9c <_sk_load_565_hsw+0x14>
   .byte  144                                 // nop
   .byte  243,255                             // repz          (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  235,255                             // jmp           29cd <_sk_load_565_hsw+0x115>
+  .byte  235,255                             // jmp           2c9d <_sk_load_565_hsw+0x115>
   .byte  255                                 // (bad)
   .byte  255,227                             // jmpq          *%rbx
   .byte  255                                 // (bad)
@@ -10596,7 +10768,7 @@ _sk_store_565_hsw:
   .byte  196,67,125,57,193,1                 // vextracti128  $0x1,%ymm8,%xmm9
   .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           2b93 <_sk_store_565_hsw+0x6c>
+  .byte  117,10                              // jne           2e63 <_sk_store_565_hsw+0x6c>
   .byte  196,65,122,127,4,121                // vmovdqu       %xmm8,(%r9,%rdi,2)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10604,9 +10776,9 @@ _sk_store_565_hsw:
   .byte  65,128,224,7                        // and           $0x7,%r8b
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            2b8f <_sk_store_565_hsw+0x68>
+  .byte  119,236                             // ja            2e5f <_sk_store_565_hsw+0x68>
   .byte  65,15,182,192                       // movzbl        %r8b,%eax
-  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 2bf0 <_sk_store_565_hsw+0xc9>
+  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 2ec0 <_sk_store_565_hsw+0xc9>
   .byte  73,99,4,128                         // movslq        (%r8,%rax,4),%rax
   .byte  76,1,192                            // add           %r8,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10617,7 +10789,7 @@ _sk_store_565_hsw:
   .byte  196,67,121,21,68,121,4,2            // vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   .byte  196,67,121,21,68,121,2,1            // vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   .byte  196,67,121,21,4,121,0               // vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  .byte  235,159                             // jmp           2b8f <_sk_store_565_hsw+0x68>
+  .byte  235,159                             // jmp           2e5f <_sk_store_565_hsw+0x68>
   .byte  247,255                             // idiv          %edi
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -10648,7 +10820,7 @@ _sk_load_4444_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,179,0,0,0                    // jne           2ccd <_sk_load_4444_hsw+0xc1>
+  .byte  15,133,179,0,0,0                    // jne           2f9d <_sk_load_4444_hsw+0xc1>
   .byte  196,193,122,111,4,122               // vmovdqu       (%r10,%rdi,2),%xmm0
   .byte  196,98,125,51,200                   // vpmovzxwd     %xmm0,%ymm9
   .byte  184,0,240,0,0                       // mov           $0xf000,%eax
@@ -10694,9 +10866,9 @@ _sk_load_4444_hsw:
   .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,59,255,255,255               // ja            2c20 <_sk_load_4444_hsw+0x14>
+  .byte  15,135,59,255,255,255               // ja            2ef0 <_sk_load_4444_hsw+0x14>
   .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  76,141,13,76,0,0,0                  // lea           0x4c(%rip),%r9        # 2d3c <_sk_load_4444_hsw+0x130>
+  .byte  76,141,13,76,0,0,0                  // lea           0x4c(%rip),%r9        # 300c <_sk_load_4444_hsw+0x130>
   .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
   .byte  76,1,200                            // add           %r9,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10708,13 +10880,13 @@ _sk_load_4444_hsw:
   .byte  196,193,121,196,68,122,4,2          // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,68,122,2,1          // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,4,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  233,231,254,255,255                 // jmpq          2c20 <_sk_load_4444_hsw+0x14>
+  .byte  233,231,254,255,255                 // jmpq          2ef0 <_sk_load_4444_hsw+0x14>
   .byte  15,31,0                             // nopl          (%rax)
   .byte  241                                 // icebp
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  233,255,255,255,225                 // jmpq          ffffffffe2002d44 <_sk_callback_hsw+0xffffffffe1ffeace>
+  .byte  233,255,255,255,225                 // jmpq          ffffffffe2003014 <_sk_callback_hsw+0xffffffffe1ffeace>
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -10846,7 +11018,7 @@ _sk_store_4444_hsw:
   .byte  196,67,125,57,193,1                 // vextracti128  $0x1,%ymm8,%xmm9
   .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           2f2b <_sk_store_4444_hsw+0x72>
+  .byte  117,10                              // jne           31fb <_sk_store_4444_hsw+0x72>
   .byte  196,65,122,127,4,121                // vmovdqu       %xmm8,(%r9,%rdi,2)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10854,9 +11026,9 @@ _sk_store_4444_hsw:
   .byte  65,128,224,7                        // and           $0x7,%r8b
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            2f27 <_sk_store_4444_hsw+0x6e>
+  .byte  119,236                             // ja            31f7 <_sk_store_4444_hsw+0x6e>
   .byte  65,15,182,192                       // movzbl        %r8b,%eax
-  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 2f88 <_sk_store_4444_hsw+0xcf>
+  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 3258 <_sk_store_4444_hsw+0xcf>
   .byte  73,99,4,128                         // movslq        (%r8,%rax,4),%rax
   .byte  76,1,192                            // add           %r8,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -10867,7 +11039,7 @@ _sk_store_4444_hsw:
   .byte  196,67,121,21,68,121,4,2            // vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   .byte  196,67,121,21,68,121,2,1            // vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   .byte  196,67,121,21,4,121,0               // vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  .byte  235,159                             // jmp           2f27 <_sk_store_4444_hsw+0x6e>
+  .byte  235,159                             // jmp           31f7 <_sk_store_4444_hsw+0x6e>
   .byte  247,255                             // idiv          %edi
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -10900,7 +11072,7 @@ _sk_load_8888_hsw:
   .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
   .byte  76,3,8                              // add           (%rax),%r9
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,104                             // jne           3021 <_sk_load_8888_hsw+0x7d>
+  .byte  117,104                             // jne           32f1 <_sk_load_8888_hsw+0x7d>
   .byte  196,193,126,111,25                  // vmovdqu       (%r9),%ymm3
   .byte  184,255,0,0,0                       // mov           $0xff,%eax
   .byte  197,249,110,192                     // vmovd         %eax,%xmm0
@@ -10933,7 +11105,7 @@ _sk_load_8888_hsw:
   .byte  196,225,249,110,192                 // vmovq         %rax,%xmm0
   .byte  196,226,125,33,192                  // vpmovsxbd     %xmm0,%ymm0
   .byte  196,194,125,140,25                  // vpmaskmovd    (%r9),%ymm0,%ymm3
-  .byte  233,116,255,255,255                 // jmpq          2fbe <_sk_load_8888_hsw+0x1a>
+  .byte  233,116,255,255,255                 // jmpq          328e <_sk_load_8888_hsw+0x1a>
 
 HIDDEN _sk_gather_8888_hsw
 .globl _sk_gather_8888_hsw
@@ -10997,7 +11169,7 @@ _sk_store_8888_hsw:
   .byte  196,65,45,235,192                   // vpor          %ymm8,%ymm10,%ymm8
   .byte  196,65,53,235,192                   // vpor          %ymm8,%ymm9,%ymm8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,12                              // jne           3144 <_sk_store_8888_hsw+0x74>
+  .byte  117,12                              // jne           3414 <_sk_store_8888_hsw+0x74>
   .byte  196,65,126,127,1                    // vmovdqu       %ymm8,(%r9)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,137,193                          // mov           %r8,%rcx
@@ -11010,7 +11182,7 @@ _sk_store_8888_hsw:
   .byte  196,97,249,110,200                  // vmovq         %rax,%xmm9
   .byte  196,66,125,33,201                   // vpmovsxbd     %xmm9,%ymm9
   .byte  196,66,53,142,1                     // vpmaskmovd    %ymm8,%ymm9,(%r9)
-  .byte  235,211                             // jmp           313d <_sk_store_8888_hsw+0x6d>
+  .byte  235,211                             // jmp           340d <_sk_store_8888_hsw+0x6d>
 
 HIDDEN _sk_load_f16_hsw
 .globl _sk_load_f16_hsw
@@ -11019,7 +11191,7 @@ _sk_load_f16_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,97                              // jne           31d5 <_sk_load_f16_hsw+0x6b>
+  .byte  117,97                              // jne           34a5 <_sk_load_f16_hsw+0x6b>
   .byte  197,121,16,4,248                    // vmovupd       (%rax,%rdi,8),%xmm8
   .byte  197,249,16,84,248,16                // vmovupd       0x10(%rax,%rdi,8),%xmm2
   .byte  197,249,16,92,248,32                // vmovupd       0x20(%rax,%rdi,8),%xmm3
@@ -11045,29 +11217,29 @@ _sk_load_f16_hsw:
   .byte  197,123,16,4,248                    // vmovsd        (%rax,%rdi,8),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,79                              // je            3234 <_sk_load_f16_hsw+0xca>
+  .byte  116,79                              // je            3504 <_sk_load_f16_hsw+0xca>
   .byte  197,57,22,68,248,8                  // vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,67                              // jb            3234 <_sk_load_f16_hsw+0xca>
+  .byte  114,67                              // jb            3504 <_sk_load_f16_hsw+0xca>
   .byte  197,251,16,84,248,16                // vmovsd        0x10(%rax,%rdi,8),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,68                              // je            3241 <_sk_load_f16_hsw+0xd7>
+  .byte  116,68                              // je            3511 <_sk_load_f16_hsw+0xd7>
   .byte  197,233,22,84,248,24                // vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,56                              // jb            3241 <_sk_load_f16_hsw+0xd7>
+  .byte  114,56                              // jb            3511 <_sk_load_f16_hsw+0xd7>
   .byte  197,251,16,92,248,32                // vmovsd        0x20(%rax,%rdi,8),%xmm3
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,114,255,255,255              // je            318b <_sk_load_f16_hsw+0x21>
+  .byte  15,132,114,255,255,255              // je            345b <_sk_load_f16_hsw+0x21>
   .byte  197,225,22,92,248,40                // vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,98,255,255,255               // jb            318b <_sk_load_f16_hsw+0x21>
+  .byte  15,130,98,255,255,255               // jb            345b <_sk_load_f16_hsw+0x21>
   .byte  197,122,126,76,248,48               // vmovq         0x30(%rax,%rdi,8),%xmm9
-  .byte  233,87,255,255,255                  // jmpq          318b <_sk_load_f16_hsw+0x21>
+  .byte  233,87,255,255,255                  // jmpq          345b <_sk_load_f16_hsw+0x21>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,74,255,255,255                  // jmpq          318b <_sk_load_f16_hsw+0x21>
+  .byte  233,74,255,255,255                  // jmpq          345b <_sk_load_f16_hsw+0x21>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,65,255,255,255                  // jmpq          318b <_sk_load_f16_hsw+0x21>
+  .byte  233,65,255,255,255                  // jmpq          345b <_sk_load_f16_hsw+0x21>
 
 HIDDEN _sk_gather_f16_hsw
 .globl _sk_gather_f16_hsw
@@ -11125,7 +11297,7 @@ _sk_store_f16_hsw:
   .byte  196,65,57,98,205                    // vpunpckldq    %xmm13,%xmm8,%xmm9
   .byte  196,65,57,106,197                   // vpunpckhdq    %xmm13,%xmm8,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,27                              // jne           3339 <_sk_store_f16_hsw+0x65>
+  .byte  117,27                              // jne           3609 <_sk_store_f16_hsw+0x65>
   .byte  197,120,17,28,248                   // vmovups       %xmm11,(%rax,%rdi,8)
   .byte  197,120,17,84,248,16                // vmovups       %xmm10,0x10(%rax,%rdi,8)
   .byte  197,120,17,76,248,32                // vmovups       %xmm9,0x20(%rax,%rdi,8)
@@ -11134,22 +11306,22 @@ _sk_store_f16_hsw:
   .byte  255,224                             // jmpq          *%rax
   .byte  197,121,214,28,248                  // vmovq         %xmm11,(%rax,%rdi,8)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,241                             // je            3335 <_sk_store_f16_hsw+0x61>
+  .byte  116,241                             // je            3605 <_sk_store_f16_hsw+0x61>
   .byte  197,121,23,92,248,8                 // vmovhpd       %xmm11,0x8(%rax,%rdi,8)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,229                             // jb            3335 <_sk_store_f16_hsw+0x61>
+  .byte  114,229                             // jb            3605 <_sk_store_f16_hsw+0x61>
   .byte  197,121,214,84,248,16               // vmovq         %xmm10,0x10(%rax,%rdi,8)
-  .byte  116,221                             // je            3335 <_sk_store_f16_hsw+0x61>
+  .byte  116,221                             // je            3605 <_sk_store_f16_hsw+0x61>
   .byte  197,121,23,84,248,24                // vmovhpd       %xmm10,0x18(%rax,%rdi,8)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,209                             // jb            3335 <_sk_store_f16_hsw+0x61>
+  .byte  114,209                             // jb            3605 <_sk_store_f16_hsw+0x61>
   .byte  197,121,214,76,248,32               // vmovq         %xmm9,0x20(%rax,%rdi,8)
-  .byte  116,201                             // je            3335 <_sk_store_f16_hsw+0x61>
+  .byte  116,201                             // je            3605 <_sk_store_f16_hsw+0x61>
   .byte  197,121,23,76,248,40                // vmovhpd       %xmm9,0x28(%rax,%rdi,8)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,189                             // jb            3335 <_sk_store_f16_hsw+0x61>
+  .byte  114,189                             // jb            3605 <_sk_store_f16_hsw+0x61>
   .byte  197,121,214,68,248,48               // vmovq         %xmm8,0x30(%rax,%rdi,8)
-  .byte  235,181                             // jmp           3335 <_sk_store_f16_hsw+0x61>
+  .byte  235,181                             // jmp           3605 <_sk_store_f16_hsw+0x61>
 
 HIDDEN _sk_load_u16_be_hsw
 .globl _sk_load_u16_be_hsw
@@ -11159,7 +11331,7 @@ _sk_load_u16_be_hsw:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  72,141,4,189,0,0,0,0                // lea           0x0(,%rdi,4),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,205,0,0,0                    // jne           3463 <_sk_load_u16_be_hsw+0xe3>
+  .byte  15,133,205,0,0,0                    // jne           3733 <_sk_load_u16_be_hsw+0xe3>
   .byte  196,65,121,16,4,64                  // vmovupd       (%r8,%rax,2),%xmm8
   .byte  196,193,121,16,84,64,16             // vmovupd       0x10(%r8,%rax,2),%xmm2
   .byte  196,193,121,16,92,64,32             // vmovupd       0x20(%r8,%rax,2),%xmm3
@@ -11208,29 +11380,29 @@ _sk_load_u16_be_hsw:
   .byte  196,65,123,16,4,64                  // vmovsd        (%r8,%rax,2),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,85                              // je            34c9 <_sk_load_u16_be_hsw+0x149>
+  .byte  116,85                              // je            3799 <_sk_load_u16_be_hsw+0x149>
   .byte  196,65,57,22,68,64,8                // vmovhpd       0x8(%r8,%rax,2),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,72                              // jb            34c9 <_sk_load_u16_be_hsw+0x149>
+  .byte  114,72                              // jb            3799 <_sk_load_u16_be_hsw+0x149>
   .byte  196,193,123,16,84,64,16             // vmovsd        0x10(%r8,%rax,2),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,72                              // je            34d6 <_sk_load_u16_be_hsw+0x156>
+  .byte  116,72                              // je            37a6 <_sk_load_u16_be_hsw+0x156>
   .byte  196,193,105,22,84,64,24             // vmovhpd       0x18(%r8,%rax,2),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,59                              // jb            34d6 <_sk_load_u16_be_hsw+0x156>
+  .byte  114,59                              // jb            37a6 <_sk_load_u16_be_hsw+0x156>
   .byte  196,193,123,16,92,64,32             // vmovsd        0x20(%r8,%rax,2),%xmm3
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,5,255,255,255                // je            33b1 <_sk_load_u16_be_hsw+0x31>
+  .byte  15,132,5,255,255,255                // je            3681 <_sk_load_u16_be_hsw+0x31>
   .byte  196,193,97,22,92,64,40              // vmovhpd       0x28(%r8,%rax,2),%xmm3,%xmm3
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,244,254,255,255              // jb            33b1 <_sk_load_u16_be_hsw+0x31>
+  .byte  15,130,244,254,255,255              // jb            3681 <_sk_load_u16_be_hsw+0x31>
   .byte  196,65,122,126,76,64,48             // vmovq         0x30(%r8,%rax,2),%xmm9
-  .byte  233,232,254,255,255                 // jmpq          33b1 <_sk_load_u16_be_hsw+0x31>
+  .byte  233,232,254,255,255                 // jmpq          3681 <_sk_load_u16_be_hsw+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,219,254,255,255                 // jmpq          33b1 <_sk_load_u16_be_hsw+0x31>
+  .byte  233,219,254,255,255                 // jmpq          3681 <_sk_load_u16_be_hsw+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,210,254,255,255                 // jmpq          33b1 <_sk_load_u16_be_hsw+0x31>
+  .byte  233,210,254,255,255                 // jmpq          3681 <_sk_load_u16_be_hsw+0x31>
 
 HIDDEN _sk_load_rgb_u16_be_hsw
 .globl _sk_load_rgb_u16_be_hsw
@@ -11240,7 +11412,7 @@ _sk_load_rgb_u16_be_hsw:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  72,141,4,127                        // lea           (%rdi,%rdi,2),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,211,0,0,0                    // jne           35c4 <_sk_load_rgb_u16_be_hsw+0xe5>
+  .byte  15,133,211,0,0,0                    // jne           3894 <_sk_load_rgb_u16_be_hsw+0xe5>
   .byte  196,193,122,111,4,64                // vmovdqu       (%r8,%rax,2),%xmm0
   .byte  196,193,122,111,84,64,12            // vmovdqu       0xc(%r8,%rax,2),%xmm2
   .byte  196,193,122,111,76,64,24            // vmovdqu       0x18(%r8,%rax,2),%xmm1
@@ -11290,36 +11462,36 @@ _sk_load_rgb_u16_be_hsw:
   .byte  196,193,121,110,4,64                // vmovd         (%r8,%rax,2),%xmm0
   .byte  196,193,121,196,68,64,4,2           // vpinsrw       $0x2,0x4(%r8,%rax,2),%xmm0,%xmm0
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  117,5                               // jne           35dd <_sk_load_rgb_u16_be_hsw+0xfe>
-  .byte  233,72,255,255,255                  // jmpq          3525 <_sk_load_rgb_u16_be_hsw+0x46>
+  .byte  117,5                               // jne           38ad <_sk_load_rgb_u16_be_hsw+0xfe>
+  .byte  233,72,255,255,255                  // jmpq          37f5 <_sk_load_rgb_u16_be_hsw+0x46>
   .byte  196,193,121,110,76,64,6             // vmovd         0x6(%r8,%rax,2),%xmm1
   .byte  196,65,113,196,68,64,10,2           // vpinsrw       $0x2,0xa(%r8,%rax,2),%xmm1,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,26                              // jb            360c <_sk_load_rgb_u16_be_hsw+0x12d>
+  .byte  114,26                              // jb            38dc <_sk_load_rgb_u16_be_hsw+0x12d>
   .byte  196,193,121,110,76,64,12            // vmovd         0xc(%r8,%rax,2),%xmm1
   .byte  196,193,113,196,84,64,16,2          // vpinsrw       $0x2,0x10(%r8,%rax,2),%xmm1,%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  117,10                              // jne           3611 <_sk_load_rgb_u16_be_hsw+0x132>
-  .byte  233,25,255,255,255                  // jmpq          3525 <_sk_load_rgb_u16_be_hsw+0x46>
-  .byte  233,20,255,255,255                  // jmpq          3525 <_sk_load_rgb_u16_be_hsw+0x46>
+  .byte  117,10                              // jne           38e1 <_sk_load_rgb_u16_be_hsw+0x132>
+  .byte  233,25,255,255,255                  // jmpq          37f5 <_sk_load_rgb_u16_be_hsw+0x46>
+  .byte  233,20,255,255,255                  // jmpq          37f5 <_sk_load_rgb_u16_be_hsw+0x46>
   .byte  196,193,121,110,76,64,18            // vmovd         0x12(%r8,%rax,2),%xmm1
   .byte  196,65,113,196,76,64,22,2           // vpinsrw       $0x2,0x16(%r8,%rax,2),%xmm1,%xmm9
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,26                              // jb            3640 <_sk_load_rgb_u16_be_hsw+0x161>
+  .byte  114,26                              // jb            3910 <_sk_load_rgb_u16_be_hsw+0x161>
   .byte  196,193,121,110,76,64,24            // vmovd         0x18(%r8,%rax,2),%xmm1
   .byte  196,193,113,196,76,64,28,2          // vpinsrw       $0x2,0x1c(%r8,%rax,2),%xmm1,%xmm1
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  117,10                              // jne           3645 <_sk_load_rgb_u16_be_hsw+0x166>
-  .byte  233,229,254,255,255                 // jmpq          3525 <_sk_load_rgb_u16_be_hsw+0x46>
-  .byte  233,224,254,255,255                 // jmpq          3525 <_sk_load_rgb_u16_be_hsw+0x46>
+  .byte  117,10                              // jne           3915 <_sk_load_rgb_u16_be_hsw+0x166>
+  .byte  233,229,254,255,255                 // jmpq          37f5 <_sk_load_rgb_u16_be_hsw+0x46>
+  .byte  233,224,254,255,255                 // jmpq          37f5 <_sk_load_rgb_u16_be_hsw+0x46>
   .byte  196,193,121,110,92,64,30            // vmovd         0x1e(%r8,%rax,2),%xmm3
   .byte  196,65,97,196,92,64,34,2            // vpinsrw       $0x2,0x22(%r8,%rax,2),%xmm3,%xmm11
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,20                              // jb            366e <_sk_load_rgb_u16_be_hsw+0x18f>
+  .byte  114,20                              // jb            393e <_sk_load_rgb_u16_be_hsw+0x18f>
   .byte  196,193,121,110,92,64,36            // vmovd         0x24(%r8,%rax,2),%xmm3
   .byte  196,193,97,196,92,64,40,2           // vpinsrw       $0x2,0x28(%r8,%rax,2),%xmm3,%xmm3
-  .byte  233,183,254,255,255                 // jmpq          3525 <_sk_load_rgb_u16_be_hsw+0x46>
-  .byte  233,178,254,255,255                 // jmpq          3525 <_sk_load_rgb_u16_be_hsw+0x46>
+  .byte  233,183,254,255,255                 // jmpq          37f5 <_sk_load_rgb_u16_be_hsw+0x46>
+  .byte  233,178,254,255,255                 // jmpq          37f5 <_sk_load_rgb_u16_be_hsw+0x46>
 
 HIDDEN _sk_store_u16_be_hsw
 .globl _sk_store_u16_be_hsw
@@ -11368,7 +11540,7 @@ _sk_store_u16_be_hsw:
   .byte  196,65,17,98,200                    // vpunpckldq    %xmm8,%xmm13,%xmm9
   .byte  196,65,17,106,192                   // vpunpckhdq    %xmm8,%xmm13,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,31                              // jne           376e <_sk_store_u16_be_hsw+0xfb>
+  .byte  117,31                              // jne           3a3e <_sk_store_u16_be_hsw+0xfb>
   .byte  196,1,120,17,28,72                  // vmovups       %xmm11,(%r8,%r9,2)
   .byte  196,1,120,17,84,72,16               // vmovups       %xmm10,0x10(%r8,%r9,2)
   .byte  196,1,120,17,76,72,32               // vmovups       %xmm9,0x20(%r8,%r9,2)
@@ -11377,22 +11549,22 @@ _sk_store_u16_be_hsw:
   .byte  255,224                             // jmpq          *%rax
   .byte  196,1,121,214,28,72                 // vmovq         %xmm11,(%r8,%r9,2)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            376a <_sk_store_u16_be_hsw+0xf7>
+  .byte  116,240                             // je            3a3a <_sk_store_u16_be_hsw+0xf7>
   .byte  196,1,121,23,92,72,8                // vmovhpd       %xmm11,0x8(%r8,%r9,2)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            376a <_sk_store_u16_be_hsw+0xf7>
+  .byte  114,227                             // jb            3a3a <_sk_store_u16_be_hsw+0xf7>
   .byte  196,1,121,214,84,72,16              // vmovq         %xmm10,0x10(%r8,%r9,2)
-  .byte  116,218                             // je            376a <_sk_store_u16_be_hsw+0xf7>
+  .byte  116,218                             // je            3a3a <_sk_store_u16_be_hsw+0xf7>
   .byte  196,1,121,23,84,72,24               // vmovhpd       %xmm10,0x18(%r8,%r9,2)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            376a <_sk_store_u16_be_hsw+0xf7>
+  .byte  114,205                             // jb            3a3a <_sk_store_u16_be_hsw+0xf7>
   .byte  196,1,121,214,76,72,32              // vmovq         %xmm9,0x20(%r8,%r9,2)
-  .byte  116,196                             // je            376a <_sk_store_u16_be_hsw+0xf7>
+  .byte  116,196                             // je            3a3a <_sk_store_u16_be_hsw+0xf7>
   .byte  196,1,121,23,76,72,40               // vmovhpd       %xmm9,0x28(%r8,%r9,2)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,183                             // jb            376a <_sk_store_u16_be_hsw+0xf7>
+  .byte  114,183                             // jb            3a3a <_sk_store_u16_be_hsw+0xf7>
   .byte  196,1,121,214,68,72,48              // vmovq         %xmm8,0x30(%r8,%r9,2)
-  .byte  235,174                             // jmp           376a <_sk_store_u16_be_hsw+0xf7>
+  .byte  235,174                             // jmp           3a3a <_sk_store_u16_be_hsw+0xf7>
 
 HIDDEN _sk_load_f32_hsw
 .globl _sk_load_f32_hsw
@@ -11400,10 +11572,10 @@ FUNCTION(_sk_load_f32_hsw)
 _sk_load_f32_hsw:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  119,110                             // ja            3832 <_sk_load_f32_hsw+0x76>
+  .byte  119,110                             // ja            3b02 <_sk_load_f32_hsw+0x76>
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
-  .byte  76,141,21,134,0,0,0                 // lea           0x86(%rip),%r10        # 385c <_sk_load_f32_hsw+0xa0>
+  .byte  76,141,21,134,0,0,0                 // lea           0x86(%rip),%r10        # 3b2c <_sk_load_f32_hsw+0xa0>
   .byte  73,99,4,138                         // movslq        (%r10,%rcx,4),%rax
   .byte  76,1,208                            // add           %r10,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -11462,7 +11634,7 @@ _sk_store_f32_hsw:
   .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
   .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           38e9 <_sk_store_f32_hsw+0x6d>
+  .byte  117,55                              // jne           3bb9 <_sk_store_f32_hsw+0x6d>
   .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -11475,22 +11647,22 @@ _sk_store_f32_hsw:
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            38e5 <_sk_store_f32_hsw+0x69>
+  .byte  116,240                             // je            3bb5 <_sk_store_f32_hsw+0x69>
   .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            38e5 <_sk_store_f32_hsw+0x69>
+  .byte  114,227                             // jb            3bb5 <_sk_store_f32_hsw+0x69>
   .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            38e5 <_sk_store_f32_hsw+0x69>
+  .byte  116,218                             // je            3bb5 <_sk_store_f32_hsw+0x69>
   .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            38e5 <_sk_store_f32_hsw+0x69>
+  .byte  114,205                             // jb            3bb5 <_sk_store_f32_hsw+0x69>
   .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            38e5 <_sk_store_f32_hsw+0x69>
+  .byte  116,195                             // je            3bb5 <_sk_store_f32_hsw+0x69>
   .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            38e5 <_sk_store_f32_hsw+0x69>
+  .byte  114,181                             // jb            3bb5 <_sk_store_f32_hsw+0x69>
   .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           38e5 <_sk_store_f32_hsw+0x69>
+  .byte  235,171                             // jmp           3bb5 <_sk_store_f32_hsw+0x69>
 
 HIDDEN _sk_clamp_x_hsw
 .globl _sk_clamp_x_hsw
@@ -11755,7 +11927,7 @@ _sk_linear_gradient_hsw:
   .byte  196,98,125,24,72,28                 // vbroadcastss  0x1c(%rax),%ymm9
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  15,132,143,0,0,0                    // je            3d75 <_sk_linear_gradient_hsw+0xb5>
+  .byte  15,132,143,0,0,0                    // je            4045 <_sk_linear_gradient_hsw+0xb5>
   .byte  72,139,64,8                         // mov           0x8(%rax),%rax
   .byte  72,131,192,32                       // add           $0x20,%rax
   .byte  196,65,28,87,228                    // vxorps        %ymm12,%ymm12,%ymm12
@@ -11782,8 +11954,8 @@ _sk_linear_gradient_hsw:
   .byte  196,67,13,74,201,208                // vblendvps     %ymm13,%ymm9,%ymm14,%ymm9
   .byte  72,131,192,36                       // add           $0x24,%rax
   .byte  73,255,200                          // dec           %r8
-  .byte  117,140                             // jne           3cff <_sk_linear_gradient_hsw+0x3f>
-  .byte  235,17                              // jmp           3d86 <_sk_linear_gradient_hsw+0xc6>
+  .byte  117,140                             // jne           3fcf <_sk_linear_gradient_hsw+0x3f>
+  .byte  235,17                              // jmp           4056 <_sk_linear_gradient_hsw+0xc6>
   .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
   .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
   .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
@@ -13298,85 +13470,274 @@ HIDDEN _sk_from_2dot2_avx
 .globl _sk_from_2dot2_avx
 FUNCTION(_sk_from_2dot2_avx)
 _sk_from_2dot2_avx:
-  .byte  197,124,82,192                      // vrsqrtps      %ymm0,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,200                   // vrsqrtps      %ymm8,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  197,252,89,192                      // vmulps        %ymm0,%ymm0,%ymm0
-  .byte  196,65,60,89,208                    // vmulps        %ymm8,%ymm8,%ymm10
-  .byte  196,65,60,89,194                    // vmulps        %ymm10,%ymm8,%ymm8
-  .byte  196,193,124,89,192                  // vmulps        %ymm8,%ymm0,%ymm0
-  .byte  197,180,89,192                      // vmulps        %ymm0,%ymm9,%ymm0
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,193,124,95,192                  // vmaxps        %ymm8,%ymm0,%ymm0
-  .byte  197,124,82,201                      // vrsqrtps      %ymm1,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  196,65,124,82,210                   // vrsqrtps      %ymm10,%ymm10
-  .byte  197,244,89,201                      // vmulps        %ymm1,%ymm1,%ymm1
-  .byte  196,65,52,89,217                    // vmulps        %ymm9,%ymm9,%ymm11
-  .byte  196,65,52,89,203                    // vmulps        %ymm11,%ymm9,%ymm9
-  .byte  196,193,116,89,201                  // vmulps        %ymm9,%ymm1,%ymm1
-  .byte  197,172,89,201                      // vmulps        %ymm1,%ymm10,%ymm1
-  .byte  196,193,116,95,200                  // vmaxps        %ymm8,%ymm1,%ymm1
-  .byte  197,124,82,202                      // vrsqrtps      %ymm2,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  196,65,124,82,210                   // vrsqrtps      %ymm10,%ymm10
-  .byte  197,236,89,210                      // vmulps        %ymm2,%ymm2,%ymm2
-  .byte  196,65,52,89,217                    // vmulps        %ymm9,%ymm9,%ymm11
-  .byte  196,65,52,89,203                    // vmulps        %ymm11,%ymm9,%ymm9
-  .byte  196,193,108,89,209                  // vmulps        %ymm9,%ymm2,%ymm2
-  .byte  197,172,89,210                      // vmulps        %ymm2,%ymm10,%ymm2
-  .byte  196,193,108,95,208                  // vmaxps        %ymm8,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,131,236,88                       // sub           $0x58,%rsp
+  .byte  197,252,17,124,36,32                // vmovups       %ymm7,0x20(%rsp)
+  .byte  197,252,17,52,36                    // vmovups       %ymm6,(%rsp)
+  .byte  197,252,17,108,36,224               // vmovups       %ymm5,-0x20(%rsp)
+  .byte  197,252,17,100,36,192               // vmovups       %ymm4,-0x40(%rsp)
+  .byte  197,252,17,92,36,160                // vmovups       %ymm3,-0x60(%rsp)
+  .byte  197,252,17,84,36,128                // vmovups       %ymm2,-0x80(%rsp)
+  .byte  197,252,40,241                      // vmovaps       %ymm1,%ymm6
+  .byte  65,184,205,204,12,64                // mov           $0x400ccccd,%r8d
+  .byte  197,252,91,200                      // vcvtdq2ps     %ymm0,%ymm1
+  .byte  184,0,0,0,52                        // mov           $0x34000000,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,194,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm8
+  .byte  196,193,116,89,200                  // vmulps        %ymm8,%ymm1,%ymm1
+  .byte  184,255,255,127,0                   // mov           $0x7fffff,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  197,249,112,210,0                   // vpshufd       $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,202,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
+  .byte  197,180,84,192                      // vandps        %ymm0,%ymm9,%ymm0
+  .byte  184,0,0,0,63                        // mov           $0x3f000000,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  197,249,112,210,0                   // vpshufd       $0x0,%xmm2,%xmm2
+  .byte  196,227,109,24,234,1                // vinsertf128   $0x1,%xmm2,%ymm2,%ymm5
+  .byte  197,252,86,197                      // vorps         %ymm5,%ymm0,%ymm0
+  .byte  184,119,115,248,66                  // mov           $0x42f87377,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,210,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm10
+  .byte  196,193,116,92,202                  // vsubps        %ymm10,%ymm1,%ymm1
+  .byte  184,117,191,191,63                  // mov           $0x3fbfbf75,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,218,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm11
+  .byte  196,193,124,89,211                  // vmulps        %ymm11,%ymm0,%ymm2
+  .byte  197,244,92,202                      // vsubps        %ymm2,%ymm1,%ymm1
+  .byte  184,163,233,220,63                  // mov           $0x3fdce9a3,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,226,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm12
+  .byte  184,249,68,180,62                   // mov           $0x3eb444f9,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,234,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm13
+  .byte  196,193,124,88,197                  // vaddps        %ymm13,%ymm0,%ymm0
+  .byte  197,156,94,192                      // vdivps        %ymm0,%ymm12,%ymm0
+  .byte  197,244,92,192                      // vsubps        %ymm0,%ymm1,%ymm0
+  .byte  196,193,121,110,200                 // vmovd         %r8d,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,99,117,24,241,1                 // vinsertf128   $0x1,%xmm1,%ymm1,%ymm14
+  .byte  197,140,89,192                      // vmulps        %ymm0,%ymm14,%ymm0
+  .byte  196,227,125,8,200,1                 // vroundps      $0x1,%ymm0,%ymm1
+  .byte  197,252,92,225                      // vsubps        %ymm1,%ymm0,%ymm4
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d
+  .byte  184,81,140,242,66                   // mov           $0x42f28c51,%eax
+  .byte  197,249,110,200                     // vmovd         %eax,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,99,117,24,249,1                 // vinsertf128   $0x1,%xmm1,%ymm1,%ymm15
+  .byte  197,132,88,192                      // vaddps        %ymm0,%ymm15,%ymm0
+  .byte  184,141,188,190,63                  // mov           $0x3fbebc8d,%eax
+  .byte  197,249,110,200                     // vmovd         %eax,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,227,117,24,217,1                // vinsertf128   $0x1,%xmm1,%ymm1,%ymm3
+  .byte  197,228,89,204                      // vmulps        %ymm4,%ymm3,%ymm1
+  .byte  197,252,92,209                      // vsubps        %ymm1,%ymm0,%ymm2
+  .byte  184,254,210,221,65                  // mov           $0x41ddd2fe,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,227,121,4,192,0                 // vpermilps     $0x0,%xmm0,%xmm0
+  .byte  196,227,125,24,200,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm1
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,227,121,4,192,0                 // vpermilps     $0x0,%xmm0,%xmm0
+  .byte  196,227,125,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  .byte  197,252,92,228                      // vsubps        %ymm4,%ymm0,%ymm4
+  .byte  197,244,94,228                      // vdivps        %ymm4,%ymm1,%ymm4
+  .byte  197,236,88,228                      // vaddps        %ymm4,%ymm2,%ymm4
+  .byte  197,252,91,214                      // vcvtdq2ps     %ymm6,%ymm2
+  .byte  196,193,108,89,208                  // vmulps        %ymm8,%ymm2,%ymm2
+  .byte  197,180,84,246                      // vandps        %ymm6,%ymm9,%ymm6
+  .byte  197,204,86,245                      // vorps         %ymm5,%ymm6,%ymm6
+  .byte  196,193,108,92,210                  // vsubps        %ymm10,%ymm2,%ymm2
+  .byte  196,193,76,89,251                   // vmulps        %ymm11,%ymm6,%ymm7
+  .byte  197,236,92,215                      // vsubps        %ymm7,%ymm2,%ymm2
+  .byte  196,193,76,88,245                   // vaddps        %ymm13,%ymm6,%ymm6
+  .byte  197,156,94,246                      // vdivps        %ymm6,%ymm12,%ymm6
+  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
+  .byte  197,140,89,210                      // vmulps        %ymm2,%ymm14,%ymm2
+  .byte  196,227,125,8,242,1                 // vroundps      $0x1,%ymm2,%ymm6
+  .byte  197,236,92,246                      // vsubps        %ymm6,%ymm2,%ymm6
+  .byte  197,132,88,210                      // vaddps        %ymm2,%ymm15,%ymm2
+  .byte  197,228,89,254                      // vmulps        %ymm6,%ymm3,%ymm7
+  .byte  197,236,92,215                      // vsubps        %ymm7,%ymm2,%ymm2
+  .byte  197,252,92,246                      // vsubps        %ymm6,%ymm0,%ymm6
+  .byte  197,244,94,246                      // vdivps        %ymm6,%ymm1,%ymm6
+  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
+  .byte  197,252,16,124,36,128               // vmovups       -0x80(%rsp),%ymm7
+  .byte  197,252,91,247                      // vcvtdq2ps     %ymm7,%ymm6
+  .byte  196,193,76,89,240                   // vmulps        %ymm8,%ymm6,%ymm6
+  .byte  197,180,84,255                      // vandps        %ymm7,%ymm9,%ymm7
+  .byte  197,196,86,237                      // vorps         %ymm5,%ymm7,%ymm5
+  .byte  196,193,76,92,242                   // vsubps        %ymm10,%ymm6,%ymm6
+  .byte  196,193,84,89,251                   // vmulps        %ymm11,%ymm5,%ymm7
+  .byte  197,204,92,247                      // vsubps        %ymm7,%ymm6,%ymm6
+  .byte  196,193,84,88,237                   // vaddps        %ymm13,%ymm5,%ymm5
+  .byte  197,156,94,237                      // vdivps        %ymm5,%ymm12,%ymm5
+  .byte  197,204,92,237                      // vsubps        %ymm5,%ymm6,%ymm5
+  .byte  197,140,89,237                      // vmulps        %ymm5,%ymm14,%ymm5
+  .byte  196,227,125,8,245,1                 // vroundps      $0x1,%ymm5,%ymm6
+  .byte  197,212,92,246                      // vsubps        %ymm6,%ymm5,%ymm6
+  .byte  197,132,88,237                      // vaddps        %ymm5,%ymm15,%ymm5
+  .byte  197,228,89,222                      // vmulps        %ymm6,%ymm3,%ymm3
+  .byte  197,212,92,219                      // vsubps        %ymm3,%ymm5,%ymm3
+  .byte  197,252,92,198                      // vsubps        %ymm6,%ymm0,%ymm0
+  .byte  197,244,94,192                      // vdivps        %ymm0,%ymm1,%ymm0
+  .byte  197,228,88,192                      // vaddps        %ymm0,%ymm3,%ymm0
+  .byte  196,193,121,110,200                 // vmovd         %r8d,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,227,117,24,201,1                // vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
+  .byte  197,244,89,220                      // vmulps        %ymm4,%ymm1,%ymm3
+  .byte  197,244,89,210                      // vmulps        %ymm2,%ymm1,%ymm2
+  .byte  197,244,89,224                      // vmulps        %ymm0,%ymm1,%ymm4
+  .byte  197,253,91,195                      // vcvtps2dq     %ymm3,%ymm0
+  .byte  197,253,91,202                      // vcvtps2dq     %ymm2,%ymm1
+  .byte  197,253,91,212                      // vcvtps2dq     %ymm4,%ymm2
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  197,252,16,92,36,160                // vmovups       -0x60(%rsp),%ymm3
+  .byte  197,252,16,100,36,192               // vmovups       -0x40(%rsp),%ymm4
+  .byte  197,252,16,108,36,224               // vmovups       -0x20(%rsp),%ymm5
+  .byte  197,252,16,52,36                    // vmovups       (%rsp),%ymm6
+  .byte  197,252,16,124,36,32                // vmovups       0x20(%rsp),%ymm7
+  .byte  72,131,196,88                       // add           $0x58,%rsp
   .byte  255,224                             // jmpq          *%rax
 
 HIDDEN _sk_to_2dot2_avx
 .globl _sk_to_2dot2_avx
 FUNCTION(_sk_to_2dot2_avx)
 _sk_to_2dot2_avx:
-  .byte  197,252,82,192                      // vrsqrtps      %ymm0,%ymm0
-  .byte  197,124,82,192                      // vrsqrtps      %ymm0,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,192                   // vrsqrtps      %ymm8,%ymm8
-  .byte  196,65,124,82,200                   // vrsqrtps      %ymm8,%ymm9
-  .byte  197,252,83,192                      // vrcpps        %ymm0,%ymm0
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  196,65,124,83,193                   // vrcpps        %ymm9,%ymm8
-  .byte  196,193,124,89,192                  // vmulps        %ymm8,%ymm0,%ymm0
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,193,124,95,192                  // vmaxps        %ymm8,%ymm0,%ymm0
-  .byte  197,252,82,201                      // vrsqrtps      %ymm1,%ymm1
-  .byte  197,124,82,201                      // vrsqrtps      %ymm1,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  197,252,83,201                      // vrcpps        %ymm1,%ymm1
-  .byte  197,180,89,201                      // vmulps        %ymm1,%ymm9,%ymm1
-  .byte  196,65,124,83,202                   // vrcpps        %ymm10,%ymm9
-  .byte  196,193,116,89,201                  // vmulps        %ymm9,%ymm1,%ymm1
-  .byte  196,193,116,95,200                  // vmaxps        %ymm8,%ymm1,%ymm1
-  .byte  197,252,82,210                      // vrsqrtps      %ymm2,%ymm2
-  .byte  197,124,82,202                      // vrsqrtps      %ymm2,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,124,82,209                   // vrsqrtps      %ymm9,%ymm10
-  .byte  197,252,83,210                      // vrcpps        %ymm2,%ymm2
-  .byte  197,180,89,210                      // vmulps        %ymm2,%ymm9,%ymm2
-  .byte  196,65,124,83,202                   // vrcpps        %ymm10,%ymm9
-  .byte  196,193,108,89,209                  // vmulps        %ymm9,%ymm2,%ymm2
-  .byte  196,193,108,95,208                  // vmaxps        %ymm8,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  72,131,236,88                       // sub           $0x58,%rsp
+  .byte  197,252,17,124,36,32                // vmovups       %ymm7,0x20(%rsp)
+  .byte  197,252,17,52,36                    // vmovups       %ymm6,(%rsp)
+  .byte  197,252,17,108,36,224               // vmovups       %ymm5,-0x20(%rsp)
+  .byte  197,252,17,100,36,192               // vmovups       %ymm4,-0x40(%rsp)
+  .byte  197,252,17,92,36,160                // vmovups       %ymm3,-0x60(%rsp)
+  .byte  197,252,17,84,36,128                // vmovups       %ymm2,-0x80(%rsp)
+  .byte  197,252,40,241                      // vmovaps       %ymm1,%ymm6
+  .byte  65,184,46,186,232,62                // mov           $0x3ee8ba2e,%r8d
+  .byte  197,252,91,200                      // vcvtdq2ps     %ymm0,%ymm1
+  .byte  184,0,0,0,52                        // mov           $0x34000000,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,194,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm8
+  .byte  196,193,116,89,200                  // vmulps        %ymm8,%ymm1,%ymm1
+  .byte  184,255,255,127,0                   // mov           $0x7fffff,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  197,249,112,210,0                   // vpshufd       $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,202,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
+  .byte  197,180,84,192                      // vandps        %ymm0,%ymm9,%ymm0
+  .byte  184,0,0,0,63                        // mov           $0x3f000000,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  197,249,112,210,0                   // vpshufd       $0x0,%xmm2,%xmm2
+  .byte  196,227,109,24,234,1                // vinsertf128   $0x1,%xmm2,%ymm2,%ymm5
+  .byte  197,252,86,197                      // vorps         %ymm5,%ymm0,%ymm0
+  .byte  184,119,115,248,66                  // mov           $0x42f87377,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,210,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm10
+  .byte  196,193,116,92,202                  // vsubps        %ymm10,%ymm1,%ymm1
+  .byte  184,117,191,191,63                  // mov           $0x3fbfbf75,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,218,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm11
+  .byte  196,193,124,89,211                  // vmulps        %ymm11,%ymm0,%ymm2
+  .byte  197,244,92,202                      // vsubps        %ymm2,%ymm1,%ymm1
+  .byte  184,163,233,220,63                  // mov           $0x3fdce9a3,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,226,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm12
+  .byte  184,249,68,180,62                   // mov           $0x3eb444f9,%eax
+  .byte  197,249,110,208                     // vmovd         %eax,%xmm2
+  .byte  196,227,121,4,210,0                 // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  196,99,109,24,234,1                 // vinsertf128   $0x1,%xmm2,%ymm2,%ymm13
+  .byte  196,193,124,88,197                  // vaddps        %ymm13,%ymm0,%ymm0
+  .byte  197,156,94,192                      // vdivps        %ymm0,%ymm12,%ymm0
+  .byte  197,244,92,192                      // vsubps        %ymm0,%ymm1,%ymm0
+  .byte  196,193,121,110,200                 // vmovd         %r8d,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,99,117,24,241,1                 // vinsertf128   $0x1,%xmm1,%ymm1,%ymm14
+  .byte  197,140,89,192                      // vmulps        %ymm0,%ymm14,%ymm0
+  .byte  196,227,125,8,200,1                 // vroundps      $0x1,%ymm0,%ymm1
+  .byte  197,252,92,225                      // vsubps        %ymm1,%ymm0,%ymm4
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d
+  .byte  184,81,140,242,66                   // mov           $0x42f28c51,%eax
+  .byte  197,249,110,200                     // vmovd         %eax,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,99,117,24,249,1                 // vinsertf128   $0x1,%xmm1,%ymm1,%ymm15
+  .byte  197,132,88,192                      // vaddps        %ymm0,%ymm15,%ymm0
+  .byte  184,141,188,190,63                  // mov           $0x3fbebc8d,%eax
+  .byte  197,249,110,200                     // vmovd         %eax,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,227,117,24,217,1                // vinsertf128   $0x1,%xmm1,%ymm1,%ymm3
+  .byte  197,228,89,204                      // vmulps        %ymm4,%ymm3,%ymm1
+  .byte  197,252,92,209                      // vsubps        %ymm1,%ymm0,%ymm2
+  .byte  184,254,210,221,65                  // mov           $0x41ddd2fe,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,227,121,4,192,0                 // vpermilps     $0x0,%xmm0,%xmm0
+  .byte  196,227,125,24,200,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm1
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax
+  .byte  197,249,110,192                     // vmovd         %eax,%xmm0
+  .byte  196,227,121,4,192,0                 // vpermilps     $0x0,%xmm0,%xmm0
+  .byte  196,227,125,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  .byte  197,252,92,228                      // vsubps        %ymm4,%ymm0,%ymm4
+  .byte  197,244,94,228                      // vdivps        %ymm4,%ymm1,%ymm4
+  .byte  197,236,88,228                      // vaddps        %ymm4,%ymm2,%ymm4
+  .byte  197,252,91,214                      // vcvtdq2ps     %ymm6,%ymm2
+  .byte  196,193,108,89,208                  // vmulps        %ymm8,%ymm2,%ymm2
+  .byte  197,180,84,246                      // vandps        %ymm6,%ymm9,%ymm6
+  .byte  197,204,86,245                      // vorps         %ymm5,%ymm6,%ymm6
+  .byte  196,193,108,92,210                  // vsubps        %ymm10,%ymm2,%ymm2
+  .byte  196,193,76,89,251                   // vmulps        %ymm11,%ymm6,%ymm7
+  .byte  197,236,92,215                      // vsubps        %ymm7,%ymm2,%ymm2
+  .byte  196,193,76,88,245                   // vaddps        %ymm13,%ymm6,%ymm6
+  .byte  197,156,94,246                      // vdivps        %ymm6,%ymm12,%ymm6
+  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
+  .byte  197,140,89,210                      // vmulps        %ymm2,%ymm14,%ymm2
+  .byte  196,227,125,8,242,1                 // vroundps      $0x1,%ymm2,%ymm6
+  .byte  197,236,92,246                      // vsubps        %ymm6,%ymm2,%ymm6
+  .byte  197,132,88,210                      // vaddps        %ymm2,%ymm15,%ymm2
+  .byte  197,228,89,254                      // vmulps        %ymm6,%ymm3,%ymm7
+  .byte  197,236,92,215                      // vsubps        %ymm7,%ymm2,%ymm2
+  .byte  197,252,92,246                      // vsubps        %ymm6,%ymm0,%ymm6
+  .byte  197,244,94,246                      // vdivps        %ymm6,%ymm1,%ymm6
+  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
+  .byte  197,252,16,124,36,128               // vmovups       -0x80(%rsp),%ymm7
+  .byte  197,252,91,247                      // vcvtdq2ps     %ymm7,%ymm6
+  .byte  196,193,76,89,240                   // vmulps        %ymm8,%ymm6,%ymm6
+  .byte  197,180,84,255                      // vandps        %ymm7,%ymm9,%ymm7
+  .byte  197,196,86,237                      // vorps         %ymm5,%ymm7,%ymm5
+  .byte  196,193,76,92,242                   // vsubps        %ymm10,%ymm6,%ymm6
+  .byte  196,193,84,89,251                   // vmulps        %ymm11,%ymm5,%ymm7
+  .byte  197,204,92,247                      // vsubps        %ymm7,%ymm6,%ymm6
+  .byte  196,193,84,88,237                   // vaddps        %ymm13,%ymm5,%ymm5
+  .byte  197,156,94,237                      // vdivps        %ymm5,%ymm12,%ymm5
+  .byte  197,204,92,237                      // vsubps        %ymm5,%ymm6,%ymm5
+  .byte  197,140,89,237                      // vmulps        %ymm5,%ymm14,%ymm5
+  .byte  196,227,125,8,245,1                 // vroundps      $0x1,%ymm5,%ymm6
+  .byte  197,212,92,246                      // vsubps        %ymm6,%ymm5,%ymm6
+  .byte  197,132,88,237                      // vaddps        %ymm5,%ymm15,%ymm5
+  .byte  197,228,89,222                      // vmulps        %ymm6,%ymm3,%ymm3
+  .byte  197,212,92,219                      // vsubps        %ymm3,%ymm5,%ymm3
+  .byte  197,252,92,198                      // vsubps        %ymm6,%ymm0,%ymm0
+  .byte  197,244,94,192                      // vdivps        %ymm0,%ymm1,%ymm0
+  .byte  197,228,88,192                      // vaddps        %ymm0,%ymm3,%ymm0
+  .byte  196,193,121,110,200                 // vmovd         %r8d,%xmm1
+  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
+  .byte  196,227,117,24,201,1                // vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
+  .byte  197,244,89,220                      // vmulps        %ymm4,%ymm1,%ymm3
+  .byte  197,244,89,210                      // vmulps        %ymm2,%ymm1,%ymm2
+  .byte  197,244,89,224                      // vmulps        %ymm0,%ymm1,%ymm4
+  .byte  197,253,91,195                      // vcvtps2dq     %ymm3,%ymm0
+  .byte  197,253,91,202                      // vcvtps2dq     %ymm2,%ymm1
+  .byte  197,253,91,212                      // vcvtps2dq     %ymm4,%ymm2
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  197,252,16,92,36,160                // vmovups       -0x60(%rsp),%ymm3
+  .byte  197,252,16,100,36,192               // vmovups       -0x40(%rsp),%ymm4
+  .byte  197,252,16,108,36,224               // vmovups       -0x20(%rsp),%ymm5
+  .byte  197,252,16,52,36                    // vmovups       (%rsp),%ymm6
+  .byte  197,252,16,124,36,32                // vmovups       0x20(%rsp),%ymm7
+  .byte  72,131,196,88                       // add           $0x58,%rsp
   .byte  255,224                             // jmpq          *%rax
 
 HIDDEN _sk_rgb_to_hsl_avx
@@ -13588,7 +13949,7 @@ _sk_scale_u8_avx:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,80                              // jne           1456 <_sk_scale_u8_avx+0x60>
+  .byte  117,80                              // jne           17ee <_sk_scale_u8_avx+0x60>
   .byte  197,122,126,0                       // vmovq         (%rax),%xmm8
   .byte  196,66,121,49,200                   // vpmovzxbd     %xmm8,%xmm9
   .byte  196,67,121,4,192,229                // vpermilps     $0xe5,%xmm8,%xmm8
@@ -13616,9 +13977,9 @@ _sk_scale_u8_avx:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           145e <_sk_scale_u8_avx+0x68>
+  .byte  117,234                             // jne           17f6 <_sk_scale_u8_avx+0x68>
   .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  235,143                             // jmp           140a <_sk_scale_u8_avx+0x14>
+  .byte  235,143                             // jmp           17a2 <_sk_scale_u8_avx+0x14>
 
 HIDDEN _sk_lerp_1_float_avx
 .globl _sk_lerp_1_float_avx
@@ -13650,7 +14011,7 @@ _sk_lerp_u8_avx:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,116                             // jne           153e <_sk_lerp_u8_avx+0x84>
+  .byte  117,116                             // jne           18d6 <_sk_lerp_u8_avx+0x84>
   .byte  197,122,126,0                       // vmovq         (%rax),%xmm8
   .byte  196,66,121,49,200                   // vpmovzxbd     %xmm8,%xmm9
   .byte  196,67,121,4,192,229                // vpermilps     $0xe5,%xmm8,%xmm8
@@ -13686,9 +14047,9 @@ _sk_lerp_u8_avx:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           1546 <_sk_lerp_u8_avx+0x8c>
+  .byte  117,234                             // jne           18de <_sk_lerp_u8_avx+0x8c>
   .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  233,104,255,255,255                 // jmpq          14ce <_sk_lerp_u8_avx+0x14>
+  .byte  233,104,255,255,255                 // jmpq          1866 <_sk_lerp_u8_avx+0x14>
 
 HIDDEN _sk_lerp_565_avx
 .globl _sk_lerp_565_avx
@@ -13697,7 +14058,7 @@ _sk_lerp_565_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,250,0,0,0                    // jne           166e <_sk_lerp_565_avx+0x108>
+  .byte  15,133,250,0,0,0                    // jne           1a06 <_sk_lerp_565_avx+0x108>
   .byte  196,65,122,111,4,122                // vmovdqu       (%r10,%rdi,2),%xmm8
   .byte  197,225,239,219                     // vpxor         %xmm3,%xmm3,%xmm3
   .byte  197,185,105,219                     // vpunpckhwd    %xmm3,%xmm8,%xmm3
@@ -13756,9 +14117,9 @@ _sk_lerp_565_avx:
   .byte  196,65,57,239,192                   // vpxor         %xmm8,%xmm8,%xmm8
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,243,254,255,255              // ja            157a <_sk_lerp_565_avx+0x14>
+  .byte  15,135,243,254,255,255              // ja            1912 <_sk_lerp_565_avx+0x14>
   .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  76,141,13,74,0,0,0                  // lea           0x4a(%rip),%r9        # 16dc <_sk_lerp_565_avx+0x176>
+  .byte  76,141,13,74,0,0,0                  // lea           0x4a(%rip),%r9        # 1a74 <_sk_lerp_565_avx+0x176>
   .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
   .byte  76,1,200                            // add           %r9,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -13770,12 +14131,12 @@ _sk_lerp_565_avx:
   .byte  196,65,57,196,68,122,4,2            // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
   .byte  196,65,57,196,68,122,2,1            // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
   .byte  196,65,57,196,4,122,0               // vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
-  .byte  233,159,254,255,255                 // jmpq          157a <_sk_lerp_565_avx+0x14>
+  .byte  233,159,254,255,255                 // jmpq          1912 <_sk_lerp_565_avx+0x14>
   .byte  144                                 // nop
   .byte  243,255                             // repz          (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  235,255                             // jmp           16e1 <_sk_lerp_565_avx+0x17b>
+  .byte  235,255                             // jmp           1a79 <_sk_lerp_565_avx+0x17b>
   .byte  255                                 // (bad)
   .byte  255,227                             // jmpq          *%rbx
   .byte  255                                 // (bad)
@@ -13808,7 +14169,7 @@ _sk_load_tables_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,56,2,0,0                     // jne           1948 <_sk_load_tables_avx+0x250>
+  .byte  15,133,56,2,0,0                     // jne           1ce0 <_sk_load_tables_avx+0x250>
   .byte  196,65,124,16,4,184                 // vmovups       (%r8,%rdi,4),%ymm8
   .byte  187,255,0,0,0                       // mov           $0xff,%ebx
   .byte  197,249,110,195                     // vmovd         %ebx,%xmm0
@@ -13927,9 +14288,9 @@ _sk_load_tables_avx:
   .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
   .byte  254,203                             // dec           %bl
   .byte  128,251,6                           // cmp           $0x6,%bl
-  .byte  15,135,185,253,255,255              // ja            1716 <_sk_load_tables_avx+0x1e>
+  .byte  15,135,185,253,255,255              // ja            1aae <_sk_load_tables_avx+0x1e>
   .byte  15,182,219                          // movzbl        %bl,%ebx
-  .byte  76,141,13,137,0,0,0                 // lea           0x89(%rip),%r9        # 19f0 <_sk_load_tables_avx+0x2f8>
+  .byte  76,141,13,137,0,0,0                 // lea           0x89(%rip),%r9        # 1d88 <_sk_load_tables_avx+0x2f8>
   .byte  73,99,28,153                        // movslq        (%r9,%rbx,4),%rbx
   .byte  76,1,203                            // add           %r9,%rbx
   .byte  255,227                             // jmpq          *%rbx
@@ -13952,7 +14313,7 @@ _sk_load_tables_avx:
   .byte  196,99,61,12,192,15                 // vblendps      $0xf,%ymm0,%ymm8,%ymm8
   .byte  196,195,57,34,4,184,0               // vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
   .byte  196,99,61,12,192,15                 // vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  .byte  233,38,253,255,255                  // jmpq          1716 <_sk_load_tables_avx+0x1e>
+  .byte  233,38,253,255,255                  // jmpq          1aae <_sk_load_tables_avx+0x1e>
   .byte  238                                 // out           %al,(%dx)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -13980,7 +14341,7 @@ _sk_load_tables_u16_be_avx:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,165,2,0,0                    // jne           1cc7 <_sk_load_tables_u16_be_avx+0x2bb>
+  .byte  15,133,165,2,0,0                    // jne           205f <_sk_load_tables_u16_be_avx+0x2bb>
   .byte  196,1,121,16,4,72                   // vmovupd       (%r8,%r9,2),%xmm8
   .byte  196,129,121,16,84,72,16             // vmovupd       0x10(%r8,%r9,2),%xmm2
   .byte  196,129,121,16,92,72,32             // vmovupd       0x20(%r8,%r9,2),%xmm3
@@ -14124,29 +14485,29 @@ _sk_load_tables_u16_be_avx:
   .byte  196,1,123,16,4,72                   // vmovsd        (%r8,%r9,2),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,85                              // je            1d2d <_sk_load_tables_u16_be_avx+0x321>
+  .byte  116,85                              // je            20c5 <_sk_load_tables_u16_be_avx+0x321>
   .byte  196,1,57,22,68,72,8                 // vmovhpd       0x8(%r8,%r9,2),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,72                              // jb            1d2d <_sk_load_tables_u16_be_avx+0x321>
+  .byte  114,72                              // jb            20c5 <_sk_load_tables_u16_be_avx+0x321>
   .byte  196,129,123,16,84,72,16             // vmovsd        0x10(%r8,%r9,2),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,72                              // je            1d3a <_sk_load_tables_u16_be_avx+0x32e>
+  .byte  116,72                              // je            20d2 <_sk_load_tables_u16_be_avx+0x32e>
   .byte  196,129,105,22,84,72,24             // vmovhpd       0x18(%r8,%r9,2),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,59                              // jb            1d3a <_sk_load_tables_u16_be_avx+0x32e>
+  .byte  114,59                              // jb            20d2 <_sk_load_tables_u16_be_avx+0x32e>
   .byte  196,129,123,16,92,72,32             // vmovsd        0x20(%r8,%r9,2),%xmm3
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,45,253,255,255               // je            1a3d <_sk_load_tables_u16_be_avx+0x31>
+  .byte  15,132,45,253,255,255               // je            1dd5 <_sk_load_tables_u16_be_avx+0x31>
   .byte  196,129,97,22,92,72,40              // vmovhpd       0x28(%r8,%r9,2),%xmm3,%xmm3
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,28,253,255,255               // jb            1a3d <_sk_load_tables_u16_be_avx+0x31>
+  .byte  15,130,28,253,255,255               // jb            1dd5 <_sk_load_tables_u16_be_avx+0x31>
   .byte  196,1,122,126,76,72,48              // vmovq         0x30(%r8,%r9,2),%xmm9
-  .byte  233,16,253,255,255                  // jmpq          1a3d <_sk_load_tables_u16_be_avx+0x31>
+  .byte  233,16,253,255,255                  // jmpq          1dd5 <_sk_load_tables_u16_be_avx+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,3,253,255,255                   // jmpq          1a3d <_sk_load_tables_u16_be_avx+0x31>
+  .byte  233,3,253,255,255                   // jmpq          1dd5 <_sk_load_tables_u16_be_avx+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,250,252,255,255                 // jmpq          1a3d <_sk_load_tables_u16_be_avx+0x31>
+  .byte  233,250,252,255,255                 // jmpq          1dd5 <_sk_load_tables_u16_be_avx+0x31>
 
 HIDDEN _sk_load_tables_rgb_u16_be_avx
 .globl _sk_load_tables_rgb_u16_be_avx
@@ -14156,7 +14517,7 @@ _sk_load_tables_rgb_u16_be_avx:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  76,141,12,127                       // lea           (%rdi,%rdi,2),%r9
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,146,2,0,0                    // jne           1fe7 <_sk_load_tables_rgb_u16_be_avx+0x2a4>
+  .byte  15,133,146,2,0,0                    // jne           237f <_sk_load_tables_rgb_u16_be_avx+0x2a4>
   .byte  196,129,122,111,4,72                // vmovdqu       (%r8,%r9,2),%xmm0
   .byte  196,129,122,111,84,72,12            // vmovdqu       0xc(%r8,%r9,2),%xmm2
   .byte  196,129,122,111,76,72,24            // vmovdqu       0x18(%r8,%r9,2),%xmm1
@@ -14296,36 +14657,36 @@ _sk_load_tables_rgb_u16_be_avx:
   .byte  196,129,121,110,4,72                // vmovd         (%r8,%r9,2),%xmm0
   .byte  196,129,121,196,68,72,4,2           // vpinsrw       $0x2,0x4(%r8,%r9,2),%xmm0,%xmm0
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  117,5                               // jne           2000 <_sk_load_tables_rgb_u16_be_avx+0x2bd>
-  .byte  233,137,253,255,255                 // jmpq          1d89 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  .byte  117,5                               // jne           2398 <_sk_load_tables_rgb_u16_be_avx+0x2bd>
+  .byte  233,137,253,255,255                 // jmpq          2121 <_sk_load_tables_rgb_u16_be_avx+0x46>
   .byte  196,129,121,110,76,72,6             // vmovd         0x6(%r8,%r9,2),%xmm1
   .byte  196,1,113,196,68,72,10,2            // vpinsrw       $0x2,0xa(%r8,%r9,2),%xmm1,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,26                              // jb            202f <_sk_load_tables_rgb_u16_be_avx+0x2ec>
+  .byte  114,26                              // jb            23c7 <_sk_load_tables_rgb_u16_be_avx+0x2ec>
   .byte  196,129,121,110,76,72,12            // vmovd         0xc(%r8,%r9,2),%xmm1
   .byte  196,129,113,196,84,72,16,2          // vpinsrw       $0x2,0x10(%r8,%r9,2),%xmm1,%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  117,10                              // jne           2034 <_sk_load_tables_rgb_u16_be_avx+0x2f1>
-  .byte  233,90,253,255,255                  // jmpq          1d89 <_sk_load_tables_rgb_u16_be_avx+0x46>
-  .byte  233,85,253,255,255                  // jmpq          1d89 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  .byte  117,10                              // jne           23cc <_sk_load_tables_rgb_u16_be_avx+0x2f1>
+  .byte  233,90,253,255,255                  // jmpq          2121 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  .byte  233,85,253,255,255                  // jmpq          2121 <_sk_load_tables_rgb_u16_be_avx+0x46>
   .byte  196,129,121,110,76,72,18            // vmovd         0x12(%r8,%r9,2),%xmm1
   .byte  196,1,113,196,76,72,22,2            // vpinsrw       $0x2,0x16(%r8,%r9,2),%xmm1,%xmm9
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,26                              // jb            2063 <_sk_load_tables_rgb_u16_be_avx+0x320>
+  .byte  114,26                              // jb            23fb <_sk_load_tables_rgb_u16_be_avx+0x320>
   .byte  196,129,121,110,76,72,24            // vmovd         0x18(%r8,%r9,2),%xmm1
   .byte  196,129,113,196,76,72,28,2          // vpinsrw       $0x2,0x1c(%r8,%r9,2),%xmm1,%xmm1
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  117,10                              // jne           2068 <_sk_load_tables_rgb_u16_be_avx+0x325>
-  .byte  233,38,253,255,255                  // jmpq          1d89 <_sk_load_tables_rgb_u16_be_avx+0x46>
-  .byte  233,33,253,255,255                  // jmpq          1d89 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  .byte  117,10                              // jne           2400 <_sk_load_tables_rgb_u16_be_avx+0x325>
+  .byte  233,38,253,255,255                  // jmpq          2121 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  .byte  233,33,253,255,255                  // jmpq          2121 <_sk_load_tables_rgb_u16_be_avx+0x46>
   .byte  196,129,121,110,92,72,30            // vmovd         0x1e(%r8,%r9,2),%xmm3
   .byte  196,1,97,196,92,72,34,2             // vpinsrw       $0x2,0x22(%r8,%r9,2),%xmm3,%xmm11
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,20                              // jb            2091 <_sk_load_tables_rgb_u16_be_avx+0x34e>
+  .byte  114,20                              // jb            2429 <_sk_load_tables_rgb_u16_be_avx+0x34e>
   .byte  196,129,121,110,92,72,36            // vmovd         0x24(%r8,%r9,2),%xmm3
   .byte  196,129,97,196,92,72,40,2           // vpinsrw       $0x2,0x28(%r8,%r9,2),%xmm3,%xmm3
-  .byte  233,248,252,255,255                 // jmpq          1d89 <_sk_load_tables_rgb_u16_be_avx+0x46>
-  .byte  233,243,252,255,255                 // jmpq          1d89 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  .byte  233,248,252,255,255                 // jmpq          2121 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  .byte  233,243,252,255,255                 // jmpq          2121 <_sk_load_tables_rgb_u16_be_avx+0x46>
 
 HIDDEN _sk_byte_tables_avx
 .globl _sk_byte_tables_avx
@@ -15313,7 +15674,7 @@ _sk_load_a8_avx:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,74                              // jne           3228 <_sk_load_a8_avx+0x5a>
+  .byte  117,74                              // jne           35c0 <_sk_load_a8_avx+0x5a>
   .byte  197,250,126,0                       // vmovq         (%rax),%xmm0
   .byte  196,226,121,49,200                  // vpmovzxbd     %xmm0,%xmm1
   .byte  196,227,121,4,192,229               // vpermilps     $0xe5,%xmm0,%xmm0
@@ -15340,9 +15701,9 @@ _sk_load_a8_avx:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           3230 <_sk_load_a8_avx+0x62>
+  .byte  117,234                             // jne           35c8 <_sk_load_a8_avx+0x62>
   .byte  196,193,249,110,193                 // vmovq         %r9,%xmm0
-  .byte  235,149                             // jmp           31e2 <_sk_load_a8_avx+0x14>
+  .byte  235,149                             // jmp           357a <_sk_load_a8_avx+0x14>
 
 HIDDEN _sk_gather_a8_avx
 .globl _sk_gather_a8_avx
@@ -15423,7 +15784,7 @@ _sk_store_a8_avx:
   .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
   .byte  196,65,57,103,192                   // vpackuswb     %xmm8,%xmm8,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           3389 <_sk_store_a8_avx+0x42>
+  .byte  117,10                              // jne           3721 <_sk_store_a8_avx+0x42>
   .byte  196,65,123,17,4,57                  // vmovsd        %xmm8,(%r9,%rdi,1)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -15431,10 +15792,10 @@ _sk_store_a8_avx:
   .byte  65,128,224,7                        // and           $0x7,%r8b
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            3385 <_sk_store_a8_avx+0x3e>
+  .byte  119,236                             // ja            371d <_sk_store_a8_avx+0x3e>
   .byte  196,66,121,48,192                   // vpmovzxbw     %xmm8,%xmm8
   .byte  65,15,182,192                       // movzbl        %r8b,%eax
-  .byte  76,141,5,67,0,0,0                   // lea           0x43(%rip),%r8        # 33ec <_sk_store_a8_avx+0xa5>
+  .byte  76,141,5,67,0,0,0                   // lea           0x43(%rip),%r8        # 3784 <_sk_store_a8_avx+0xa5>
   .byte  73,99,4,128                         // movslq        (%r8,%rax,4),%rax
   .byte  76,1,192                            // add           %r8,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -15445,7 +15806,7 @@ _sk_store_a8_avx:
   .byte  196,67,121,20,68,57,2,4             // vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
   .byte  196,67,121,20,68,57,1,2             // vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
   .byte  196,67,121,20,4,57,0                // vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  .byte  235,154                             // jmp           3385 <_sk_store_a8_avx+0x3e>
+  .byte  235,154                             // jmp           371d <_sk_store_a8_avx+0x3e>
   .byte  144                                 // nop
   .byte  246,255                             // idiv          %bh
   .byte  255                                 // (bad)
@@ -15479,7 +15840,7 @@ _sk_load_g8_avx:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  72,1,248                            // add           %rdi,%rax
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,91                              // jne           3473 <_sk_load_g8_avx+0x6b>
+  .byte  117,91                              // jne           380b <_sk_load_g8_avx+0x6b>
   .byte  197,250,126,0                       // vmovq         (%rax),%xmm0
   .byte  196,226,121,49,200                  // vpmovzxbd     %xmm0,%xmm1
   .byte  196,227,121,4,192,229               // vpermilps     $0xe5,%xmm0,%xmm0
@@ -15509,9 +15870,9 @@ _sk_load_g8_avx:
   .byte  77,9,217                            // or            %r11,%r9
   .byte  72,131,193,8                        // add           $0x8,%rcx
   .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           347b <_sk_load_g8_avx+0x73>
+  .byte  117,234                             // jne           3813 <_sk_load_g8_avx+0x73>
   .byte  196,193,249,110,193                 // vmovq         %r9,%xmm0
-  .byte  235,132                             // jmp           341c <_sk_load_g8_avx+0x14>
+  .byte  235,132                             // jmp           37b4 <_sk_load_g8_avx+0x14>
 
 HIDDEN _sk_gather_g8_avx
 .globl _sk_gather_g8_avx
@@ -15586,9 +15947,9 @@ _sk_gather_i8_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  73,137,192                          // mov           %rax,%r8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  116,5                               // je            35b2 <_sk_gather_i8_avx+0xf>
+  .byte  116,5                               // je            394a <_sk_gather_i8_avx+0xf>
   .byte  76,137,192                          // mov           %r8,%rax
-  .byte  235,2                               // jmp           35b4 <_sk_gather_i8_avx+0x11>
+  .byte  235,2                               // jmp           394c <_sk_gather_i8_avx+0x11>
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  65,87                               // push          %r15
   .byte  65,86                               // push          %r14
@@ -15693,7 +16054,7 @@ _sk_load_565_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,209,0,0,0                    // jne           384e <_sk_load_565_avx+0xdf>
+  .byte  15,133,209,0,0,0                    // jne           3be6 <_sk_load_565_avx+0xdf>
   .byte  196,193,122,111,4,122               // vmovdqu       (%r10,%rdi,2),%xmm0
   .byte  197,241,239,201                     // vpxor         %xmm1,%xmm1,%xmm1
   .byte  197,249,105,201                     // vpunpckhwd    %xmm1,%xmm0,%xmm1
@@ -15743,9 +16104,9 @@ _sk_load_565_avx:
   .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,29,255,255,255               // ja            3783 <_sk_load_565_avx+0x14>
+  .byte  15,135,29,255,255,255               // ja            3b1b <_sk_load_565_avx+0x14>
   .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  76,141,13,75,0,0,0                  // lea           0x4b(%rip),%r9        # 38bc <_sk_load_565_avx+0x14d>
+  .byte  76,141,13,75,0,0,0                  // lea           0x4b(%rip),%r9        # 3c54 <_sk_load_565_avx+0x14d>
   .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
   .byte  76,1,200                            // add           %r9,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -15757,7 +16118,7 @@ _sk_load_565_avx:
   .byte  196,193,121,196,68,122,4,2          // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,68,122,2,1          // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,4,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  233,201,254,255,255                 // jmpq          3783 <_sk_load_565_avx+0x14>
+  .byte  233,201,254,255,255                 // jmpq          3b1b <_sk_load_565_avx+0x14>
   .byte  102,144                             // xchg          %ax,%ax
   .byte  242,255                             // repnz         (bad)
   .byte  255                                 // (bad)
@@ -15914,7 +16275,7 @@ _sk_store_565_avx:
   .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
   .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           3b07 <_sk_store_565_avx+0x9e>
+  .byte  117,10                              // jne           3e9f <_sk_store_565_avx+0x9e>
   .byte  196,65,122,127,4,121                // vmovdqu       %xmm8,(%r9,%rdi,2)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -15922,9 +16283,9 @@ _sk_store_565_avx:
   .byte  65,128,224,7                        // and           $0x7,%r8b
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            3b03 <_sk_store_565_avx+0x9a>
+  .byte  119,236                             // ja            3e9b <_sk_store_565_avx+0x9a>
   .byte  65,15,182,192                       // movzbl        %r8b,%eax
-  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 3b64 <_sk_store_565_avx+0xfb>
+  .byte  76,141,5,66,0,0,0                   // lea           0x42(%rip),%r8        # 3efc <_sk_store_565_avx+0xfb>
   .byte  73,99,4,128                         // movslq        (%r8,%rax,4),%rax
   .byte  76,1,192                            // add           %r8,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -15935,7 +16296,7 @@ _sk_store_565_avx:
   .byte  196,67,121,21,68,121,4,2            // vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   .byte  196,67,121,21,68,121,2,1            // vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   .byte  196,67,121,21,4,121,0               // vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  .byte  235,159                             // jmp           3b03 <_sk_store_565_avx+0x9a>
+  .byte  235,159                             // jmp           3e9b <_sk_store_565_avx+0x9a>
   .byte  247,255                             // idiv          %edi
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -15966,7 +16327,7 @@ _sk_load_4444_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,245,0,0,0                    // jne           3c83 <_sk_load_4444_avx+0x103>
+  .byte  15,133,245,0,0,0                    // jne           401b <_sk_load_4444_avx+0x103>
   .byte  196,193,122,111,4,122               // vmovdqu       (%r10,%rdi,2),%xmm0
   .byte  197,241,239,201                     // vpxor         %xmm1,%xmm1,%xmm1
   .byte  197,249,105,201                     // vpunpckhwd    %xmm1,%xmm0,%xmm1
@@ -16023,9 +16384,9 @@ _sk_load_4444_avx:
   .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,249,254,255,255              // ja            3b94 <_sk_load_4444_avx+0x14>
+  .byte  15,135,249,254,255,255              // ja            3f2c <_sk_load_4444_avx+0x14>
   .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  76,141,13,74,0,0,0                  // lea           0x4a(%rip),%r9        # 3cf0 <_sk_load_4444_avx+0x170>
+  .byte  76,141,13,74,0,0,0                  // lea           0x4a(%rip),%r9        # 4088 <_sk_load_4444_avx+0x170>
   .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
   .byte  76,1,200                            // add           %r9,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -16037,12 +16398,12 @@ _sk_load_4444_avx:
   .byte  196,193,121,196,68,122,4,2          // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,68,122,2,1          // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   .byte  196,193,121,196,4,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  233,165,254,255,255                 // jmpq          3b94 <_sk_load_4444_avx+0x14>
+  .byte  233,165,254,255,255                 // jmpq          3f2c <_sk_load_4444_avx+0x14>
   .byte  144                                 // nop
   .byte  243,255                             // repz          (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  235,255                             // jmp           3cf5 <_sk_load_4444_avx+0x175>
+  .byte  235,255                             // jmp           408d <_sk_load_4444_avx+0x175>
   .byte  255                                 // (bad)
   .byte  255,227                             // jmpq          *%rbx
   .byte  255                                 // (bad)
@@ -16203,7 +16564,7 @@ _sk_store_4444_avx:
   .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
   .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           3f70 <_sk_store_4444_avx+0xaf>
+  .byte  117,10                              // jne           4308 <_sk_store_4444_avx+0xaf>
   .byte  196,65,122,127,4,121                // vmovdqu       %xmm8,(%r9,%rdi,2)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -16211,9 +16572,9 @@ _sk_store_4444_avx:
   .byte  65,128,224,7                        // and           $0x7,%r8b
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            3f6c <_sk_store_4444_avx+0xab>
+  .byte  119,236                             // ja            4304 <_sk_store_4444_avx+0xab>
   .byte  65,15,182,192                       // movzbl        %r8b,%eax
-  .byte  76,141,5,69,0,0,0                   // lea           0x45(%rip),%r8        # 3fd0 <_sk_store_4444_avx+0x10f>
+  .byte  76,141,5,69,0,0,0                   // lea           0x45(%rip),%r8        # 4368 <_sk_store_4444_avx+0x10f>
   .byte  73,99,4,128                         // movslq        (%r8,%rax,4),%rax
   .byte  76,1,192                            // add           %r8,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -16224,7 +16585,7 @@ _sk_store_4444_avx:
   .byte  196,67,121,21,68,121,4,2            // vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   .byte  196,67,121,21,68,121,2,1            // vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   .byte  196,67,121,21,4,121,0               // vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  .byte  235,159                             // jmp           3f6c <_sk_store_4444_avx+0xab>
+  .byte  235,159                             // jmp           4304 <_sk_store_4444_avx+0xab>
   .byte  15,31,0                             // nopl          (%rax)
   .byte  244                                 // hlt
   .byte  255                                 // (bad)
@@ -16257,7 +16618,7 @@ _sk_load_8888_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,157,0,0,0                    // jne           4097 <_sk_load_8888_avx+0xab>
+  .byte  15,133,157,0,0,0                    // jne           442f <_sk_load_8888_avx+0xab>
   .byte  196,65,124,16,12,186                // vmovups       (%r10,%rdi,4),%ymm9
   .byte  184,255,0,0,0                       // mov           $0xff,%eax
   .byte  197,249,110,192                     // vmovd         %eax,%xmm0
@@ -16295,9 +16656,9 @@ _sk_load_8888_avx:
   .byte  196,65,52,87,201                    // vxorps        %ymm9,%ymm9,%ymm9
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,80,255,255,255               // ja            4000 <_sk_load_8888_avx+0x14>
+  .byte  15,135,80,255,255,255               // ja            4398 <_sk_load_8888_avx+0x14>
   .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  76,141,13,137,0,0,0                 // lea           0x89(%rip),%r9        # 4144 <_sk_load_8888_avx+0x158>
+  .byte  76,141,13,137,0,0,0                 // lea           0x89(%rip),%r9        # 44dc <_sk_load_8888_avx+0x158>
   .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
   .byte  76,1,200                            // add           %r9,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -16320,7 +16681,7 @@ _sk_load_8888_avx:
   .byte  196,99,53,12,200,15                 // vblendps      $0xf,%ymm0,%ymm9,%ymm9
   .byte  196,195,49,34,4,186,0               // vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
   .byte  196,99,53,12,200,15                 // vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  .byte  233,188,254,255,255                 // jmpq          4000 <_sk_load_8888_avx+0x14>
+  .byte  233,188,254,255,255                 // jmpq          4398 <_sk_load_8888_avx+0x14>
   .byte  238                                 // out           %al,(%dx)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -16450,7 +16811,7 @@ _sk_store_8888_avx:
   .byte  196,65,45,86,192                    // vorpd         %ymm8,%ymm10,%ymm8
   .byte  196,65,53,86,192                    // vorpd         %ymm8,%ymm9,%ymm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           4345 <_sk_store_8888_avx+0xa4>
+  .byte  117,10                              // jne           46dd <_sk_store_8888_avx+0xa4>
   .byte  196,65,124,17,4,185                 // vmovups       %ymm8,(%r9,%rdi,4)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
@@ -16458,9 +16819,9 @@ _sk_store_8888_avx:
   .byte  65,128,224,7                        // and           $0x7,%r8b
   .byte  65,254,200                          // dec           %r8b
   .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            4341 <_sk_store_8888_avx+0xa0>
+  .byte  119,236                             // ja            46d9 <_sk_store_8888_avx+0xa0>
   .byte  65,15,182,192                       // movzbl        %r8b,%eax
-  .byte  76,141,5,84,0,0,0                   // lea           0x54(%rip),%r8        # 43b4 <_sk_store_8888_avx+0x113>
+  .byte  76,141,5,84,0,0,0                   // lea           0x54(%rip),%r8        # 474c <_sk_store_8888_avx+0x113>
   .byte  73,99,4,128                         // movslq        (%r8,%rax,4),%rax
   .byte  76,1,192                            // add           %r8,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -16474,7 +16835,7 @@ _sk_store_8888_avx:
   .byte  196,67,121,22,68,185,8,2            // vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
   .byte  196,67,121,22,68,185,4,1            // vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
   .byte  196,65,121,126,4,185                // vmovd         %xmm8,(%r9,%rdi,4)
-  .byte  235,143                             // jmp           4341 <_sk_store_8888_avx+0xa0>
+  .byte  235,143                             // jmp           46d9 <_sk_store_8888_avx+0xa0>
   .byte  102,144                             // xchg          %ax,%ax
   .byte  246,255                             // idiv          %bh
   .byte  255                                 // (bad)
@@ -16509,7 +16870,7 @@ _sk_load_f16_avx:
   .byte  197,252,17,124,36,200               // vmovups       %ymm7,-0x38(%rsp)
   .byte  197,252,17,116,36,168               // vmovups       %ymm6,-0x58(%rsp)
   .byte  197,252,17,108,36,136               // vmovups       %ymm5,-0x78(%rsp)
-  .byte  15,133,46,2,0,0                     // jne           461e <_sk_load_f16_avx+0x24e>
+  .byte  15,133,46,2,0,0                     // jne           49b6 <_sk_load_f16_avx+0x24e>
   .byte  197,121,16,4,248                    // vmovupd       (%rax,%rdi,8),%xmm8
   .byte  197,249,16,84,248,16                // vmovupd       0x10(%rax,%rdi,8),%xmm2
   .byte  197,249,16,76,248,32                // vmovupd       0x20(%rax,%rdi,8),%xmm1
@@ -16626,29 +16987,29 @@ _sk_load_f16_avx:
   .byte  197,123,16,4,248                    // vmovsd        (%rax,%rdi,8),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,79                              // je            467d <_sk_load_f16_avx+0x2ad>
+  .byte  116,79                              // je            4a15 <_sk_load_f16_avx+0x2ad>
   .byte  197,57,22,68,248,8                  // vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,67                              // jb            467d <_sk_load_f16_avx+0x2ad>
+  .byte  114,67                              // jb            4a15 <_sk_load_f16_avx+0x2ad>
   .byte  197,251,16,84,248,16                // vmovsd        0x10(%rax,%rdi,8),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,68                              // je            468a <_sk_load_f16_avx+0x2ba>
+  .byte  116,68                              // je            4a22 <_sk_load_f16_avx+0x2ba>
   .byte  197,233,22,84,248,24                // vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,56                              // jb            468a <_sk_load_f16_avx+0x2ba>
+  .byte  114,56                              // jb            4a22 <_sk_load_f16_avx+0x2ba>
   .byte  197,251,16,76,248,32                // vmovsd        0x20(%rax,%rdi,8),%xmm1
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,165,253,255,255              // je            4407 <_sk_load_f16_avx+0x37>
+  .byte  15,132,165,253,255,255              // je            479f <_sk_load_f16_avx+0x37>
   .byte  197,241,22,76,248,40                // vmovhpd       0x28(%rax,%rdi,8),%xmm1,%xmm1
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,149,253,255,255              // jb            4407 <_sk_load_f16_avx+0x37>
+  .byte  15,130,149,253,255,255              // jb            479f <_sk_load_f16_avx+0x37>
   .byte  197,122,126,76,248,48               // vmovq         0x30(%rax,%rdi,8),%xmm9
-  .byte  233,138,253,255,255                 // jmpq          4407 <_sk_load_f16_avx+0x37>
+  .byte  233,138,253,255,255                 // jmpq          479f <_sk_load_f16_avx+0x37>
   .byte  197,241,87,201                      // vxorpd        %xmm1,%xmm1,%xmm1
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,125,253,255,255                 // jmpq          4407 <_sk_load_f16_avx+0x37>
+  .byte  233,125,253,255,255                 // jmpq          479f <_sk_load_f16_avx+0x37>
   .byte  197,241,87,201                      // vxorpd        %xmm1,%xmm1,%xmm1
-  .byte  233,116,253,255,255                 // jmpq          4407 <_sk_load_f16_avx+0x37>
+  .byte  233,116,253,255,255                 // jmpq          479f <_sk_load_f16_avx+0x37>
 
 HIDDEN _sk_gather_f16_avx
 .globl _sk_gather_f16_avx
@@ -16925,7 +17286,7 @@ _sk_store_f16_avx:
   .byte  197,113,98,202                      // vpunpckldq    %xmm2,%xmm1,%xmm9
   .byte  197,113,106,194                     // vpunpckhdq    %xmm2,%xmm1,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,70                              // jne           4bde <_sk_store_f16_avx+0x25f>
+  .byte  117,70                              // jne           4f76 <_sk_store_f16_avx+0x25f>
   .byte  196,65,120,17,28,248                // vmovups       %xmm11,(%r8,%rdi,8)
   .byte  196,65,120,17,84,248,16             // vmovups       %xmm10,0x10(%r8,%rdi,8)
   .byte  196,65,120,17,76,248,32             // vmovups       %xmm9,0x20(%r8,%rdi,8)
@@ -16941,22 +17302,22 @@ _sk_store_f16_avx:
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,214,28,248               // vmovq         %xmm11,(%r8,%rdi,8)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,201                             // je            4bb3 <_sk_store_f16_avx+0x234>
+  .byte  116,201                             // je            4f4b <_sk_store_f16_avx+0x234>
   .byte  196,65,121,23,92,248,8              // vmovhpd       %xmm11,0x8(%r8,%rdi,8)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,188                             // jb            4bb3 <_sk_store_f16_avx+0x234>
+  .byte  114,188                             // jb            4f4b <_sk_store_f16_avx+0x234>
   .byte  196,65,121,214,84,248,16            // vmovq         %xmm10,0x10(%r8,%rdi,8)
-  .byte  116,179                             // je            4bb3 <_sk_store_f16_avx+0x234>
+  .byte  116,179                             // je            4f4b <_sk_store_f16_avx+0x234>
   .byte  196,65,121,23,84,248,24             // vmovhpd       %xmm10,0x18(%r8,%rdi,8)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,166                             // jb            4bb3 <_sk_store_f16_avx+0x234>
+  .byte  114,166                             // jb            4f4b <_sk_store_f16_avx+0x234>
   .byte  196,65,121,214,76,248,32            // vmovq         %xmm9,0x20(%r8,%rdi,8)
-  .byte  116,157                             // je            4bb3 <_sk_store_f16_avx+0x234>
+  .byte  116,157                             // je            4f4b <_sk_store_f16_avx+0x234>
   .byte  196,65,121,23,76,248,40             // vmovhpd       %xmm9,0x28(%r8,%rdi,8)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,144                             // jb            4bb3 <_sk_store_f16_avx+0x234>
+  .byte  114,144                             // jb            4f4b <_sk_store_f16_avx+0x234>
   .byte  196,65,121,214,68,248,48            // vmovq         %xmm8,0x30(%r8,%rdi,8)
-  .byte  235,135                             // jmp           4bb3 <_sk_store_f16_avx+0x234>
+  .byte  235,135                             // jmp           4f4b <_sk_store_f16_avx+0x234>
 
 HIDDEN _sk_load_u16_be_avx
 .globl _sk_load_u16_be_avx
@@ -16966,7 +17327,7 @@ _sk_load_u16_be_avx:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  72,141,4,189,0,0,0,0                // lea           0x0(,%rdi,4),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,5,1,0,0                      // jne           4d47 <_sk_load_u16_be_avx+0x11b>
+  .byte  15,133,5,1,0,0                      // jne           50df <_sk_load_u16_be_avx+0x11b>
   .byte  196,65,121,16,4,64                  // vmovupd       (%r8,%rax,2),%xmm8
   .byte  196,193,121,16,84,64,16             // vmovupd       0x10(%r8,%rax,2),%xmm2
   .byte  196,193,121,16,92,64,32             // vmovupd       0x20(%r8,%rax,2),%xmm3
@@ -17025,29 +17386,29 @@ _sk_load_u16_be_avx:
   .byte  196,65,123,16,4,64                  // vmovsd        (%r8,%rax,2),%xmm8
   .byte  196,65,49,239,201                   // vpxor         %xmm9,%xmm9,%xmm9
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,85                              // je            4dad <_sk_load_u16_be_avx+0x181>
+  .byte  116,85                              // je            5145 <_sk_load_u16_be_avx+0x181>
   .byte  196,65,57,22,68,64,8                // vmovhpd       0x8(%r8,%rax,2),%xmm8,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,72                              // jb            4dad <_sk_load_u16_be_avx+0x181>
+  .byte  114,72                              // jb            5145 <_sk_load_u16_be_avx+0x181>
   .byte  196,193,123,16,84,64,16             // vmovsd        0x10(%r8,%rax,2),%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  116,72                              // je            4dba <_sk_load_u16_be_avx+0x18e>
+  .byte  116,72                              // je            5152 <_sk_load_u16_be_avx+0x18e>
   .byte  196,193,105,22,84,64,24             // vmovhpd       0x18(%r8,%rax,2),%xmm2,%xmm2
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,59                              // jb            4dba <_sk_load_u16_be_avx+0x18e>
+  .byte  114,59                              // jb            5152 <_sk_load_u16_be_avx+0x18e>
   .byte  196,193,123,16,92,64,32             // vmovsd        0x20(%r8,%rax,2),%xmm3
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  15,132,205,254,255,255              // je            4c5d <_sk_load_u16_be_avx+0x31>
+  .byte  15,132,205,254,255,255              // je            4ff5 <_sk_load_u16_be_avx+0x31>
   .byte  196,193,97,22,92,64,40              // vmovhpd       0x28(%r8,%rax,2),%xmm3,%xmm3
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,188,254,255,255              // jb            4c5d <_sk_load_u16_be_avx+0x31>
+  .byte  15,130,188,254,255,255              // jb            4ff5 <_sk_load_u16_be_avx+0x31>
   .byte  196,65,122,126,76,64,48             // vmovq         0x30(%r8,%rax,2),%xmm9
-  .byte  233,176,254,255,255                 // jmpq          4c5d <_sk_load_u16_be_avx+0x31>
+  .byte  233,176,254,255,255                 // jmpq          4ff5 <_sk_load_u16_be_avx+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
   .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,163,254,255,255                 // jmpq          4c5d <_sk_load_u16_be_avx+0x31>
+  .byte  233,163,254,255,255                 // jmpq          4ff5 <_sk_load_u16_be_avx+0x31>
   .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,154,254,255,255                 // jmpq          4c5d <_sk_load_u16_be_avx+0x31>
+  .byte  233,154,254,255,255                 // jmpq          4ff5 <_sk_load_u16_be_avx+0x31>
 
 HIDDEN _sk_load_rgb_u16_be_avx
 .globl _sk_load_rgb_u16_be_avx
@@ -17057,7 +17418,7 @@ _sk_load_rgb_u16_be_avx:
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  72,141,4,127                        // lea           (%rdi,%rdi,2),%rax
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,8,1,0,0                      // jne           4edd <_sk_load_rgb_u16_be_avx+0x11a>
+  .byte  15,133,8,1,0,0                      // jne           5275 <_sk_load_rgb_u16_be_avx+0x11a>
   .byte  196,193,122,111,4,64                // vmovdqu       (%r8,%rax,2),%xmm0
   .byte  196,193,122,111,84,64,12            // vmovdqu       0xc(%r8,%rax,2),%xmm2
   .byte  196,193,122,111,76,64,24            // vmovdqu       0x18(%r8,%rax,2),%xmm1
@@ -17116,36 +17477,36 @@ _sk_load_rgb_u16_be_avx:
   .byte  196,193,121,110,4,64                // vmovd         (%r8,%rax,2),%xmm0
   .byte  196,193,121,196,68,64,4,2           // vpinsrw       $0x2,0x4(%r8,%rax,2),%xmm0,%xmm0
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  117,5                               // jne           4ef6 <_sk_load_rgb_u16_be_avx+0x133>
-  .byte  233,19,255,255,255                  // jmpq          4e09 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  117,5                               // jne           528e <_sk_load_rgb_u16_be_avx+0x133>
+  .byte  233,19,255,255,255                  // jmpq          51a1 <_sk_load_rgb_u16_be_avx+0x46>
   .byte  196,193,121,110,76,64,6             // vmovd         0x6(%r8,%rax,2),%xmm1
   .byte  196,65,113,196,68,64,10,2           // vpinsrw       $0x2,0xa(%r8,%rax,2),%xmm1,%xmm8
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,26                              // jb            4f25 <_sk_load_rgb_u16_be_avx+0x162>
+  .byte  114,26                              // jb            52bd <_sk_load_rgb_u16_be_avx+0x162>
   .byte  196,193,121,110,76,64,12            // vmovd         0xc(%r8,%rax,2),%xmm1
   .byte  196,193,113,196,84,64,16,2          // vpinsrw       $0x2,0x10(%r8,%rax,2),%xmm1,%xmm2
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  117,10                              // jne           4f2a <_sk_load_rgb_u16_be_avx+0x167>
-  .byte  233,228,254,255,255                 // jmpq          4e09 <_sk_load_rgb_u16_be_avx+0x46>
-  .byte  233,223,254,255,255                 // jmpq          4e09 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  117,10                              // jne           52c2 <_sk_load_rgb_u16_be_avx+0x167>
+  .byte  233,228,254,255,255                 // jmpq          51a1 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,223,254,255,255                 // jmpq          51a1 <_sk_load_rgb_u16_be_avx+0x46>
   .byte  196,193,121,110,76,64,18            // vmovd         0x12(%r8,%rax,2),%xmm1
   .byte  196,65,113,196,76,64,22,2           // vpinsrw       $0x2,0x16(%r8,%rax,2),%xmm1,%xmm9
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,26                              // jb            4f59 <_sk_load_rgb_u16_be_avx+0x196>
+  .byte  114,26                              // jb            52f1 <_sk_load_rgb_u16_be_avx+0x196>
   .byte  196,193,121,110,76,64,24            // vmovd         0x18(%r8,%rax,2),%xmm1
   .byte  196,193,113,196,76,64,28,2          // vpinsrw       $0x2,0x1c(%r8,%rax,2),%xmm1,%xmm1
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  117,10                              // jne           4f5e <_sk_load_rgb_u16_be_avx+0x19b>
-  .byte  233,176,254,255,255                 // jmpq          4e09 <_sk_load_rgb_u16_be_avx+0x46>
-  .byte  233,171,254,255,255                 // jmpq          4e09 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  117,10                              // jne           52f6 <_sk_load_rgb_u16_be_avx+0x19b>
+  .byte  233,176,254,255,255                 // jmpq          51a1 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,171,254,255,255                 // jmpq          51a1 <_sk_load_rgb_u16_be_avx+0x46>
   .byte  196,193,121,110,92,64,30            // vmovd         0x1e(%r8,%rax,2),%xmm3
   .byte  196,65,97,196,92,64,34,2            // vpinsrw       $0x2,0x22(%r8,%rax,2),%xmm3,%xmm11
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,20                              // jb            4f87 <_sk_load_rgb_u16_be_avx+0x1c4>
+  .byte  114,20                              // jb            531f <_sk_load_rgb_u16_be_avx+0x1c4>
   .byte  196,193,121,110,92,64,36            // vmovd         0x24(%r8,%rax,2),%xmm3
   .byte  196,193,97,196,92,64,40,2           // vpinsrw       $0x2,0x28(%r8,%rax,2),%xmm3,%xmm3
-  .byte  233,130,254,255,255                 // jmpq          4e09 <_sk_load_rgb_u16_be_avx+0x46>
-  .byte  233,125,254,255,255                 // jmpq          4e09 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,130,254,255,255                 // jmpq          51a1 <_sk_load_rgb_u16_be_avx+0x46>
+  .byte  233,125,254,255,255                 // jmpq          51a1 <_sk_load_rgb_u16_be_avx+0x46>
 
 HIDDEN _sk_store_u16_be_avx
 .globl _sk_store_u16_be_avx
@@ -17195,7 +17556,7 @@ _sk_store_u16_be_avx:
   .byte  196,65,17,98,200                    // vpunpckldq    %xmm8,%xmm13,%xmm9
   .byte  196,65,17,106,192                   // vpunpckhdq    %xmm8,%xmm13,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,31                              // jne           508e <_sk_store_u16_be_avx+0x102>
+  .byte  117,31                              // jne           5426 <_sk_store_u16_be_avx+0x102>
   .byte  196,1,120,17,28,72                  // vmovups       %xmm11,(%r8,%r9,2)
   .byte  196,1,120,17,84,72,16               // vmovups       %xmm10,0x10(%r8,%r9,2)
   .byte  196,1,120,17,76,72,32               // vmovups       %xmm9,0x20(%r8,%r9,2)
@@ -17204,22 +17565,22 @@ _sk_store_u16_be_avx:
   .byte  255,224                             // jmpq          *%rax
   .byte  196,1,121,214,28,72                 // vmovq         %xmm11,(%r8,%r9,2)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            508a <_sk_store_u16_be_avx+0xfe>
+  .byte  116,240                             // je            5422 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,23,92,72,8                // vmovhpd       %xmm11,0x8(%r8,%r9,2)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            508a <_sk_store_u16_be_avx+0xfe>
+  .byte  114,227                             // jb            5422 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,214,84,72,16              // vmovq         %xmm10,0x10(%r8,%r9,2)
-  .byte  116,218                             // je            508a <_sk_store_u16_be_avx+0xfe>
+  .byte  116,218                             // je            5422 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,23,84,72,24               // vmovhpd       %xmm10,0x18(%r8,%r9,2)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            508a <_sk_store_u16_be_avx+0xfe>
+  .byte  114,205                             // jb            5422 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,214,76,72,32              // vmovq         %xmm9,0x20(%r8,%r9,2)
-  .byte  116,196                             // je            508a <_sk_store_u16_be_avx+0xfe>
+  .byte  116,196                             // je            5422 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,23,76,72,40               // vmovhpd       %xmm9,0x28(%r8,%r9,2)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,183                             // jb            508a <_sk_store_u16_be_avx+0xfe>
+  .byte  114,183                             // jb            5422 <_sk_store_u16_be_avx+0xfe>
   .byte  196,1,121,214,68,72,48              // vmovq         %xmm8,0x30(%r8,%r9,2)
-  .byte  235,174                             // jmp           508a <_sk_store_u16_be_avx+0xfe>
+  .byte  235,174                             // jmp           5422 <_sk_store_u16_be_avx+0xfe>
 
 HIDDEN _sk_load_f32_avx
 .globl _sk_load_f32_avx
@@ -17227,10 +17588,10 @@ FUNCTION(_sk_load_f32_avx)
 _sk_load_f32_avx:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  119,110                             // ja            5152 <_sk_load_f32_avx+0x76>
+  .byte  119,110                             // ja            54ea <_sk_load_f32_avx+0x76>
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
-  .byte  76,141,21,134,0,0,0                 // lea           0x86(%rip),%r10        # 517c <_sk_load_f32_avx+0xa0>
+  .byte  76,141,21,134,0,0,0                 // lea           0x86(%rip),%r10        # 5514 <_sk_load_f32_avx+0xa0>
   .byte  73,99,4,138                         // movslq        (%r10,%rcx,4),%rax
   .byte  76,1,208                            // add           %r10,%rax
   .byte  255,224                             // jmpq          *%rax
@@ -17289,7 +17650,7 @@ _sk_store_f32_avx:
   .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
   .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           5209 <_sk_store_f32_avx+0x6d>
+  .byte  117,55                              // jne           55a1 <_sk_store_f32_avx+0x6d>
   .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -17302,22 +17663,22 @@ _sk_store_f32_avx:
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            5205 <_sk_store_f32_avx+0x69>
+  .byte  116,240                             // je            559d <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            5205 <_sk_store_f32_avx+0x69>
+  .byte  114,227                             // jb            559d <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            5205 <_sk_store_f32_avx+0x69>
+  .byte  116,218                             // je            559d <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            5205 <_sk_store_f32_avx+0x69>
+  .byte  114,205                             // jb            559d <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            5205 <_sk_store_f32_avx+0x69>
+  .byte  116,195                             // je            559d <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            5205 <_sk_store_f32_avx+0x69>
+  .byte  114,181                             // jb            559d <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           5205 <_sk_store_f32_avx+0x69>
+  .byte  235,171                             // jmp           559d <_sk_store_f32_avx+0x69>
 
 HIDDEN _sk_clamp_x_avx
 .globl _sk_clamp_x_avx
@@ -17645,7 +18006,7 @@ _sk_linear_gradient_avx:
   .byte  196,226,125,24,88,28                // vbroadcastss  0x1c(%rax),%ymm3
   .byte  76,139,0                            // mov           (%rax),%r8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  15,132,146,0,0,0                    // je            57bd <_sk_linear_gradient_avx+0xb8>
+  .byte  15,132,146,0,0,0                    // je            5b55 <_sk_linear_gradient_avx+0xb8>
   .byte  72,139,64,8                         // mov           0x8(%rax),%rax
   .byte  72,131,192,32                       // add           $0x20,%rax
   .byte  196,65,28,87,228                    // vxorps        %ymm12,%ymm12,%ymm12
@@ -17672,8 +18033,8 @@ _sk_linear_gradient_avx:
   .byte  196,227,13,74,219,208               // vblendvps     %ymm13,%ymm3,%ymm14,%ymm3
   .byte  72,131,192,36                       // add           $0x24,%rax
   .byte  73,255,200                          // dec           %r8
-  .byte  117,140                             // jne           5747 <_sk_linear_gradient_avx+0x42>
-  .byte  235,20                              // jmp           57d1 <_sk_linear_gradient_avx+0xcc>
+  .byte  117,140                             // jne           5adf <_sk_linear_gradient_avx+0x42>
+  .byte  235,20                              // jmp           5b69 <_sk_linear_gradient_avx+0xcc>
   .byte  196,65,36,87,219                    // vxorps        %ymm11,%ymm11,%ymm11
   .byte  196,65,44,87,210                    // vxorps        %ymm10,%ymm10,%ymm10
   .byte  196,65,52,87,201                    // vxorps        %ymm9,%ymm9,%ymm9
@@ -19447,99 +19808,282 @@ HIDDEN _sk_from_2dot2_sse41
 .globl _sk_from_2dot2_sse41
 FUNCTION(_sk_from_2dot2_sse41)
 _sk_from_2dot2_sse41:
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  65,15,82,192                        // rsqrtps       %xmm8,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  68,15,82,200                        // rsqrtps       %xmm0,%xmm9
-  .byte  65,15,82,193                        // rsqrtps       %xmm9,%xmm0
-  .byte  68,15,82,208                        // rsqrtps       %xmm0,%xmm10
-  .byte  69,15,89,192                        // mulps         %xmm8,%xmm8
+  .byte  15,41,124,36,232                    // movaps        %xmm7,-0x18(%rsp)
+  .byte  15,41,116,36,216                    // movaps        %xmm6,-0x28(%rsp)
+  .byte  15,41,108,36,200                    // movaps        %xmm5,-0x38(%rsp)
+  .byte  15,41,100,36,184                    // movaps        %xmm4,-0x48(%rsp)
+  .byte  15,41,92,36,168                     // movaps        %xmm3,-0x58(%rsp)
+  .byte  15,41,84,36,152                     // movaps        %xmm2,-0x68(%rsp)
+  .byte  15,40,209                           // movaps        %xmm1,%xmm2
+  .byte  184,205,204,12,64                   // mov           $0x400ccccd,%eax
+  .byte  15,91,216                           // cvtdq2ps      %xmm0,%xmm3
+  .byte  185,0,0,0,52                        // mov           $0x34000000,%ecx
+  .byte  102,68,15,110,209                   // movd          %ecx,%xmm10
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
+  .byte  65,15,89,218                        // mulps         %xmm10,%xmm3
+  .byte  185,255,255,127,0                   // mov           $0x7fffff,%ecx
+  .byte  102,15,110,201                      // movd          %ecx,%xmm1
+  .byte  102,68,15,112,193,0                 // pshufd        $0x0,%xmm1,%xmm8
+  .byte  65,15,84,192                        // andps         %xmm8,%xmm0
+  .byte  185,0,0,0,63                        // mov           $0x3f000000,%ecx
+  .byte  102,15,110,201                      // movd          %ecx,%xmm1
+  .byte  102,15,112,201,0                    // pshufd        $0x0,%xmm1,%xmm1
+  .byte  15,86,193                           // orps          %xmm1,%xmm0
+  .byte  15,40,241                           // movaps        %xmm1,%xmm6
+  .byte  15,41,116,36,136                    // movaps        %xmm6,-0x78(%rsp)
+  .byte  185,119,115,248,66                  // mov           $0x42f87377,%ecx
+  .byte  102,68,15,110,217                   // movd          %ecx,%xmm11
+  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
+  .byte  65,15,92,219                        // subps         %xmm11,%xmm3
+  .byte  185,117,191,191,63                  // mov           $0x3fbfbf75,%ecx
+  .byte  102,68,15,110,225                   // movd          %ecx,%xmm12
+  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
+  .byte  15,40,200                           // movaps        %xmm0,%xmm1
+  .byte  65,15,89,204                        // mulps         %xmm12,%xmm1
+  .byte  15,92,217                           // subps         %xmm1,%xmm3
+  .byte  185,163,233,220,63                  // mov           $0x3fdce9a3,%ecx
+  .byte  102,68,15,110,233                   // movd          %ecx,%xmm13
+  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
+  .byte  185,249,68,180,62                   // mov           $0x3eb444f9,%ecx
+  .byte  102,68,15,110,241                   // movd          %ecx,%xmm14
+  .byte  69,15,198,246,0                     // shufps        $0x0,%xmm14,%xmm14
+  .byte  65,15,88,198                        // addps         %xmm14,%xmm0
+  .byte  65,15,40,205                        // movaps        %xmm13,%xmm1
+  .byte  15,94,200                           // divps         %xmm0,%xmm1
+  .byte  15,92,217                           // subps         %xmm1,%xmm3
+  .byte  102,68,15,110,248                   // movd          %eax,%xmm15
+  .byte  69,15,198,255,0                     // shufps        $0x0,%xmm15,%xmm15
+  .byte  65,15,89,223                        // mulps         %xmm15,%xmm3
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d
+  .byte  185,81,140,242,66                   // mov           $0x42f28c51,%ecx
+  .byte  102,15,110,225                      // movd          %ecx,%xmm4
+  .byte  15,198,228,0                        // shufps        $0x0,%xmm4,%xmm4
+  .byte  15,40,204                           // movaps        %xmm4,%xmm1
+  .byte  15,88,203                           // addps         %xmm3,%xmm1
+  .byte  102,15,58,8,195,1                   // roundps       $0x1,%xmm3,%xmm0
+  .byte  15,92,216                           // subps         %xmm0,%xmm3
+  .byte  185,141,188,190,63                  // mov           $0x3fbebc8d,%ecx
+  .byte  102,68,15,110,201                   // movd          %ecx,%xmm9
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
   .byte  65,15,40,193                        // movaps        %xmm9,%xmm0
-  .byte  15,89,192                           // mulps         %xmm0,%xmm0
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,89,194                        // mulps         %xmm10,%xmm0
-  .byte  69,15,87,210                        // xorps         %xmm10,%xmm10
-  .byte  65,15,95,194                        // maxps         %xmm10,%xmm0
-  .byte  68,15,82,193                        // rsqrtps       %xmm1,%xmm8
-  .byte  69,15,82,192                        // rsqrtps       %xmm8,%xmm8
-  .byte  69,15,82,192                        // rsqrtps       %xmm8,%xmm8
-  .byte  69,15,82,200                        // rsqrtps       %xmm8,%xmm9
-  .byte  69,15,82,193                        // rsqrtps       %xmm9,%xmm8
-  .byte  69,15,82,216                        // rsqrtps       %xmm8,%xmm11
-  .byte  15,89,201                           // mulps         %xmm1,%xmm1
-  .byte  69,15,40,193                        // movaps        %xmm9,%xmm8
-  .byte  69,15,89,192                        // mulps         %xmm8,%xmm8
-  .byte  69,15,89,193                        // mulps         %xmm9,%xmm8
-  .byte  68,15,89,193                        // mulps         %xmm1,%xmm8
-  .byte  69,15,89,195                        // mulps         %xmm11,%xmm8
-  .byte  69,15,95,194                        // maxps         %xmm10,%xmm8
-  .byte  15,82,202                           // rsqrtps       %xmm2,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  68,15,82,217                        // rsqrtps       %xmm1,%xmm11
-  .byte  65,15,82,203                        // rsqrtps       %xmm11,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,89,210                           // mulps         %xmm2,%xmm2
-  .byte  69,15,40,203                        // movaps        %xmm11,%xmm9
-  .byte  69,15,89,201                        // mulps         %xmm9,%xmm9
-  .byte  69,15,89,203                        // mulps         %xmm11,%xmm9
-  .byte  68,15,89,202                        // mulps         %xmm2,%xmm9
+  .byte  15,89,195                           // mulps         %xmm3,%xmm0
+  .byte  15,92,200                           // subps         %xmm0,%xmm1
+  .byte  185,254,210,221,65                  // mov           $0x41ddd2fe,%ecx
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax
+  .byte  102,15,110,248                      // movd          %eax,%xmm7
+  .byte  15,198,255,0                        // shufps        $0x0,%xmm7,%xmm7
+  .byte  15,40,239                           // movaps        %xmm7,%xmm5
+  .byte  15,92,235                           // subps         %xmm3,%xmm5
+  .byte  102,15,110,193                      // movd          %ecx,%xmm0
+  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,94,221                           // divps         %xmm5,%xmm3
+  .byte  15,88,217                           // addps         %xmm1,%xmm3
+  .byte  15,91,202                           // cvtdq2ps      %xmm2,%xmm1
+  .byte  65,15,89,202                        // mulps         %xmm10,%xmm1
+  .byte  65,15,84,208                        // andps         %xmm8,%xmm2
+  .byte  15,86,214                           // orps          %xmm6,%xmm2
+  .byte  65,15,92,203                        // subps         %xmm11,%xmm1
+  .byte  15,40,234                           // movaps        %xmm2,%xmm5
+  .byte  65,15,89,236                        // mulps         %xmm12,%xmm5
+  .byte  15,92,205                           // subps         %xmm5,%xmm1
+  .byte  65,15,88,214                        // addps         %xmm14,%xmm2
+  .byte  65,15,40,237                        // movaps        %xmm13,%xmm5
+  .byte  15,94,234                           // divps         %xmm2,%xmm5
+  .byte  15,92,205                           // subps         %xmm5,%xmm1
+  .byte  65,15,89,207                        // mulps         %xmm15,%xmm1
+  .byte  15,40,236                           // movaps        %xmm4,%xmm5
+  .byte  15,88,233                           // addps         %xmm1,%xmm5
+  .byte  102,15,58,8,209,1                   // roundps       $0x1,%xmm1,%xmm2
+  .byte  15,92,202                           // subps         %xmm2,%xmm1
+  .byte  65,15,40,209                        // movaps        %xmm9,%xmm2
+  .byte  15,89,209                           // mulps         %xmm1,%xmm2
+  .byte  15,92,234                           // subps         %xmm2,%xmm5
+  .byte  15,40,247                           // movaps        %xmm7,%xmm6
+  .byte  15,92,241                           // subps         %xmm1,%xmm6
+  .byte  15,40,208                           // movaps        %xmm0,%xmm2
+  .byte  15,94,214                           // divps         %xmm6,%xmm2
+  .byte  15,88,213                           // addps         %xmm5,%xmm2
+  .byte  15,40,108,36,152                    // movaps        -0x68(%rsp),%xmm5
+  .byte  15,91,205                           // cvtdq2ps      %xmm5,%xmm1
+  .byte  65,15,89,202                        // mulps         %xmm10,%xmm1
+  .byte  68,15,84,197                        // andps         %xmm5,%xmm8
+  .byte  68,15,86,68,36,136                  // orps          -0x78(%rsp),%xmm8
+  .byte  65,15,92,203                        // subps         %xmm11,%xmm1
+  .byte  69,15,89,224                        // mulps         %xmm8,%xmm12
+  .byte  65,15,92,204                        // subps         %xmm12,%xmm1
+  .byte  69,15,88,198                        // addps         %xmm14,%xmm8
+  .byte  69,15,94,232                        // divps         %xmm8,%xmm13
+  .byte  65,15,92,205                        // subps         %xmm13,%xmm1
+  .byte  65,15,89,207                        // mulps         %xmm15,%xmm1
+  .byte  102,15,58,8,233,1                   // roundps       $0x1,%xmm1,%xmm5
+  .byte  15,88,225                           // addps         %xmm1,%xmm4
+  .byte  15,92,205                           // subps         %xmm5,%xmm1
   .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  69,15,95,202                        // maxps         %xmm10,%xmm9
+  .byte  65,15,92,225                        // subps         %xmm9,%xmm4
+  .byte  15,92,249                           // subps         %xmm1,%xmm7
+  .byte  15,94,199                           // divps         %xmm7,%xmm0
+  .byte  15,88,196                           // addps         %xmm4,%xmm0
+  .byte  102,65,15,110,200                   // movd          %r8d,%xmm1
+  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
+  .byte  15,89,217                           // mulps         %xmm1,%xmm3
+  .byte  15,89,209                           // mulps         %xmm1,%xmm2
+  .byte  15,89,193                           // mulps         %xmm1,%xmm0
+  .byte  102,15,91,219                       // cvtps2dq      %xmm3,%xmm3
+  .byte  102,15,91,202                       // cvtps2dq      %xmm2,%xmm1
+  .byte  102,15,91,208                       // cvtps2dq      %xmm0,%xmm2
   .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,200                        // movaps        %xmm8,%xmm1
-  .byte  65,15,40,209                        // movaps        %xmm9,%xmm2
+  .byte  102,15,40,195                       // movapd        %xmm3,%xmm0
+  .byte  15,40,92,36,168                     // movaps        -0x58(%rsp),%xmm3
+  .byte  15,40,100,36,184                    // movaps        -0x48(%rsp),%xmm4
+  .byte  15,40,108,36,200                    // movaps        -0x38(%rsp),%xmm5
+  .byte  15,40,116,36,216                    // movaps        -0x28(%rsp),%xmm6
+  .byte  15,40,124,36,232                    // movaps        -0x18(%rsp),%xmm7
   .byte  255,224                             // jmpq          *%rax
 
 HIDDEN _sk_to_2dot2_sse41
 .globl _sk_to_2dot2_sse41
 FUNCTION(_sk_to_2dot2_sse41)
 _sk_to_2dot2_sse41:
-  .byte  68,15,82,192                        // rsqrtps       %xmm0,%xmm8
-  .byte  65,15,82,192                        // rsqrtps       %xmm8,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  68,15,82,200                        // rsqrtps       %xmm0,%xmm9
-  .byte  69,15,83,192                        // rcpps         %xmm8,%xmm8
-  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
-  .byte  65,15,83,193                        // rcpps         %xmm9,%xmm0
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  65,15,95,192                        // maxps         %xmm8,%xmm0
-  .byte  68,15,82,201                        // rsqrtps       %xmm1,%xmm9
-  .byte  65,15,82,201                        // rsqrtps       %xmm9,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  68,15,82,209                        // rsqrtps       %xmm1,%xmm10
-  .byte  69,15,83,201                        // rcpps         %xmm9,%xmm9
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  65,15,83,202                        // rcpps         %xmm10,%xmm1
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  65,15,95,200                        // maxps         %xmm8,%xmm1
-  .byte  68,15,82,202                        // rsqrtps       %xmm2,%xmm9
-  .byte  65,15,82,209                        // rsqrtps       %xmm9,%xmm2
-  .byte  15,82,210                           // rsqrtps       %xmm2,%xmm2
-  .byte  15,82,210                           // rsqrtps       %xmm2,%xmm2
-  .byte  15,82,210                           // rsqrtps       %xmm2,%xmm2
-  .byte  68,15,82,210                        // rsqrtps       %xmm2,%xmm10
-  .byte  69,15,83,201                        // rcpps         %xmm9,%xmm9
-  .byte  68,15,89,202                        // mulps         %xmm2,%xmm9
-  .byte  65,15,83,210                        // rcpps         %xmm10,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  65,15,95,208                        // maxps         %xmm8,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-HIDDEN _sk_rgb_to_hsl_sse41
-.globl _sk_rgb_to_hsl_sse41
-FUNCTION(_sk_rgb_to_hsl_sse41)
-_sk_rgb_to_hsl_sse41:
   .byte  15,41,124,36,232                    // movaps        %xmm7,-0x18(%rsp)
-  .byte  15,40,254                           // movaps        %xmm6,%xmm7
+  .byte  15,41,116,36,216                    // movaps        %xmm6,-0x28(%rsp)
+  .byte  15,41,108,36,200                    // movaps        %xmm5,-0x38(%rsp)
+  .byte  15,41,100,36,184                    // movaps        %xmm4,-0x48(%rsp)
+  .byte  15,41,92,36,168                     // movaps        %xmm3,-0x58(%rsp)
+  .byte  15,41,84,36,152                     // movaps        %xmm2,-0x68(%rsp)
+  .byte  15,40,209                           // movaps        %xmm1,%xmm2
+  .byte  184,46,186,232,62                   // mov           $0x3ee8ba2e,%eax
+  .byte  15,91,216                           // cvtdq2ps      %xmm0,%xmm3
+  .byte  185,0,0,0,52                        // mov           $0x34000000,%ecx
+  .byte  102,68,15,110,209                   // movd          %ecx,%xmm10
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
+  .byte  65,15,89,218                        // mulps         %xmm10,%xmm3
+  .byte  185,255,255,127,0                   // mov           $0x7fffff,%ecx
+  .byte  102,15,110,201                      // movd          %ecx,%xmm1
+  .byte  102,68,15,112,193,0                 // pshufd        $0x0,%xmm1,%xmm8
+  .byte  65,15,84,192                        // andps         %xmm8,%xmm0
+  .byte  185,0,0,0,63                        // mov           $0x3f000000,%ecx
+  .byte  102,15,110,201                      // movd          %ecx,%xmm1
+  .byte  102,15,112,201,0                    // pshufd        $0x0,%xmm1,%xmm1
+  .byte  15,86,193                           // orps          %xmm1,%xmm0
+  .byte  15,40,241                           // movaps        %xmm1,%xmm6
+  .byte  15,41,116,36,136                    // movaps        %xmm6,-0x78(%rsp)
+  .byte  185,119,115,248,66                  // mov           $0x42f87377,%ecx
+  .byte  102,68,15,110,217                   // movd          %ecx,%xmm11
+  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
+  .byte  65,15,92,219                        // subps         %xmm11,%xmm3
+  .byte  185,117,191,191,63                  // mov           $0x3fbfbf75,%ecx
+  .byte  102,68,15,110,225                   // movd          %ecx,%xmm12
+  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
+  .byte  15,40,200                           // movaps        %xmm0,%xmm1
+  .byte  65,15,89,204                        // mulps         %xmm12,%xmm1
+  .byte  15,92,217                           // subps         %xmm1,%xmm3
+  .byte  185,163,233,220,63                  // mov           $0x3fdce9a3,%ecx
+  .byte  102,68,15,110,233                   // movd          %ecx,%xmm13
+  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
+  .byte  185,249,68,180,62                   // mov           $0x3eb444f9,%ecx
+  .byte  102,68,15,110,241                   // movd          %ecx,%xmm14
+  .byte  69,15,198,246,0                     // shufps        $0x0,%xmm14,%xmm14
+  .byte  65,15,88,198                        // addps         %xmm14,%xmm0
+  .byte  65,15,40,205                        // movaps        %xmm13,%xmm1
+  .byte  15,94,200                           // divps         %xmm0,%xmm1
+  .byte  15,92,217                           // subps         %xmm1,%xmm3
+  .byte  102,68,15,110,248                   // movd          %eax,%xmm15
+  .byte  69,15,198,255,0                     // shufps        $0x0,%xmm15,%xmm15
+  .byte  65,15,89,223                        // mulps         %xmm15,%xmm3
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d
+  .byte  185,81,140,242,66                   // mov           $0x42f28c51,%ecx
+  .byte  102,15,110,225                      // movd          %ecx,%xmm4
+  .byte  15,198,228,0                        // shufps        $0x0,%xmm4,%xmm4
+  .byte  15,40,204                           // movaps        %xmm4,%xmm1
+  .byte  15,88,203                           // addps         %xmm3,%xmm1
+  .byte  102,15,58,8,195,1                   // roundps       $0x1,%xmm3,%xmm0
+  .byte  15,92,216                           // subps         %xmm0,%xmm3
+  .byte  185,141,188,190,63                  // mov           $0x3fbebc8d,%ecx
+  .byte  102,68,15,110,201                   // movd          %ecx,%xmm9
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
+  .byte  65,15,40,193                        // movaps        %xmm9,%xmm0
+  .byte  15,89,195                           // mulps         %xmm3,%xmm0
+  .byte  15,92,200                           // subps         %xmm0,%xmm1
+  .byte  185,254,210,221,65                  // mov           $0x41ddd2fe,%ecx
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax
+  .byte  102,15,110,248                      // movd          %eax,%xmm7
+  .byte  15,198,255,0                        // shufps        $0x0,%xmm7,%xmm7
+  .byte  15,40,239                           // movaps        %xmm7,%xmm5
+  .byte  15,92,235                           // subps         %xmm3,%xmm5
+  .byte  102,15,110,193                      // movd          %ecx,%xmm0
+  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,94,221                           // divps         %xmm5,%xmm3
+  .byte  15,88,217                           // addps         %xmm1,%xmm3
+  .byte  15,91,202                           // cvtdq2ps      %xmm2,%xmm1
+  .byte  65,15,89,202                        // mulps         %xmm10,%xmm1
+  .byte  65,15,84,208                        // andps         %xmm8,%xmm2
+  .byte  15,86,214                           // orps          %xmm6,%xmm2
+  .byte  65,15,92,203                        // subps         %xmm11,%xmm1
+  .byte  15,40,234                           // movaps        %xmm2,%xmm5
+  .byte  65,15,89,236                        // mulps         %xmm12,%xmm5
+  .byte  15,92,205                           // subps         %xmm5,%xmm1
+  .byte  65,15,88,214                        // addps         %xmm14,%xmm2
+  .byte  65,15,40,237                        // movaps        %xmm13,%xmm5
+  .byte  15,94,234                           // divps         %xmm2,%xmm5
+  .byte  15,92,205                           // subps         %xmm5,%xmm1
+  .byte  65,15,89,207                        // mulps         %xmm15,%xmm1
+  .byte  15,40,236                           // movaps        %xmm4,%xmm5
+  .byte  15,88,233                           // addps         %xmm1,%xmm5
+  .byte  102,15,58,8,209,1                   // roundps       $0x1,%xmm1,%xmm2
+  .byte  15,92,202                           // subps         %xmm2,%xmm1
+  .byte  65,15,40,209                        // movaps        %xmm9,%xmm2
+  .byte  15,89,209                           // mulps         %xmm1,%xmm2
+  .byte  15,92,234                           // subps         %xmm2,%xmm5
+  .byte  15,40,247                           // movaps        %xmm7,%xmm6
+  .byte  15,92,241                           // subps         %xmm1,%xmm6
+  .byte  15,40,208                           // movaps        %xmm0,%xmm2
+  .byte  15,94,214                           // divps         %xmm6,%xmm2
+  .byte  15,88,213                           // addps         %xmm5,%xmm2
+  .byte  15,40,108,36,152                    // movaps        -0x68(%rsp),%xmm5
+  .byte  15,91,205                           // cvtdq2ps      %xmm5,%xmm1
+  .byte  65,15,89,202                        // mulps         %xmm10,%xmm1
+  .byte  68,15,84,197                        // andps         %xmm5,%xmm8
+  .byte  68,15,86,68,36,136                  // orps          -0x78(%rsp),%xmm8
+  .byte  65,15,92,203                        // subps         %xmm11,%xmm1
+  .byte  69,15,89,224                        // mulps         %xmm8,%xmm12
+  .byte  65,15,92,204                        // subps         %xmm12,%xmm1
+  .byte  69,15,88,198                        // addps         %xmm14,%xmm8
+  .byte  69,15,94,232                        // divps         %xmm8,%xmm13
+  .byte  65,15,92,205                        // subps         %xmm13,%xmm1
+  .byte  65,15,89,207                        // mulps         %xmm15,%xmm1
+  .byte  102,15,58,8,233,1                   // roundps       $0x1,%xmm1,%xmm5
+  .byte  15,88,225                           // addps         %xmm1,%xmm4
+  .byte  15,92,205                           // subps         %xmm5,%xmm1
+  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
+  .byte  65,15,92,225                        // subps         %xmm9,%xmm4
+  .byte  15,92,249                           // subps         %xmm1,%xmm7
+  .byte  15,94,199                           // divps         %xmm7,%xmm0
+  .byte  15,88,196                           // addps         %xmm4,%xmm0
+  .byte  102,65,15,110,200                   // movd          %r8d,%xmm1
+  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
+  .byte  15,89,217                           // mulps         %xmm1,%xmm3
+  .byte  15,89,209                           // mulps         %xmm1,%xmm2
+  .byte  15,89,193                           // mulps         %xmm1,%xmm0
+  .byte  102,15,91,219                       // cvtps2dq      %xmm3,%xmm3
+  .byte  102,15,91,202                       // cvtps2dq      %xmm2,%xmm1
+  .byte  102,15,91,208                       // cvtps2dq      %xmm0,%xmm2
+  .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  102,15,40,195                       // movapd        %xmm3,%xmm0
+  .byte  15,40,92,36,168                     // movaps        -0x58(%rsp),%xmm3
+  .byte  15,40,100,36,184                    // movaps        -0x48(%rsp),%xmm4
+  .byte  15,40,108,36,200                    // movaps        -0x38(%rsp),%xmm5
+  .byte  15,40,116,36,216                    // movaps        -0x28(%rsp),%xmm6
+  .byte  15,40,124,36,232                    // movaps        -0x18(%rsp),%xmm7
+  .byte  255,224                             // jmpq          *%rax
+
+HIDDEN _sk_rgb_to_hsl_sse41
+.globl _sk_rgb_to_hsl_sse41
+FUNCTION(_sk_rgb_to_hsl_sse41)
+_sk_rgb_to_hsl_sse41:
+  .byte  15,41,124,36,232                    // movaps        %xmm7,-0x18(%rsp)
+  .byte  15,40,254                           // movaps        %xmm6,%xmm7
   .byte  15,40,245                           // movaps        %xmm5,%xmm6
   .byte  15,40,236                           // movaps        %xmm4,%xmm5
   .byte  15,40,227                           // movaps        %xmm3,%xmm4
@@ -20993,9 +21537,9 @@ _sk_gather_i8_sse41:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  73,137,192                          // mov           %rax,%r8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  116,5                               // je            27eb <_sk_gather_i8_sse41+0xf>
+  .byte  116,5                               // je            2aed <_sk_gather_i8_sse41+0xf>
   .byte  76,137,192                          // mov           %r8,%rax
-  .byte  235,2                               // jmp           27ed <_sk_gather_i8_sse41+0x11>
+  .byte  235,2                               // jmp           2aef <_sk_gather_i8_sse41+0x11>
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  243,15,91,201                       // cvttps2dq     %xmm1,%xmm1
@@ -22228,7 +22772,7 @@ _sk_linear_gradient_sse41:
   .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
   .byte  72,139,8                            // mov           (%rax),%rcx
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,132,254,0,0,0                    // je            3bd6 <_sk_linear_gradient_sse41+0x138>
+  .byte  15,132,254,0,0,0                    // je            3ed8 <_sk_linear_gradient_sse41+0x138>
   .byte  15,41,100,36,168                    // movaps        %xmm4,-0x58(%rsp)
   .byte  15,41,108,36,184                    // movaps        %xmm5,-0x48(%rsp)
   .byte  15,41,116,36,200                    // movaps        %xmm6,-0x38(%rsp)
@@ -22278,12 +22822,12 @@ _sk_linear_gradient_sse41:
   .byte  15,40,196                           // movaps        %xmm4,%xmm0
   .byte  72,131,192,36                       // add           $0x24,%rax
   .byte  72,255,201                          // dec           %rcx
-  .byte  15,133,65,255,255,255               // jne           3b01 <_sk_linear_gradient_sse41+0x63>
+  .byte  15,133,65,255,255,255               // jne           3e03 <_sk_linear_gradient_sse41+0x63>
   .byte  15,40,124,36,216                    // movaps        -0x28(%rsp),%xmm7
   .byte  15,40,116,36,200                    // movaps        -0x38(%rsp),%xmm6
   .byte  15,40,108,36,184                    // movaps        -0x48(%rsp),%xmm5
   .byte  15,40,100,36,168                    // movaps        -0x58(%rsp),%xmm4
-  .byte  235,13                              // jmp           3be3 <_sk_linear_gradient_sse41+0x145>
+  .byte  235,13                              // jmp           3ee5 <_sk_linear_gradient_sse41+0x145>
   .byte  15,87,201                           // xorps         %xmm1,%xmm1
   .byte  15,87,210                           // xorps         %xmm2,%xmm2
   .byte  15,87,219                           // xorps         %xmm3,%xmm3
@@ -24067,91 +24611,322 @@ HIDDEN _sk_from_2dot2_sse2
 .globl _sk_from_2dot2_sse2
 FUNCTION(_sk_from_2dot2_sse2)
 _sk_from_2dot2_sse2:                         // 2.2 stage: runs the same approximate-power sequence on xmm0, xmm1 and xmm2 (commit message: "implement 2.2 stages with approx_powf")
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  65,15,82,192                        // rsqrtps       %xmm8,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  68,15,82,200                        // rsqrtps       %xmm0,%xmm9
-  .byte  65,15,82,193                        // rsqrtps       %xmm9,%xmm0
-  .byte  68,15,82,208                        // rsqrtps       %xmm0,%xmm10
-  .byte  69,15,89,192                        // mulps         %xmm8,%xmm8
-  .byte  65,15,40,193                        // movaps        %xmm9,%xmm0
-  .byte  15,89,192                           // mulps         %xmm0,%xmm0
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,89,194                        // mulps         %xmm10,%xmm0
-  .byte  69,15,87,210                        // xorps         %xmm10,%xmm10
-  .byte  65,15,95,194                        // maxps         %xmm10,%xmm0
-  .byte  68,15,82,193                        // rsqrtps       %xmm1,%xmm8
-  .byte  69,15,82,192                        // rsqrtps       %xmm8,%xmm8
-  .byte  69,15,82,192                        // rsqrtps       %xmm8,%xmm8
-  .byte  69,15,82,200                        // rsqrtps       %xmm8,%xmm9
-  .byte  69,15,82,193                        // rsqrtps       %xmm9,%xmm8
-  .byte  69,15,82,216                        // rsqrtps       %xmm8,%xmm11
-  .byte  15,89,201                           // mulps         %xmm1,%xmm1
-  .byte  69,15,40,193                        // movaps        %xmm9,%xmm8
-  .byte  69,15,89,192                        // mulps         %xmm8,%xmm8
-  .byte  69,15,89,193                        // mulps         %xmm9,%xmm8
-  .byte  68,15,89,193                        // mulps         %xmm1,%xmm8
-  .byte  69,15,89,195                        // mulps         %xmm11,%xmm8
-  .byte  69,15,95,194                        // maxps         %xmm10,%xmm8
-  .byte  15,82,202                           // rsqrtps       %xmm2,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  68,15,82,217                        // rsqrtps       %xmm1,%xmm11
-  .byte  65,15,82,203                        // rsqrtps       %xmm11,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,89,210                           // mulps         %xmm2,%xmm2
-  .byte  69,15,40,203                        // movaps        %xmm11,%xmm9
-  .byte  69,15,89,201                        // mulps         %xmm9,%xmm9
-  .byte  69,15,89,203                        // mulps         %xmm11,%xmm9
-  .byte  68,15,89,202                        // mulps         %xmm2,%xmm9
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  69,15,95,202                        // maxps         %xmm10,%xmm9
+  .byte  72,131,236,24                       // sub           $0x18,%rsp
+  .byte  15,41,60,36                         // movaps        %xmm7,(%rsp)  (spill xmm7..xmm3; all five are reloaded just before the final jmpq)
+  .byte  15,41,116,36,240                    // movaps        %xmm6,-0x10(%rsp)
+  .byte  15,41,108,36,224                    // movaps        %xmm5,-0x20(%rsp)
+  .byte  15,41,100,36,208                    // movaps        %xmm4,-0x30(%rsp)
+  .byte  15,41,92,36,192                     // movaps        %xmm3,-0x40(%rsp)
+  .byte  15,41,84,36,176                     // movaps        %xmm2,-0x50(%rsp)  (stash the xmm2 input; it is reloaded for the third pass below)
+  .byte  15,40,208                           // movaps        %xmm0,%xmm2  (first pass works on the xmm0 input)
+  .byte  184,205,204,12,64                   // mov           $0x400ccccd,%eax  (0x400ccccd = 2.2f, broadcast into xmm15 below)
+  .byte  15,91,194                           // cvtdq2ps      %xmm2,%xmm0  (converts the register's 32-bit lanes as integers, i.e. the raw float bit patterns)
+  .byte  185,0,0,0,52                        // mov           $0x34000000,%ecx  (0x34000000 = 2^-23)
+  .byte  102,15,110,217                      // movd          %ecx,%xmm3
+  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
+  .byte  15,89,195                           // mulps         %xmm3,%xmm0
+  .byte  68,15,40,219                        // movaps        %xmm3,%xmm11
+  .byte  68,15,41,92,36,144                  // movaps        %xmm11,-0x70(%rsp)  (keep 2^-23 on the stack for the third pass)
+  .byte  185,255,255,127,0                   // mov           $0x7fffff,%ecx  (0x7fffff = low 23 bits, the binary32 mantissa field)
+  .byte  102,15,110,217                      // movd          %ecx,%xmm3
+  .byte  102,68,15,112,195,0                 // pshufd        $0x0,%xmm3,%xmm8
+  .byte  65,15,84,208                        // andps         %xmm8,%xmm2
+  .byte  185,0,0,0,63                        // mov           $0x3f000000,%ecx  (0x3f000000 = 0.5f)
+  .byte  102,15,110,217                      // movd          %ecx,%xmm3
+  .byte  102,15,112,219,0                    // pshufd        $0x0,%xmm3,%xmm3
+  .byte  102,15,127,92,36,160                // movdqa        %xmm3,-0x60(%rsp)
+  .byte  15,86,211                           // orps          %xmm3,%xmm2  (mantissa bits OR'd with the bit pattern of 0.5f)
+  .byte  185,119,115,248,66                  // mov           $0x42f87377,%ecx  (~124.2255f)
+  .byte  102,15,110,233                      // movd          %ecx,%xmm5
+  .byte  15,198,237,0                        // shufps        $0x0,%xmm5,%xmm5
+  .byte  15,92,197                           // subps         %xmm5,%xmm0
+  .byte  15,41,108,36,128                    // movaps        %xmm5,-0x80(%rsp)
+  .byte  185,117,191,191,63                  // mov           $0x3fbfbf75,%ecx  (~1.49803f)
+  .byte  102,68,15,110,225                   // movd          %ecx,%xmm12
+  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
+  .byte  15,40,218                           // movaps        %xmm2,%xmm3
+  .byte  65,15,89,220                        // mulps         %xmm12,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0
+  .byte  185,163,233,220,63                  // mov           $0x3fdce9a3,%ecx  (~1.72588f)
+  .byte  102,68,15,110,233                   // movd          %ecx,%xmm13
+  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
+  .byte  185,249,68,180,62                   // mov           $0x3eb444f9,%ecx  (~0.35209f)
+  .byte  102,68,15,110,241                   // movd          %ecx,%xmm14
+  .byte  69,15,198,246,0                     // shufps        $0x0,%xmm14,%xmm14
+  .byte  65,15,88,214                        // addps         %xmm14,%xmm2
+  .byte  65,15,40,221                        // movaps        %xmm13,%xmm3
+  .byte  15,94,218                           // divps         %xmm2,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0  (NOTE(review): this looks like the log2 half of approx_powf -- confirm against SkJumper_stages.cpp)
+  .byte  102,68,15,110,248                   // movd          %eax,%xmm15
+  .byte  69,15,198,255,0                     // shufps        $0x0,%xmm15,%xmm15
+  .byte  65,15,89,199                        // mulps         %xmm15,%xmm0  (multiply by 2.2)
+  .byte  243,15,91,208                       // cvttps2dq     %xmm0,%xmm2  (truncate toward zero ...)
+  .byte  15,91,210                           // cvtdq2ps      %xmm2,%xmm2  (... and convert back to float)
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,194,218,1                        // cmpltps       %xmm2,%xmm3  (mask of lanes where x < trunc(x))
+  .byte  184,0,0,128,63                      // mov           $0x3f800000,%eax  (0x3f800000 = 1.0f)
+  .byte  102,68,15,110,208                   // movd          %eax,%xmm10
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
+  .byte  65,15,84,218                        // andps         %xmm10,%xmm3
+  .byte  15,92,211                           // subps         %xmm3,%xmm2  (subtract 1.0 in the masked lanes: xmm2 = floor(x))
+  .byte  15,40,224                           // movaps        %xmm0,%xmm4
+  .byte  15,92,226                           // subps         %xmm2,%xmm4  (xmm4 = x - floor(x))
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d  (0x4b000000 = 2^23, used for the final scale)
+  .byte  185,81,140,242,66                   // mov           $0x42f28c51,%ecx  (~121.274f)
+  .byte  102,68,15,110,201                   // movd          %ecx,%xmm9
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
+  .byte  185,141,188,190,63                  // mov           $0x3fbebc8d,%ecx  (~1.49013f)
+  .byte  102,15,110,249                      // movd          %ecx,%xmm7
+  .byte  15,198,255,0                        // shufps        $0x0,%xmm7,%xmm7
+  .byte  15,40,215                           // movaps        %xmm7,%xmm2
+  .byte  15,89,212                           // mulps         %xmm4,%xmm2
+  .byte  15,92,194                           // subps         %xmm2,%xmm0
+  .byte  185,254,210,221,65                  // mov           $0x41ddd2fe,%ecx  (~27.728f)
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax  (~4.8425f)
+  .byte  102,15,110,240                      // movd          %eax,%xmm6
+  .byte  15,198,246,0                        // shufps        $0x0,%xmm6,%xmm6
+  .byte  15,40,222                           // movaps        %xmm6,%xmm3
+  .byte  15,92,220                           // subps         %xmm4,%xmm3
+  .byte  102,15,110,209                      // movd          %ecx,%xmm2
+  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
+  .byte  15,40,226                           // movaps        %xmm2,%xmm4
+  .byte  15,94,227                           // divps         %xmm3,%xmm4
+  .byte  15,88,224                           // addps         %xmm0,%xmm4  (first-pass result held in xmm4 until the final scale)
+  .byte  15,91,193                           // cvtdq2ps      %xmm1,%xmm0  (second pass: same sequence on the xmm1 input, reusing the broadcast constants)
+  .byte  65,15,89,195                        // mulps         %xmm11,%xmm0
+  .byte  65,15,84,200                        // andps         %xmm8,%xmm1
+  .byte  68,15,40,92,36,160                  // movaps        -0x60(%rsp),%xmm11
+  .byte  65,15,86,203                        // orps          %xmm11,%xmm1
+  .byte  15,92,197                           // subps         %xmm5,%xmm0
+  .byte  15,40,217                           // movaps        %xmm1,%xmm3
+  .byte  65,15,89,220                        // mulps         %xmm12,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0
+  .byte  65,15,88,206                        // addps         %xmm14,%xmm1
+  .byte  65,15,40,221                        // movaps        %xmm13,%xmm3
+  .byte  15,94,217                           // divps         %xmm1,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0
+  .byte  65,15,89,199                        // mulps         %xmm15,%xmm0
+  .byte  243,15,91,200                       // cvttps2dq     %xmm0,%xmm1
+  .byte  15,91,201                           // cvtdq2ps      %xmm1,%xmm1
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,194,217,1                        // cmpltps       %xmm1,%xmm3
+  .byte  65,15,84,218                        // andps         %xmm10,%xmm3
+  .byte  15,92,203                           // subps         %xmm3,%xmm1
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,92,217                           // subps         %xmm1,%xmm3
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
+  .byte  15,40,207                           // movaps        %xmm7,%xmm1
+  .byte  15,89,203                           // mulps         %xmm3,%xmm1
+  .byte  15,92,193                           // subps         %xmm1,%xmm0
+  .byte  15,40,238                           // movaps        %xmm6,%xmm5
+  .byte  15,92,235                           // subps         %xmm3,%xmm5
+  .byte  15,40,202                           // movaps        %xmm2,%xmm1
+  .byte  15,94,205                           // divps         %xmm5,%xmm1
+  .byte  15,88,200                           // addps         %xmm0,%xmm1  (second-pass result held in xmm1)
+  .byte  15,40,92,36,176                     // movaps        -0x50(%rsp),%xmm3  (third pass: reload the stashed xmm2 input)
+  .byte  15,91,195                           // cvtdq2ps      %xmm3,%xmm0
+  .byte  15,89,68,36,144                     // mulps         -0x70(%rsp),%xmm0
+  .byte  68,15,84,195                        // andps         %xmm3,%xmm8  (last pass, so the constant registers are overwritten in place from here on)
+  .byte  69,15,86,195                        // orps          %xmm11,%xmm8
+  .byte  15,92,68,36,128                     // subps         -0x80(%rsp),%xmm0
+  .byte  69,15,89,224                        // mulps         %xmm8,%xmm12
+  .byte  65,15,92,196                        // subps         %xmm12,%xmm0
+  .byte  69,15,88,198                        // addps         %xmm14,%xmm8
+  .byte  69,15,94,232                        // divps         %xmm8,%xmm13
+  .byte  65,15,92,197                        // subps         %xmm13,%xmm0
+  .byte  65,15,89,199                        // mulps         %xmm15,%xmm0
+  .byte  243,15,91,216                       // cvttps2dq     %xmm0,%xmm3
+  .byte  15,91,219                           // cvtdq2ps      %xmm3,%xmm3
+  .byte  15,40,232                           // movaps        %xmm0,%xmm5
+  .byte  15,194,235,1                        // cmpltps       %xmm3,%xmm5
+  .byte  65,15,84,234                        // andps         %xmm10,%xmm5
+  .byte  15,92,221                           // subps         %xmm5,%xmm3
+  .byte  15,40,232                           // movaps        %xmm0,%xmm5
+  .byte  15,92,235                           // subps         %xmm3,%xmm5
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
+  .byte  15,89,253                           // mulps         %xmm5,%xmm7
+  .byte  15,92,199                           // subps         %xmm7,%xmm0
+  .byte  15,92,245                           // subps         %xmm5,%xmm6
+  .byte  15,94,214                           // divps         %xmm6,%xmm2
+  .byte  15,88,208                           // addps         %xmm0,%xmm2  (third-pass result held in xmm2)
+  .byte  102,65,15,110,192                   // movd          %r8d,%xmm0
+  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
+  .byte  15,89,224                           // mulps         %xmm0,%xmm4  (scale all three results by 2^23 ...)
+  .byte  15,89,200                           // mulps         %xmm0,%xmm1
+  .byte  15,89,208                           // mulps         %xmm0,%xmm2
+  .byte  102,15,91,220                       // cvtps2dq      %xmm4,%xmm3  (... and convert to integers, which are then handed on as the float bit patterns)
+  .byte  102,15,91,201                       // cvtps2dq      %xmm1,%xmm1
+  .byte  102,15,91,210                       // cvtps2dq      %xmm2,%xmm2
   .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,200                        // movaps        %xmm8,%xmm1
-  .byte  65,15,40,209                        // movaps        %xmm9,%xmm2
+  .byte  102,15,40,195                       // movapd        %xmm3,%xmm0  (move the first-pass result into xmm0 before xmm3 is restored)
+  .byte  15,40,92,36,192                     // movaps        -0x40(%rsp),%xmm3  (restore the spilled xmm3..xmm7)
+  .byte  15,40,100,36,208                    // movaps        -0x30(%rsp),%xmm4
+  .byte  15,40,108,36,224                    // movaps        -0x20(%rsp),%xmm5
+  .byte  15,40,116,36,240                    // movaps        -0x10(%rsp),%xmm6
+  .byte  15,40,60,36                         // movaps        (%rsp),%xmm7
+  .byte  72,131,196,24                       // add           $0x18,%rsp  (undo the sub at entry)
   .byte  255,224                             // jmpq          *%rax
 
 HIDDEN _sk_to_2dot2_sse2
 .globl _sk_to_2dot2_sse2
 FUNCTION(_sk_to_2dot2_sse2)
 _sk_to_2dot2_sse2:
-  .byte  68,15,82,192                        // rsqrtps       %xmm0,%xmm8
-  .byte  65,15,82,192                        // rsqrtps       %xmm8,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  68,15,82,200                        // rsqrtps       %xmm0,%xmm9
-  .byte  69,15,83,192                        // rcpps         %xmm8,%xmm8
-  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
-  .byte  65,15,83,193                        // rcpps         %xmm9,%xmm0
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  65,15,95,192                        // maxps         %xmm8,%xmm0
-  .byte  68,15,82,201                        // rsqrtps       %xmm1,%xmm9
-  .byte  65,15,82,201                        // rsqrtps       %xmm9,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  15,82,201                           // rsqrtps       %xmm1,%xmm1
-  .byte  68,15,82,209                        // rsqrtps       %xmm1,%xmm10
-  .byte  69,15,83,201                        // rcpps         %xmm9,%xmm9
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  65,15,83,202                        // rcpps         %xmm10,%xmm1
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  65,15,95,200                        // maxps         %xmm8,%xmm1
-  .byte  68,15,82,202                        // rsqrtps       %xmm2,%xmm9
-  .byte  65,15,82,209                        // rsqrtps       %xmm9,%xmm2
-  .byte  15,82,210                           // rsqrtps       %xmm2,%xmm2
-  .byte  15,82,210                           // rsqrtps       %xmm2,%xmm2
-  .byte  15,82,210                           // rsqrtps       %xmm2,%xmm2
-  .byte  68,15,82,210                        // rsqrtps       %xmm2,%xmm10
-  .byte  69,15,83,201                        // rcpps         %xmm9,%xmm9
-  .byte  68,15,89,202                        // mulps         %xmm2,%xmm9
-  .byte  65,15,83,210                        // rcpps         %xmm10,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  65,15,95,208                        // maxps         %xmm8,%xmm2
+  .byte  72,131,236,24                       // sub           $0x18,%rsp
+  .byte  15,41,60,36                         // movaps        %xmm7,(%rsp)
+  .byte  15,41,116,36,240                    // movaps        %xmm6,-0x10(%rsp)
+  .byte  15,41,108,36,224                    // movaps        %xmm5,-0x20(%rsp)
+  .byte  15,41,100,36,208                    // movaps        %xmm4,-0x30(%rsp)
+  .byte  15,41,92,36,192                     // movaps        %xmm3,-0x40(%rsp)
+  .byte  15,41,84,36,176                     // movaps        %xmm2,-0x50(%rsp)
+  .byte  15,40,208                           // movaps        %xmm0,%xmm2
+  .byte  184,46,186,232,62                   // mov           $0x3ee8ba2e,%eax
+  .byte  15,91,194                           // cvtdq2ps      %xmm2,%xmm0
+  .byte  185,0,0,0,52                        // mov           $0x34000000,%ecx
+  .byte  102,15,110,217                      // movd          %ecx,%xmm3
+  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
+  .byte  15,89,195                           // mulps         %xmm3,%xmm0
+  .byte  68,15,40,219                        // movaps        %xmm3,%xmm11
+  .byte  68,15,41,92,36,144                  // movaps        %xmm11,-0x70(%rsp)
+  .byte  185,255,255,127,0                   // mov           $0x7fffff,%ecx
+  .byte  102,15,110,217                      // movd          %ecx,%xmm3
+  .byte  102,68,15,112,195,0                 // pshufd        $0x0,%xmm3,%xmm8
+  .byte  65,15,84,208                        // andps         %xmm8,%xmm2
+  .byte  185,0,0,0,63                        // mov           $0x3f000000,%ecx
+  .byte  102,15,110,217                      // movd          %ecx,%xmm3
+  .byte  102,15,112,219,0                    // pshufd        $0x0,%xmm3,%xmm3
+  .byte  102,15,127,92,36,160                // movdqa        %xmm3,-0x60(%rsp)
+  .byte  15,86,211                           // orps          %xmm3,%xmm2
+  .byte  185,119,115,248,66                  // mov           $0x42f87377,%ecx
+  .byte  102,15,110,233                      // movd          %ecx,%xmm5
+  .byte  15,198,237,0                        // shufps        $0x0,%xmm5,%xmm5
+  .byte  15,92,197                           // subps         %xmm5,%xmm0
+  .byte  15,41,108,36,128                    // movaps        %xmm5,-0x80(%rsp)
+  .byte  185,117,191,191,63                  // mov           $0x3fbfbf75,%ecx
+  .byte  102,68,15,110,225                   // movd          %ecx,%xmm12
+  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
+  .byte  15,40,218                           // movaps        %xmm2,%xmm3
+  .byte  65,15,89,220                        // mulps         %xmm12,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0
+  .byte  185,163,233,220,63                  // mov           $0x3fdce9a3,%ecx
+  .byte  102,68,15,110,233                   // movd          %ecx,%xmm13
+  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
+  .byte  185,249,68,180,62                   // mov           $0x3eb444f9,%ecx
+  .byte  102,68,15,110,241                   // movd          %ecx,%xmm14
+  .byte  69,15,198,246,0                     // shufps        $0x0,%xmm14,%xmm14
+  .byte  65,15,88,214                        // addps         %xmm14,%xmm2
+  .byte  65,15,40,221                        // movaps        %xmm13,%xmm3
+  .byte  15,94,218                           // divps         %xmm2,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0
+  .byte  102,68,15,110,248                   // movd          %eax,%xmm15
+  .byte  69,15,198,255,0                     // shufps        $0x0,%xmm15,%xmm15
+  .byte  65,15,89,199                        // mulps         %xmm15,%xmm0
+  .byte  243,15,91,208                       // cvttps2dq     %xmm0,%xmm2
+  .byte  15,91,210                           // cvtdq2ps      %xmm2,%xmm2
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,194,218,1                        // cmpltps       %xmm2,%xmm3
+  .byte  184,0,0,128,63                      // mov           $0x3f800000,%eax
+  .byte  102,68,15,110,208                   // movd          %eax,%xmm10
+  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
+  .byte  65,15,84,218                        // andps         %xmm10,%xmm3
+  .byte  15,92,211                           // subps         %xmm3,%xmm2
+  .byte  15,40,224                           // movaps        %xmm0,%xmm4
+  .byte  15,92,226                           // subps         %xmm2,%xmm4
+  .byte  65,184,0,0,0,75                     // mov           $0x4b000000,%r8d
+  .byte  185,81,140,242,66                   // mov           $0x42f28c51,%ecx
+  .byte  102,68,15,110,201                   // movd          %ecx,%xmm9
+  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
+  .byte  185,141,188,190,63                  // mov           $0x3fbebc8d,%ecx
+  .byte  102,15,110,249                      // movd          %ecx,%xmm7
+  .byte  15,198,255,0                        // shufps        $0x0,%xmm7,%xmm7
+  .byte  15,40,215                           // movaps        %xmm7,%xmm2
+  .byte  15,89,212                           // mulps         %xmm4,%xmm2
+  .byte  15,92,194                           // subps         %xmm2,%xmm0
+  .byte  185,254,210,221,65                  // mov           $0x41ddd2fe,%ecx
+  .byte  184,248,245,154,64                  // mov           $0x409af5f8,%eax
+  .byte  102,15,110,240                      // movd          %eax,%xmm6
+  .byte  15,198,246,0                        // shufps        $0x0,%xmm6,%xmm6
+  .byte  15,40,222                           // movaps        %xmm6,%xmm3
+  .byte  15,92,220                           // subps         %xmm4,%xmm3
+  .byte  102,15,110,209                      // movd          %ecx,%xmm2
+  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
+  .byte  15,40,226                           // movaps        %xmm2,%xmm4
+  .byte  15,94,227                           // divps         %xmm3,%xmm4
+  .byte  15,88,224                           // addps         %xmm0,%xmm4
+  .byte  15,91,193                           // cvtdq2ps      %xmm1,%xmm0
+  .byte  65,15,89,195                        // mulps         %xmm11,%xmm0
+  .byte  65,15,84,200                        // andps         %xmm8,%xmm1
+  .byte  68,15,40,92,36,160                  // movaps        -0x60(%rsp),%xmm11
+  .byte  65,15,86,203                        // orps          %xmm11,%xmm1
+  .byte  15,92,197                           // subps         %xmm5,%xmm0
+  .byte  15,40,217                           // movaps        %xmm1,%xmm3
+  .byte  65,15,89,220                        // mulps         %xmm12,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0
+  .byte  65,15,88,206                        // addps         %xmm14,%xmm1
+  .byte  65,15,40,221                        // movaps        %xmm13,%xmm3
+  .byte  15,94,217                           // divps         %xmm1,%xmm3
+  .byte  15,92,195                           // subps         %xmm3,%xmm0
+  .byte  65,15,89,199                        // mulps         %xmm15,%xmm0
+  .byte  243,15,91,200                       // cvttps2dq     %xmm0,%xmm1
+  .byte  15,91,201                           // cvtdq2ps      %xmm1,%xmm1
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,194,217,1                        // cmpltps       %xmm1,%xmm3
+  .byte  65,15,84,218                        // andps         %xmm10,%xmm3
+  .byte  15,92,203                           // subps         %xmm3,%xmm1
+  .byte  15,40,216                           // movaps        %xmm0,%xmm3
+  .byte  15,92,217                           // subps         %xmm1,%xmm3
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
+  .byte  15,40,207                           // movaps        %xmm7,%xmm1
+  .byte  15,89,203                           // mulps         %xmm3,%xmm1
+  .byte  15,92,193                           // subps         %xmm1,%xmm0
+  .byte  15,40,238                           // movaps        %xmm6,%xmm5
+  .byte  15,92,235                           // subps         %xmm3,%xmm5
+  .byte  15,40,202                           // movaps        %xmm2,%xmm1
+  .byte  15,94,205                           // divps         %xmm5,%xmm1
+  .byte  15,88,200                           // addps         %xmm0,%xmm1
+  .byte  15,40,92,36,176                     // movaps        -0x50(%rsp),%xmm3
+  .byte  15,91,195                           // cvtdq2ps      %xmm3,%xmm0
+  .byte  15,89,68,36,144                     // mulps         -0x70(%rsp),%xmm0
+  .byte  68,15,84,195                        // andps         %xmm3,%xmm8
+  .byte  69,15,86,195                        // orps          %xmm11,%xmm8
+  .byte  15,92,68,36,128                     // subps         -0x80(%rsp),%xmm0
+  .byte  69,15,89,224                        // mulps         %xmm8,%xmm12
+  .byte  65,15,92,196                        // subps         %xmm12,%xmm0
+  .byte  69,15,88,198                        // addps         %xmm14,%xmm8
+  .byte  69,15,94,232                        // divps         %xmm8,%xmm13
+  .byte  65,15,92,197                        // subps         %xmm13,%xmm0
+  .byte  65,15,89,199                        // mulps         %xmm15,%xmm0
+  .byte  243,15,91,216                       // cvttps2dq     %xmm0,%xmm3
+  .byte  15,91,219                           // cvtdq2ps      %xmm3,%xmm3
+  .byte  15,40,232                           // movaps        %xmm0,%xmm5
+  .byte  15,194,235,1                        // cmpltps       %xmm3,%xmm5
+  .byte  65,15,84,234                        // andps         %xmm10,%xmm5
+  .byte  15,92,221                           // subps         %xmm5,%xmm3
+  .byte  15,40,232                           // movaps        %xmm0,%xmm5
+  .byte  15,92,235                           // subps         %xmm3,%xmm5
+  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
+  .byte  15,89,253                           // mulps         %xmm5,%xmm7
+  .byte  15,92,199                           // subps         %xmm7,%xmm0
+  .byte  15,92,245                           // subps         %xmm5,%xmm6
+  .byte  15,94,214                           // divps         %xmm6,%xmm2
+  .byte  15,88,208                           // addps         %xmm0,%xmm2
+  .byte  102,65,15,110,192                   // movd          %r8d,%xmm0
+  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
+  .byte  15,89,224                           // mulps         %xmm0,%xmm4
+  .byte  15,89,200                           // mulps         %xmm0,%xmm1
+  .byte  15,89,208                           // mulps         %xmm0,%xmm2
+  .byte  102,15,91,220                       // cvtps2dq      %xmm4,%xmm3
+  .byte  102,15,91,201                       // cvtps2dq      %xmm1,%xmm1
+  .byte  102,15,91,210                       // cvtps2dq      %xmm2,%xmm2
   .byte  72,173                              // lods          %ds:(%rsi),%rax
+  .byte  102,15,40,195                       // movapd        %xmm3,%xmm0
+  .byte  15,40,92,36,192                     // movaps        -0x40(%rsp),%xmm3
+  .byte  15,40,100,36,208                    // movaps        -0x30(%rsp),%xmm4
+  .byte  15,40,108,36,224                    // movaps        -0x20(%rsp),%xmm5
+  .byte  15,40,116,36,240                    // movaps        -0x10(%rsp),%xmm6
+  .byte  15,40,60,36                         // movaps        (%rsp),%xmm7
+  .byte  72,131,196,24                       // add           $0x18,%rsp
   .byte  255,224                             // jmpq          *%rax
 
 HIDDEN _sk_rgb_to_hsl_sse2
@@ -25765,9 +26540,9 @@ _sk_gather_i8_sse2:
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  73,137,192                          // mov           %rax,%r8
   .byte  77,133,192                          // test          %r8,%r8
-  .byte  116,5                               // je            2a02 <_sk_gather_i8_sse2+0xf>
+  .byte  116,5                               // je            2da6 <_sk_gather_i8_sse2+0xf>
   .byte  76,137,192                          // mov           %r8,%rax
-  .byte  235,2                               // jmp           2a04 <_sk_gather_i8_sse2+0x11>
+  .byte  235,2                               // jmp           2da8 <_sk_gather_i8_sse2+0x11>
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  76,139,16                           // mov           (%rax),%r10
   .byte  243,15,91,201                       // cvttps2dq     %xmm1,%xmm1
@@ -27107,7 +27882,7 @@ _sk_linear_gradient_sse2:
   .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
   .byte  72,139,8                            // mov           (%rax),%rcx
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,132,15,1,0,0                     // je            3fbb <_sk_linear_gradient_sse2+0x149>
+  .byte  15,132,15,1,0,0                     // je            435f <_sk_linear_gradient_sse2+0x149>
   .byte  72,139,64,8                         // mov           0x8(%rax),%rax
   .byte  72,131,192,32                       // add           $0x20,%rax
   .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
@@ -27168,8 +27943,8 @@ _sk_linear_gradient_sse2:
   .byte  69,15,86,231                        // orps          %xmm15,%xmm12
   .byte  72,131,192,36                       // add           $0x24,%rax
   .byte  72,255,201                          // dec           %rcx
-  .byte  15,133,8,255,255,255                // jne           3ec1 <_sk_linear_gradient_sse2+0x4f>
-  .byte  235,13                              // jmp           3fc8 <_sk_linear_gradient_sse2+0x156>
+  .byte  15,133,8,255,255,255                // jne           4265 <_sk_linear_gradient_sse2+0x4f>
+  .byte  235,13                              // jmp           436c <_sk_linear_gradient_sse2+0x156>
   .byte  15,87,201                           // xorps         %xmm1,%xmm1
   .byte  15,87,210                           // xorps         %xmm2,%xmm2
   .byte  15,87,219                           // xorps         %xmm3,%xmm3
index 1a6fe0c..0188dd4 100644 (file)
@@ -938,83 +938,230 @@ _sk_to_srgb_hsw LABEL PROC
 
 PUBLIC _sk_from_2dot2_hsw
 _sk_from_2dot2_hsw LABEL PROC
-  DB  197,124,82,192                      ; vrsqrtps      %ymm0,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,200                   ; vrsqrtps      %ymm8,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  197,252,89,192                      ; vmulps        %ymm0,%ymm0,%ymm0
-  DB  196,65,60,89,208                    ; vmulps        %ymm8,%ymm8,%ymm10
-  DB  196,65,60,89,194                    ; vmulps        %ymm10,%ymm8,%ymm8
-  DB  196,193,124,89,192                  ; vmulps        %ymm8,%ymm0,%ymm0
-  DB  197,180,89,192                      ; vmulps        %ymm0,%ymm9,%ymm0
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,193,124,95,192                  ; vmaxps        %ymm8,%ymm0,%ymm0
-  DB  197,124,82,201                      ; vrsqrtps      %ymm1,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  196,65,124,82,210                   ; vrsqrtps      %ymm10,%ymm10
-  DB  197,244,89,201                      ; vmulps        %ymm1,%ymm1,%ymm1
-  DB  196,65,52,89,217                    ; vmulps        %ymm9,%ymm9,%ymm11
-  DB  196,65,52,89,203                    ; vmulps        %ymm11,%ymm9,%ymm9
-  DB  196,193,116,89,201                  ; vmulps        %ymm9,%ymm1,%ymm1
-  DB  197,172,89,201                      ; vmulps        %ymm1,%ymm10,%ymm1
-  DB  196,193,116,95,200                  ; vmaxps        %ymm8,%ymm1,%ymm1
-  DB  197,124,82,202                      ; vrsqrtps      %ymm2,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  196,65,124,82,210                   ; vrsqrtps      %ymm10,%ymm10
-  DB  197,236,89,210                      ; vmulps        %ymm2,%ymm2,%ymm2
-  DB  196,65,52,89,217                    ; vmulps        %ymm9,%ymm9,%ymm11
-  DB  196,65,52,89,203                    ; vmulps        %ymm11,%ymm9,%ymm9
-  DB  196,193,108,89,209                  ; vmulps        %ymm9,%ymm2,%ymm2
-  DB  197,172,89,210                      ; vmulps        %ymm2,%ymm10,%ymm2
-  DB  196,193,108,95,208                  ; vmaxps        %ymm8,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,129,236,216,0,0,0                ; sub           $0xd8,%rsp
+  DB  197,252,17,188,36,160,0,0,0         ; vmovups       %ymm7,0xa0(%rsp)
+  DB  197,252,17,180,36,128,0,0,0         ; vmovups       %ymm6,0x80(%rsp)
+  DB  197,252,17,108,36,96                ; vmovups       %ymm5,0x60(%rsp)
+  DB  197,252,17,100,36,64                ; vmovups       %ymm4,0x40(%rsp)
+  DB  197,252,17,92,36,32                 ; vmovups       %ymm3,0x20(%rsp)
+  DB  197,124,40,225                      ; vmovaps       %ymm1,%ymm12
+  DB  65,184,205,204,12,64                ; mov           $0x400ccccd,%r8d
+  DB  197,124,91,208                      ; vcvtdq2ps     %ymm0,%ymm10
+  DB  184,0,0,0,52                        ; mov           $0x34000000,%eax
+  DB  197,121,110,192                     ; vmovd         %eax,%xmm8
+  DB  196,66,125,88,216                   ; vpbroadcastd  %xmm8,%ymm11
+  DB  184,255,255,127,0                   ; mov           $0x7fffff,%eax
+  DB  197,121,110,192                     ; vmovd         %eax,%xmm8
+  DB  196,194,125,88,216                  ; vpbroadcastd  %xmm8,%ymm3
+  DB  197,254,127,28,36                   ; vmovdqu       %ymm3,(%rsp)
+  DB  197,101,219,200                     ; vpand         %ymm0,%ymm3,%ymm9
+  DB  184,0,0,0,63                        ; mov           $0x3f000000,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,98,125,88,248                   ; vpbroadcastd  %xmm0,%ymm15
+  DB  196,193,53,235,223                  ; vpor          %ymm15,%ymm9,%ymm3
+  DB  184,119,115,248,66                  ; mov           $0x42f87377,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,98,125,88,232                   ; vpbroadcastd  %xmm0,%ymm13
+  DB  196,66,37,170,213                   ; vfmsub213ps   %ymm13,%ymm11,%ymm10
+  DB  184,117,191,191,63                  ; mov           $0x3fbfbf75,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,98,125,88,200                   ; vpbroadcastd  %xmm0,%ymm9
+  DB  196,66,101,188,209                  ; vfnmadd231ps  %ymm9,%ymm3,%ymm10
+  DB  184,163,233,220,63                  ; mov           $0x3fdce9a3,%eax
+  DB  196,65,124,91,244                   ; vcvtdq2ps     %ymm12,%ymm14
+  DB  196,66,37,170,245                   ; vfmsub213ps   %ymm13,%ymm11,%ymm14
+  DB  197,252,91,202                      ; vcvtdq2ps     %ymm2,%ymm1
+  DB  197,124,40,194                      ; vmovaps       %ymm2,%ymm8
+  DB  196,194,37,170,205                  ; vfmsub213ps   %ymm13,%ymm11,%ymm1
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,226,125,88,192                  ; vpbroadcastd  %xmm0,%ymm0
+  DB  184,249,68,180,62                   ; mov           $0x3eb444f9,%eax
+  DB  197,249,110,248                     ; vmovd         %eax,%xmm7
+  DB  196,226,125,88,255                  ; vpbroadcastd  %xmm7,%ymm7
+  DB  197,100,88,223                      ; vaddps        %ymm7,%ymm3,%ymm11
+  DB  196,65,124,94,219                   ; vdivps        %ymm11,%ymm0,%ymm11
+  DB  196,65,44,92,211                    ; vsubps        %ymm11,%ymm10,%ymm10
+  DB  196,193,121,110,240                 ; vmovd         %r8d,%xmm6
+  DB  196,226,125,88,246                  ; vpbroadcastd  %xmm6,%ymm6
+  DB  196,65,76,89,210                    ; vmulps        %ymm10,%ymm6,%ymm10
+  DB  196,67,125,8,218,1                  ; vroundps      $0x1,%ymm10,%ymm11
+  DB  196,65,44,92,219                    ; vsubps        %ymm11,%ymm10,%ymm11
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  184,81,140,242,66                   ; mov           $0x42f28c51,%eax
+  DB  197,249,110,232                     ; vmovd         %eax,%xmm5
+  DB  196,226,125,88,237                  ; vpbroadcastd  %xmm5,%ymm5
+  DB  196,65,84,88,210                    ; vaddps        %ymm10,%ymm5,%ymm10
+  DB  184,141,188,190,63                  ; mov           $0x3fbebc8d,%eax
+  DB  197,249,110,224                     ; vmovd         %eax,%xmm4
+  DB  196,226,125,88,228                  ; vpbroadcastd  %xmm4,%ymm4
+  DB  196,66,93,188,211                   ; vfnmadd231ps  %ymm11,%ymm4,%ymm10
+  DB  184,254,210,221,65                  ; mov           $0x41ddd2fe,%eax
+  DB  197,249,110,216                     ; vmovd         %eax,%xmm3
+  DB  196,226,125,88,219                  ; vpbroadcastd  %xmm3,%ymm3
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,226,125,88,210                  ; vpbroadcastd  %xmm2,%ymm2
+  DB  196,65,108,92,219                   ; vsubps        %ymm11,%ymm2,%ymm11
+  DB  196,65,100,94,219                   ; vdivps        %ymm11,%ymm3,%ymm11
+  DB  196,65,44,88,211                    ; vaddps        %ymm11,%ymm10,%ymm10
+  DB  197,124,16,44,36                    ; vmovups       (%rsp),%ymm13
+  DB  196,65,20,84,220                    ; vandps        %ymm12,%ymm13,%ymm11
+  DB  196,65,36,86,223                    ; vorps         %ymm15,%ymm11,%ymm11
+  DB  196,66,37,188,241                   ; vfnmadd231ps  %ymm9,%ymm11,%ymm14
+  DB  197,36,88,223                       ; vaddps        %ymm7,%ymm11,%ymm11
+  DB  196,65,124,94,219                   ; vdivps        %ymm11,%ymm0,%ymm11
+  DB  196,65,12,92,219                    ; vsubps        %ymm11,%ymm14,%ymm11
+  DB  196,65,76,89,219                    ; vmulps        %ymm11,%ymm6,%ymm11
+  DB  196,67,125,8,227,1                  ; vroundps      $0x1,%ymm11,%ymm12
+  DB  196,65,36,92,228                    ; vsubps        %ymm12,%ymm11,%ymm12
+  DB  196,65,84,88,219                    ; vaddps        %ymm11,%ymm5,%ymm11
+  DB  196,66,93,188,220                   ; vfnmadd231ps  %ymm12,%ymm4,%ymm11
+  DB  196,65,108,92,228                   ; vsubps        %ymm12,%ymm2,%ymm12
+  DB  196,65,100,94,228                   ; vdivps        %ymm12,%ymm3,%ymm12
+  DB  196,65,36,88,220                    ; vaddps        %ymm12,%ymm11,%ymm11
+  DB  196,65,20,84,192                    ; vandps        %ymm8,%ymm13,%ymm8
+  DB  196,65,60,86,199                    ; vorps         %ymm15,%ymm8,%ymm8
+  DB  196,194,61,188,201                  ; vfnmadd231ps  %ymm9,%ymm8,%ymm1
+  DB  197,188,88,255                      ; vaddps        %ymm7,%ymm8,%ymm7
+  DB  197,252,94,199                      ; vdivps        %ymm7,%ymm0,%ymm0
+  DB  197,244,92,192                      ; vsubps        %ymm0,%ymm1,%ymm0
+  DB  197,204,89,192                      ; vmulps        %ymm0,%ymm6,%ymm0
+  DB  196,227,125,8,200,1                 ; vroundps      $0x1,%ymm0,%ymm1
+  DB  197,252,92,201                      ; vsubps        %ymm1,%ymm0,%ymm1
+  DB  197,212,88,192                      ; vaddps        %ymm0,%ymm5,%ymm0
+  DB  196,226,117,172,224                 ; vfnmadd213ps  %ymm0,%ymm1,%ymm4
+  DB  197,236,92,193                      ; vsubps        %ymm1,%ymm2,%ymm0
+  DB  197,228,94,192                      ; vdivps        %ymm0,%ymm3,%ymm0
+  DB  197,220,88,192                      ; vaddps        %ymm0,%ymm4,%ymm0
+  DB  196,193,121,110,200                 ; vmovd         %r8d,%xmm1
+  DB  196,226,125,88,201                  ; vpbroadcastd  %xmm1,%ymm1
+  DB  196,193,116,89,210                  ; vmulps        %ymm10,%ymm1,%ymm2
+  DB  196,193,116,89,219                  ; vmulps        %ymm11,%ymm1,%ymm3
+  DB  197,244,89,224                      ; vmulps        %ymm0,%ymm1,%ymm4
+  DB  197,253,91,194                      ; vcvtps2dq     %ymm2,%ymm0
+  DB  197,253,91,203                      ; vcvtps2dq     %ymm3,%ymm1
+  DB  197,253,91,212                      ; vcvtps2dq     %ymm4,%ymm2
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  197,252,16,92,36,32                 ; vmovups       0x20(%rsp),%ymm3
+  DB  197,252,16,100,36,64                ; vmovups       0x40(%rsp),%ymm4
+  DB  197,252,16,108,36,96                ; vmovups       0x60(%rsp),%ymm5
+  DB  197,252,16,180,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm6
+  DB  197,252,16,188,36,160,0,0,0         ; vmovups       0xa0(%rsp),%ymm7
+  DB  72,129,196,216,0,0,0                ; add           $0xd8,%rsp
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_to_2dot2_hsw
 _sk_to_2dot2_hsw LABEL PROC
-  DB  197,252,82,192                      ; vrsqrtps      %ymm0,%ymm0
-  DB  197,124,82,192                      ; vrsqrtps      %ymm0,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,200                   ; vrsqrtps      %ymm8,%ymm9
-  DB  197,252,83,192                      ; vrcpps        %ymm0,%ymm0
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  196,65,124,83,193                   ; vrcpps        %ymm9,%ymm8
-  DB  196,193,124,89,192                  ; vmulps        %ymm8,%ymm0,%ymm0
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,193,124,95,192                  ; vmaxps        %ymm8,%ymm0,%ymm0
-  DB  197,252,82,201                      ; vrsqrtps      %ymm1,%ymm1
-  DB  197,124,82,201                      ; vrsqrtps      %ymm1,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  197,252,83,201                      ; vrcpps        %ymm1,%ymm1
-  DB  197,180,89,201                      ; vmulps        %ymm1,%ymm9,%ymm1
-  DB  196,65,124,83,202                   ; vrcpps        %ymm10,%ymm9
-  DB  196,193,116,89,201                  ; vmulps        %ymm9,%ymm1,%ymm1
-  DB  196,193,116,95,200                  ; vmaxps        %ymm8,%ymm1,%ymm1
-  DB  197,252,82,210                      ; vrsqrtps      %ymm2,%ymm2
-  DB  197,124,82,202                      ; vrsqrtps      %ymm2,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  197,252,83,210                      ; vrcpps        %ymm2,%ymm2
-  DB  197,180,89,210                      ; vmulps        %ymm2,%ymm9,%ymm2
-  DB  196,65,124,83,202                   ; vrcpps        %ymm10,%ymm9
-  DB  196,193,108,89,209                  ; vmulps        %ymm9,%ymm2,%ymm2
-  DB  196,193,108,95,208                  ; vmaxps        %ymm8,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,129,236,216,0,0,0                ; sub           $0xd8,%rsp
+  DB  197,252,17,188,36,160,0,0,0         ; vmovups       %ymm7,0xa0(%rsp)
+  DB  197,252,17,180,36,128,0,0,0         ; vmovups       %ymm6,0x80(%rsp)
+  DB  197,252,17,108,36,96                ; vmovups       %ymm5,0x60(%rsp)
+  DB  197,252,17,100,36,64                ; vmovups       %ymm4,0x40(%rsp)
+  DB  197,252,17,92,36,32                 ; vmovups       %ymm3,0x20(%rsp)
+  DB  197,124,40,225                      ; vmovaps       %ymm1,%ymm12
+  DB  65,184,46,186,232,62                ; mov           $0x3ee8ba2e,%r8d
+  DB  197,124,91,208                      ; vcvtdq2ps     %ymm0,%ymm10
+  DB  184,0,0,0,52                        ; mov           $0x34000000,%eax
+  DB  197,121,110,192                     ; vmovd         %eax,%xmm8
+  DB  196,66,125,88,216                   ; vpbroadcastd  %xmm8,%ymm11
+  DB  184,255,255,127,0                   ; mov           $0x7fffff,%eax
+  DB  197,121,110,192                     ; vmovd         %eax,%xmm8
+  DB  196,194,125,88,216                  ; vpbroadcastd  %xmm8,%ymm3
+  DB  197,254,127,28,36                   ; vmovdqu       %ymm3,(%rsp)
+  DB  197,101,219,200                     ; vpand         %ymm0,%ymm3,%ymm9
+  DB  184,0,0,0,63                        ; mov           $0x3f000000,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,98,125,88,248                   ; vpbroadcastd  %xmm0,%ymm15
+  DB  196,193,53,235,223                  ; vpor          %ymm15,%ymm9,%ymm3
+  DB  184,119,115,248,66                  ; mov           $0x42f87377,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,98,125,88,232                   ; vpbroadcastd  %xmm0,%ymm13
+  DB  196,66,37,170,213                   ; vfmsub213ps   %ymm13,%ymm11,%ymm10
+  DB  184,117,191,191,63                  ; mov           $0x3fbfbf75,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,98,125,88,200                   ; vpbroadcastd  %xmm0,%ymm9
+  DB  196,66,101,188,209                  ; vfnmadd231ps  %ymm9,%ymm3,%ymm10
+  DB  184,163,233,220,63                  ; mov           $0x3fdce9a3,%eax
+  DB  196,65,124,91,244                   ; vcvtdq2ps     %ymm12,%ymm14
+  DB  196,66,37,170,245                   ; vfmsub213ps   %ymm13,%ymm11,%ymm14
+  DB  197,252,91,202                      ; vcvtdq2ps     %ymm2,%ymm1
+  DB  197,124,40,194                      ; vmovaps       %ymm2,%ymm8
+  DB  196,194,37,170,205                  ; vfmsub213ps   %ymm13,%ymm11,%ymm1
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,226,125,88,192                  ; vpbroadcastd  %xmm0,%ymm0
+  DB  184,249,68,180,62                   ; mov           $0x3eb444f9,%eax
+  DB  197,249,110,248                     ; vmovd         %eax,%xmm7
+  DB  196,226,125,88,255                  ; vpbroadcastd  %xmm7,%ymm7
+  DB  197,100,88,223                      ; vaddps        %ymm7,%ymm3,%ymm11
+  DB  196,65,124,94,219                   ; vdivps        %ymm11,%ymm0,%ymm11
+  DB  196,65,44,92,211                    ; vsubps        %ymm11,%ymm10,%ymm10
+  DB  196,193,121,110,240                 ; vmovd         %r8d,%xmm6
+  DB  196,226,125,88,246                  ; vpbroadcastd  %xmm6,%ymm6
+  DB  196,65,76,89,210                    ; vmulps        %ymm10,%ymm6,%ymm10
+  DB  196,67,125,8,218,1                  ; vroundps      $0x1,%ymm10,%ymm11
+  DB  196,65,44,92,219                    ; vsubps        %ymm11,%ymm10,%ymm11
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  184,81,140,242,66                   ; mov           $0x42f28c51,%eax
+  DB  197,249,110,232                     ; vmovd         %eax,%xmm5
+  DB  196,226,125,88,237                  ; vpbroadcastd  %xmm5,%ymm5
+  DB  196,65,84,88,210                    ; vaddps        %ymm10,%ymm5,%ymm10
+  DB  184,141,188,190,63                  ; mov           $0x3fbebc8d,%eax
+  DB  197,249,110,224                     ; vmovd         %eax,%xmm4
+  DB  196,226,125,88,228                  ; vpbroadcastd  %xmm4,%ymm4
+  DB  196,66,93,188,211                   ; vfnmadd231ps  %ymm11,%ymm4,%ymm10
+  DB  184,254,210,221,65                  ; mov           $0x41ddd2fe,%eax
+  DB  197,249,110,216                     ; vmovd         %eax,%xmm3
+  DB  196,226,125,88,219                  ; vpbroadcastd  %xmm3,%ymm3
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,226,125,88,210                  ; vpbroadcastd  %xmm2,%ymm2
+  DB  196,65,108,92,219                   ; vsubps        %ymm11,%ymm2,%ymm11
+  DB  196,65,100,94,219                   ; vdivps        %ymm11,%ymm3,%ymm11
+  DB  196,65,44,88,211                    ; vaddps        %ymm11,%ymm10,%ymm10
+  DB  197,124,16,44,36                    ; vmovups       (%rsp),%ymm13
+  DB  196,65,20,84,220                    ; vandps        %ymm12,%ymm13,%ymm11
+  DB  196,65,36,86,223                    ; vorps         %ymm15,%ymm11,%ymm11
+  DB  196,66,37,188,241                   ; vfnmadd231ps  %ymm9,%ymm11,%ymm14
+  DB  197,36,88,223                       ; vaddps        %ymm7,%ymm11,%ymm11
+  DB  196,65,124,94,219                   ; vdivps        %ymm11,%ymm0,%ymm11
+  DB  196,65,12,92,219                    ; vsubps        %ymm11,%ymm14,%ymm11
+  DB  196,65,76,89,219                    ; vmulps        %ymm11,%ymm6,%ymm11
+  DB  196,67,125,8,227,1                  ; vroundps      $0x1,%ymm11,%ymm12
+  DB  196,65,36,92,228                    ; vsubps        %ymm12,%ymm11,%ymm12
+  DB  196,65,84,88,219                    ; vaddps        %ymm11,%ymm5,%ymm11
+  DB  196,66,93,188,220                   ; vfnmadd231ps  %ymm12,%ymm4,%ymm11
+  DB  196,65,108,92,228                   ; vsubps        %ymm12,%ymm2,%ymm12
+  DB  196,65,100,94,228                   ; vdivps        %ymm12,%ymm3,%ymm12
+  DB  196,65,36,88,220                    ; vaddps        %ymm12,%ymm11,%ymm11
+  DB  196,65,20,84,192                    ; vandps        %ymm8,%ymm13,%ymm8
+  DB  196,65,60,86,199                    ; vorps         %ymm15,%ymm8,%ymm8
+  DB  196,194,61,188,201                  ; vfnmadd231ps  %ymm9,%ymm8,%ymm1
+  DB  197,188,88,255                      ; vaddps        %ymm7,%ymm8,%ymm7
+  DB  197,252,94,199                      ; vdivps        %ymm7,%ymm0,%ymm0
+  DB  197,244,92,192                      ; vsubps        %ymm0,%ymm1,%ymm0
+  DB  197,204,89,192                      ; vmulps        %ymm0,%ymm6,%ymm0
+  DB  196,227,125,8,200,1                 ; vroundps      $0x1,%ymm0,%ymm1
+  DB  197,252,92,201                      ; vsubps        %ymm1,%ymm0,%ymm1
+  DB  197,212,88,192                      ; vaddps        %ymm0,%ymm5,%ymm0
+  DB  196,226,117,172,224                 ; vfnmadd213ps  %ymm0,%ymm1,%ymm4
+  DB  197,236,92,193                      ; vsubps        %ymm1,%ymm2,%ymm0
+  DB  197,228,94,192                      ; vdivps        %ymm0,%ymm3,%ymm0
+  DB  197,220,88,192                      ; vaddps        %ymm0,%ymm4,%ymm0
+  DB  196,193,121,110,200                 ; vmovd         %r8d,%xmm1
+  DB  196,226,125,88,201                  ; vpbroadcastd  %xmm1,%ymm1
+  DB  196,193,116,89,210                  ; vmulps        %ymm10,%ymm1,%ymm2
+  DB  196,193,116,89,219                  ; vmulps        %ymm11,%ymm1,%ymm3
+  DB  197,244,89,224                      ; vmulps        %ymm0,%ymm1,%ymm4
+  DB  197,253,91,194                      ; vcvtps2dq     %ymm2,%ymm0
+  DB  197,253,91,203                      ; vcvtps2dq     %ymm3,%ymm1
+  DB  197,253,91,212                      ; vcvtps2dq     %ymm4,%ymm2
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  197,252,16,92,36,32                 ; vmovups       0x20(%rsp),%ymm3
+  DB  197,252,16,100,36,64                ; vmovups       0x40(%rsp),%ymm4
+  DB  197,252,16,108,36,96                ; vmovups       0x60(%rsp),%ymm5
+  DB  197,252,16,180,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm6
+  DB  197,252,16,188,36,160,0,0,0         ; vmovups       0xa0(%rsp),%ymm7
+  DB  72,129,196,216,0,0,0                ; add           $0xd8,%rsp
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_rgb_to_hsl_hsw
@@ -1204,7 +1351,7 @@ _sk_scale_u8_hsw LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,56                              ; jne           126b <_sk_scale_u8_hsw+0x48>
+  DB  117,56                              ; jne           155f <_sk_scale_u8_hsw+0x48>
   DB  197,122,126,0                       ; vmovq         (%rax),%xmm8
   DB  196,66,125,49,192                   ; vpmovzxbd     %xmm8,%ymm8
   DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
@@ -1228,9 +1375,9 @@ _sk_scale_u8_hsw LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           1273 <_sk_scale_u8_hsw+0x50>
+  DB  117,234                             ; jne           1567 <_sk_scale_u8_hsw+0x50>
   DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  235,167                             ; jmp           1237 <_sk_scale_u8_hsw+0x14>
+  DB  235,167                             ; jmp           152b <_sk_scale_u8_hsw+0x14>
 
 PUBLIC _sk_lerp_1_float_hsw
 _sk_lerp_1_float_hsw LABEL PROC
@@ -1254,7 +1401,7 @@ _sk_lerp_u8_hsw LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,76                              ; jne           131b <_sk_lerp_u8_hsw+0x5c>
+  DB  117,76                              ; jne           160f <_sk_lerp_u8_hsw+0x5c>
   DB  197,122,126,0                       ; vmovq         (%rax),%xmm8
   DB  196,66,125,49,192                   ; vpmovzxbd     %xmm8,%ymm8
   DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
@@ -1282,16 +1429,16 @@ _sk_lerp_u8_hsw LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           1323 <_sk_lerp_u8_hsw+0x64>
+  DB  117,234                             ; jne           1617 <_sk_lerp_u8_hsw+0x64>
   DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  235,147                             ; jmp           12d3 <_sk_lerp_u8_hsw+0x14>
+  DB  235,147                             ; jmp           15c7 <_sk_lerp_u8_hsw+0x14>
 
 PUBLIC _sk_lerp_565_hsw
 _sk_lerp_565_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,179,0,0,0                    ; jne           1401 <_sk_lerp_565_hsw+0xc1>
+  DB  15,133,179,0,0,0                    ; jne           16f5 <_sk_lerp_565_hsw+0xc1>
   DB  196,193,122,111,28,122              ; vmovdqu       (%r10,%rdi,2),%xmm3
   DB  196,98,125,51,195                   ; vpmovzxwd     %xmm3,%ymm8
   DB  184,0,248,0,0                       ; mov           $0xf800,%eax
@@ -1337,9 +1484,9 @@ _sk_lerp_565_hsw LABEL PROC
   DB  197,225,239,219                     ; vpxor         %xmm3,%xmm3,%xmm3
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,59,255,255,255               ; ja            1354 <_sk_lerp_565_hsw+0x14>
+  DB  15,135,59,255,255,255               ; ja            1648 <_sk_lerp_565_hsw+0x14>
   DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  76,141,13,76,0,0,0                  ; lea           0x4c(%rip),%r9        # 1470 <_sk_lerp_565_hsw+0x130>
+  DB  76,141,13,76,0,0,0                  ; lea           0x4c(%rip),%r9        # 1764 <_sk_lerp_565_hsw+0x130>
   DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
   DB  76,1,200                            ; add           %r9,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -1351,13 +1498,13 @@ _sk_lerp_565_hsw LABEL PROC
   DB  196,193,97,196,92,122,4,2           ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
   DB  196,193,97,196,92,122,2,1           ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
   DB  196,193,97,196,28,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
-  DB  233,231,254,255,255                 ; jmpq          1354 <_sk_lerp_565_hsw+0x14>
+  DB  233,231,254,255,255                 ; jmpq          1648 <_sk_lerp_565_hsw+0x14>
   DB  15,31,0                             ; nopl          (%rax)
   DB  241                                 ; icebp
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
-  DB  233,255,255,255,225                 ; jmpq          ffffffffe2001478 <_sk_callback_hsw+0xffffffffe1ffd152>
+  DB  233,255,255,255,225                 ; jmpq          ffffffffe200176c <_sk_callback_hsw+0xffffffffe1ffd152>
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -1382,7 +1529,7 @@ _sk_load_tables_hsw LABEL PROC
   DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
   DB  76,3,8                              ; add           (%rax),%r9
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,121                             ; jne           151a <_sk_load_tables_hsw+0x8e>
+  DB  117,121                             ; jne           180e <_sk_load_tables_hsw+0x8e>
   DB  196,193,126,111,25                  ; vmovdqu       (%r9),%ymm3
   DB  185,255,0,0,0                       ; mov           $0xff,%ecx
   DB  197,249,110,193                     ; vmovd         %ecx,%xmm0
@@ -1418,7 +1565,7 @@ _sk_load_tables_hsw LABEL PROC
   DB  196,193,249,110,194                 ; vmovq         %r10,%xmm0
   DB  196,226,125,33,192                  ; vpmovsxbd     %xmm0,%ymm0
   DB  196,194,125,140,25                  ; vpmaskmovd    (%r9),%ymm0,%ymm3
-  DB  233,99,255,255,255                  ; jmpq          14a6 <_sk_load_tables_hsw+0x1a>
+  DB  233,99,255,255,255                  ; jmpq          179a <_sk_load_tables_hsw+0x1a>
 
 PUBLIC _sk_load_tables_u16_be_hsw
 _sk_load_tables_u16_be_hsw LABEL PROC
@@ -1426,7 +1573,7 @@ _sk_load_tables_u16_be_hsw LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,216,0,0,0                    ; jne           1631 <_sk_load_tables_u16_be_hsw+0xee>
+  DB  15,133,216,0,0,0                    ; jne           1925 <_sk_load_tables_u16_be_hsw+0xee>
   DB  196,1,121,16,4,72                   ; vmovupd       (%r8,%r9,2),%xmm8
   DB  196,129,121,16,84,72,16             ; vmovupd       0x10(%r8,%r9,2),%xmm2
   DB  196,129,121,16,92,72,32             ; vmovupd       0x20(%r8,%r9,2),%xmm3
@@ -1475,29 +1622,29 @@ _sk_load_tables_u16_be_hsw LABEL PROC
   DB  196,1,123,16,4,72                   ; vmovsd        (%r8,%r9,2),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,85                              ; je            1697 <_sk_load_tables_u16_be_hsw+0x154>
+  DB  116,85                              ; je            198b <_sk_load_tables_u16_be_hsw+0x154>
   DB  196,1,57,22,68,72,8                 ; vmovhpd       0x8(%r8,%r9,2),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,72                              ; jb            1697 <_sk_load_tables_u16_be_hsw+0x154>
+  DB  114,72                              ; jb            198b <_sk_load_tables_u16_be_hsw+0x154>
   DB  196,129,123,16,84,72,16             ; vmovsd        0x10(%r8,%r9,2),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,72                              ; je            16a4 <_sk_load_tables_u16_be_hsw+0x161>
+  DB  116,72                              ; je            1998 <_sk_load_tables_u16_be_hsw+0x161>
   DB  196,129,105,22,84,72,24             ; vmovhpd       0x18(%r8,%r9,2),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,59                              ; jb            16a4 <_sk_load_tables_u16_be_hsw+0x161>
+  DB  114,59                              ; jb            1998 <_sk_load_tables_u16_be_hsw+0x161>
   DB  196,129,123,16,92,72,32             ; vmovsd        0x20(%r8,%r9,2),%xmm3
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,250,254,255,255              ; je            1574 <_sk_load_tables_u16_be_hsw+0x31>
+  DB  15,132,250,254,255,255              ; je            1868 <_sk_load_tables_u16_be_hsw+0x31>
   DB  196,129,97,22,92,72,40              ; vmovhpd       0x28(%r8,%r9,2),%xmm3,%xmm3
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,233,254,255,255              ; jb            1574 <_sk_load_tables_u16_be_hsw+0x31>
+  DB  15,130,233,254,255,255              ; jb            1868 <_sk_load_tables_u16_be_hsw+0x31>
   DB  196,1,122,126,76,72,48              ; vmovq         0x30(%r8,%r9,2),%xmm9
-  DB  233,221,254,255,255                 ; jmpq          1574 <_sk_load_tables_u16_be_hsw+0x31>
+  DB  233,221,254,255,255                 ; jmpq          1868 <_sk_load_tables_u16_be_hsw+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,208,254,255,255                 ; jmpq          1574 <_sk_load_tables_u16_be_hsw+0x31>
+  DB  233,208,254,255,255                 ; jmpq          1868 <_sk_load_tables_u16_be_hsw+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,199,254,255,255                 ; jmpq          1574 <_sk_load_tables_u16_be_hsw+0x31>
+  DB  233,199,254,255,255                 ; jmpq          1868 <_sk_load_tables_u16_be_hsw+0x31>
 
 PUBLIC _sk_load_tables_rgb_u16_be_hsw
 _sk_load_tables_rgb_u16_be_hsw LABEL PROC
@@ -1505,7 +1652,7 @@ _sk_load_tables_rgb_u16_be_hsw LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  76,141,12,127                       ; lea           (%rdi,%rdi,2),%r9
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,207,0,0,0                    ; jne           178e <_sk_load_tables_rgb_u16_be_hsw+0xe1>
+  DB  15,133,207,0,0,0                    ; jne           1a82 <_sk_load_tables_rgb_u16_be_hsw+0xe1>
   DB  196,129,122,111,4,72                ; vmovdqu       (%r8,%r9,2),%xmm0
   DB  196,129,122,111,84,72,12            ; vmovdqu       0xc(%r8,%r9,2),%xmm2
   DB  196,129,122,111,76,72,24            ; vmovdqu       0x18(%r8,%r9,2),%xmm1
@@ -1552,36 +1699,36 @@ _sk_load_tables_rgb_u16_be_hsw LABEL PROC
   DB  196,129,121,110,4,72                ; vmovd         (%r8,%r9,2),%xmm0
   DB  196,129,121,196,68,72,4,2           ; vpinsrw       $0x2,0x4(%r8,%r9,2),%xmm0,%xmm0
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  117,5                               ; jne           17a7 <_sk_load_tables_rgb_u16_be_hsw+0xfa>
-  DB  233,76,255,255,255                  ; jmpq          16f3 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  DB  117,5                               ; jne           1a9b <_sk_load_tables_rgb_u16_be_hsw+0xfa>
+  DB  233,76,255,255,255                  ; jmpq          19e7 <_sk_load_tables_rgb_u16_be_hsw+0x46>
   DB  196,129,121,110,76,72,6             ; vmovd         0x6(%r8,%r9,2),%xmm1
   DB  196,1,113,196,68,72,10,2            ; vpinsrw       $0x2,0xa(%r8,%r9,2),%xmm1,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,26                              ; jb            17d6 <_sk_load_tables_rgb_u16_be_hsw+0x129>
+  DB  114,26                              ; jb            1aca <_sk_load_tables_rgb_u16_be_hsw+0x129>
   DB  196,129,121,110,76,72,12            ; vmovd         0xc(%r8,%r9,2),%xmm1
   DB  196,129,113,196,84,72,16,2          ; vpinsrw       $0x2,0x10(%r8,%r9,2),%xmm1,%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  117,10                              ; jne           17db <_sk_load_tables_rgb_u16_be_hsw+0x12e>
-  DB  233,29,255,255,255                  ; jmpq          16f3 <_sk_load_tables_rgb_u16_be_hsw+0x46>
-  DB  233,24,255,255,255                  ; jmpq          16f3 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  DB  117,10                              ; jne           1acf <_sk_load_tables_rgb_u16_be_hsw+0x12e>
+  DB  233,29,255,255,255                  ; jmpq          19e7 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  DB  233,24,255,255,255                  ; jmpq          19e7 <_sk_load_tables_rgb_u16_be_hsw+0x46>
   DB  196,129,121,110,76,72,18            ; vmovd         0x12(%r8,%r9,2),%xmm1
   DB  196,1,113,196,76,72,22,2            ; vpinsrw       $0x2,0x16(%r8,%r9,2),%xmm1,%xmm9
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,26                              ; jb            180a <_sk_load_tables_rgb_u16_be_hsw+0x15d>
+  DB  114,26                              ; jb            1afe <_sk_load_tables_rgb_u16_be_hsw+0x15d>
   DB  196,129,121,110,76,72,24            ; vmovd         0x18(%r8,%r9,2),%xmm1
   DB  196,129,113,196,76,72,28,2          ; vpinsrw       $0x2,0x1c(%r8,%r9,2),%xmm1,%xmm1
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  117,10                              ; jne           180f <_sk_load_tables_rgb_u16_be_hsw+0x162>
-  DB  233,233,254,255,255                 ; jmpq          16f3 <_sk_load_tables_rgb_u16_be_hsw+0x46>
-  DB  233,228,254,255,255                 ; jmpq          16f3 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  DB  117,10                              ; jne           1b03 <_sk_load_tables_rgb_u16_be_hsw+0x162>
+  DB  233,233,254,255,255                 ; jmpq          19e7 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  DB  233,228,254,255,255                 ; jmpq          19e7 <_sk_load_tables_rgb_u16_be_hsw+0x46>
   DB  196,129,121,110,92,72,30            ; vmovd         0x1e(%r8,%r9,2),%xmm3
   DB  196,1,97,196,92,72,34,2             ; vpinsrw       $0x2,0x22(%r8,%r9,2),%xmm3,%xmm11
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,20                              ; jb            1838 <_sk_load_tables_rgb_u16_be_hsw+0x18b>
+  DB  114,20                              ; jb            1b2c <_sk_load_tables_rgb_u16_be_hsw+0x18b>
   DB  196,129,121,110,92,72,36            ; vmovd         0x24(%r8,%r9,2),%xmm3
   DB  196,129,97,196,92,72,40,2           ; vpinsrw       $0x2,0x28(%r8,%r9,2),%xmm3,%xmm3
-  DB  233,187,254,255,255                 ; jmpq          16f3 <_sk_load_tables_rgb_u16_be_hsw+0x46>
-  DB  233,182,254,255,255                 ; jmpq          16f3 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  DB  233,187,254,255,255                 ; jmpq          19e7 <_sk_load_tables_rgb_u16_be_hsw+0x46>
+  DB  233,182,254,255,255                 ; jmpq          19e7 <_sk_load_tables_rgb_u16_be_hsw+0x46>
 
 PUBLIC _sk_byte_tables_hsw
 _sk_byte_tables_hsw LABEL PROC
@@ -2320,7 +2467,7 @@ _sk_load_a8_hsw LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,50                              ; jne           2535 <_sk_load_a8_hsw+0x42>
+  DB  117,50                              ; jne           2829 <_sk_load_a8_hsw+0x42>
   DB  197,250,126,0                       ; vmovq         (%rax),%xmm0
   DB  196,226,125,49,192                  ; vpmovzxbd     %xmm0,%ymm0
   DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
@@ -2343,9 +2490,9 @@ _sk_load_a8_hsw LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           253d <_sk_load_a8_hsw+0x4a>
+  DB  117,234                             ; jne           2831 <_sk_load_a8_hsw+0x4a>
   DB  196,193,249,110,193                 ; vmovq         %r9,%xmm0
-  DB  235,173                             ; jmp           2507 <_sk_load_a8_hsw+0x14>
+  DB  235,173                             ; jmp           27fb <_sk_load_a8_hsw+0x14>
 
 PUBLIC _sk_gather_a8_hsw
 _sk_gather_a8_hsw LABEL PROC
@@ -2416,7 +2563,7 @@ _sk_store_a8_hsw LABEL PROC
   DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
   DB  196,65,57,103,192                   ; vpackuswb     %xmm8,%xmm8,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           2672 <_sk_store_a8_hsw+0x3b>
+  DB  117,10                              ; jne           2966 <_sk_store_a8_hsw+0x3b>
   DB  196,65,123,17,4,57                  ; vmovsd        %xmm8,(%r9,%rdi,1)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -2424,10 +2571,10 @@ _sk_store_a8_hsw LABEL PROC
   DB  65,128,224,7                        ; and           $0x7,%r8b
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            266e <_sk_store_a8_hsw+0x37>
+  DB  119,236                             ; ja            2962 <_sk_store_a8_hsw+0x37>
   DB  196,66,121,48,192                   ; vpmovzxbw     %xmm8,%xmm8
   DB  65,15,182,192                       ; movzbl        %r8b,%eax
-  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 26d4 <_sk_store_a8_hsw+0x9d>
+  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 29c8 <_sk_store_a8_hsw+0x9d>
   DB  73,99,4,128                         ; movslq        (%r8,%rax,4),%rax
   DB  76,1,192                            ; add           %r8,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -2438,7 +2585,7 @@ _sk_store_a8_hsw LABEL PROC
   DB  196,67,121,20,68,57,2,4             ; vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
   DB  196,67,121,20,68,57,1,2             ; vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
   DB  196,67,121,20,4,57,0                ; vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  DB  235,154                             ; jmp           266e <_sk_store_a8_hsw+0x37>
+  DB  235,154                             ; jmp           2962 <_sk_store_a8_hsw+0x37>
   DB  247,255                             ; idiv          %edi
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -2469,7 +2616,7 @@ _sk_load_g8_hsw LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,60                              ; jne           273c <_sk_load_g8_hsw+0x4c>
+  DB  117,60                              ; jne           2a30 <_sk_load_g8_hsw+0x4c>
   DB  197,250,126,0                       ; vmovq         (%rax),%xmm0
   DB  196,226,125,49,192                  ; vpmovzxbd     %xmm0,%ymm0
   DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
@@ -2494,9 +2641,9 @@ _sk_load_g8_hsw LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           2744 <_sk_load_g8_hsw+0x54>
+  DB  117,234                             ; jne           2a38 <_sk_load_g8_hsw+0x54>
   DB  196,193,249,110,193                 ; vmovq         %r9,%xmm0
-  DB  235,163                             ; jmp           2704 <_sk_load_g8_hsw+0x14>
+  DB  235,163                             ; jmp           29f8 <_sk_load_g8_hsw+0x14>
 
 PUBLIC _sk_gather_g8_hsw
 _sk_gather_g8_hsw LABEL PROC
@@ -2561,9 +2708,9 @@ _sk_gather_i8_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  73,137,192                          ; mov           %rax,%r8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  116,5                               ; je            2857 <_sk_gather_i8_hsw+0xf>
+  DB  116,5                               ; je            2b4b <_sk_gather_i8_hsw+0xf>
   DB  76,137,192                          ; mov           %r8,%rax
-  DB  235,2                               ; jmp           2859 <_sk_gather_i8_hsw+0x11>
+  DB  235,2                               ; jmp           2b4d <_sk_gather_i8_hsw+0x11>
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  65,87                               ; push          %r15
   DB  65,86                               ; push          %r14
@@ -2634,7 +2781,7 @@ _sk_load_565_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,149,0,0,0                    ; jne           2a0b <_sk_load_565_hsw+0xa3>
+  DB  15,133,149,0,0,0                    ; jne           2cff <_sk_load_565_hsw+0xa3>
   DB  196,193,122,111,4,122               ; vmovdqu       (%r10,%rdi,2),%xmm0
   DB  196,226,125,51,208                  ; vpmovzxwd     %xmm0,%ymm2
   DB  184,0,248,0,0                       ; mov           $0xf800,%eax
@@ -2674,9 +2821,9 @@ _sk_load_565_hsw LABEL PROC
   DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,89,255,255,255               ; ja            297c <_sk_load_565_hsw+0x14>
+  DB  15,135,89,255,255,255               ; ja            2c70 <_sk_load_565_hsw+0x14>
   DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  76,141,13,74,0,0,0                  ; lea           0x4a(%rip),%r9        # 2a78 <_sk_load_565_hsw+0x110>
+  DB  76,141,13,74,0,0,0                  ; lea           0x4a(%rip),%r9        # 2d6c <_sk_load_565_hsw+0x110>
   DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
   DB  76,1,200                            ; add           %r9,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -2688,12 +2835,12 @@ _sk_load_565_hsw LABEL PROC
   DB  196,193,121,196,68,122,4,2          ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,68,122,2,1          ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,4,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  DB  233,5,255,255,255                   ; jmpq          297c <_sk_load_565_hsw+0x14>
+  DB  233,5,255,255,255                   ; jmpq          2c70 <_sk_load_565_hsw+0x14>
   DB  144                                 ; nop
   DB  243,255                             ; repz          (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
-  DB  235,255                             ; jmp           2a7d <_sk_load_565_hsw+0x115>
+  DB  235,255                             ; jmp           2d71 <_sk_load_565_hsw+0x115>
   DB  255                                 ; (bad)
   DB  255,227                             ; jmpq          *%rbx
   DB  255                                 ; (bad)
@@ -2816,7 +2963,7 @@ _sk_store_565_hsw LABEL PROC
   DB  196,67,125,57,193,1                 ; vextracti128  $0x1,%ymm8,%xmm9
   DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           2c43 <_sk_store_565_hsw+0x6c>
+  DB  117,10                              ; jne           2f37 <_sk_store_565_hsw+0x6c>
   DB  196,65,122,127,4,121                ; vmovdqu       %xmm8,(%r9,%rdi,2)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -2824,9 +2971,9 @@ _sk_store_565_hsw LABEL PROC
   DB  65,128,224,7                        ; and           $0x7,%r8b
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            2c3f <_sk_store_565_hsw+0x68>
+  DB  119,236                             ; ja            2f33 <_sk_store_565_hsw+0x68>
   DB  65,15,182,192                       ; movzbl        %r8b,%eax
-  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 2ca0 <_sk_store_565_hsw+0xc9>
+  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 2f94 <_sk_store_565_hsw+0xc9>
   DB  73,99,4,128                         ; movslq        (%r8,%rax,4),%rax
   DB  76,1,192                            ; add           %r8,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -2837,7 +2984,7 @@ _sk_store_565_hsw LABEL PROC
   DB  196,67,121,21,68,121,4,2            ; vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   DB  196,67,121,21,68,121,2,1            ; vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   DB  196,67,121,21,4,121,0               ; vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  DB  235,159                             ; jmp           2c3f <_sk_store_565_hsw+0x68>
+  DB  235,159                             ; jmp           2f33 <_sk_store_565_hsw+0x68>
   DB  247,255                             ; idiv          %edi
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -2866,7 +3013,7 @@ _sk_load_4444_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,179,0,0,0                    ; jne           2d7d <_sk_load_4444_hsw+0xc1>
+  DB  15,133,179,0,0,0                    ; jne           3071 <_sk_load_4444_hsw+0xc1>
   DB  196,193,122,111,4,122               ; vmovdqu       (%r10,%rdi,2),%xmm0
   DB  196,98,125,51,200                   ; vpmovzxwd     %xmm0,%ymm9
   DB  184,0,240,0,0                       ; mov           $0xf000,%eax
@@ -2912,9 +3059,9 @@ _sk_load_4444_hsw LABEL PROC
   DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,59,255,255,255               ; ja            2cd0 <_sk_load_4444_hsw+0x14>
+  DB  15,135,59,255,255,255               ; ja            2fc4 <_sk_load_4444_hsw+0x14>
   DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  76,141,13,76,0,0,0                  ; lea           0x4c(%rip),%r9        # 2dec <_sk_load_4444_hsw+0x130>
+  DB  76,141,13,76,0,0,0                  ; lea           0x4c(%rip),%r9        # 30e0 <_sk_load_4444_hsw+0x130>
   DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
   DB  76,1,200                            ; add           %r9,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -2926,13 +3073,13 @@ _sk_load_4444_hsw LABEL PROC
   DB  196,193,121,196,68,122,4,2          ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,68,122,2,1          ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,4,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  DB  233,231,254,255,255                 ; jmpq          2cd0 <_sk_load_4444_hsw+0x14>
+  DB  233,231,254,255,255                 ; jmpq          2fc4 <_sk_load_4444_hsw+0x14>
   DB  15,31,0                             ; nopl          (%rax)
   DB  241                                 ; icebp
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
-  DB  233,255,255,255,225                 ; jmpq          ffffffffe2002df4 <_sk_callback_hsw+0xffffffffe1ffeace>
+  DB  233,255,255,255,225                 ; jmpq          ffffffffe20030e8 <_sk_callback_hsw+0xffffffffe1ffeace>
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -3060,7 +3207,7 @@ _sk_store_4444_hsw LABEL PROC
   DB  196,67,125,57,193,1                 ; vextracti128  $0x1,%ymm8,%xmm9
   DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           2fdb <_sk_store_4444_hsw+0x72>
+  DB  117,10                              ; jne           32cf <_sk_store_4444_hsw+0x72>
   DB  196,65,122,127,4,121                ; vmovdqu       %xmm8,(%r9,%rdi,2)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -3068,9 +3215,9 @@ _sk_store_4444_hsw LABEL PROC
   DB  65,128,224,7                        ; and           $0x7,%r8b
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            2fd7 <_sk_store_4444_hsw+0x6e>
+  DB  119,236                             ; ja            32cb <_sk_store_4444_hsw+0x6e>
   DB  65,15,182,192                       ; movzbl        %r8b,%eax
-  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 3038 <_sk_store_4444_hsw+0xcf>
+  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 332c <_sk_store_4444_hsw+0xcf>
   DB  73,99,4,128                         ; movslq        (%r8,%rax,4),%rax
   DB  76,1,192                            ; add           %r8,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -3081,7 +3228,7 @@ _sk_store_4444_hsw LABEL PROC
   DB  196,67,121,21,68,121,4,2            ; vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   DB  196,67,121,21,68,121,2,1            ; vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   DB  196,67,121,21,4,121,0               ; vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  DB  235,159                             ; jmp           2fd7 <_sk_store_4444_hsw+0x6e>
+  DB  235,159                             ; jmp           32cb <_sk_store_4444_hsw+0x6e>
   DB  247,255                             ; idiv          %edi
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -3112,7 +3259,7 @@ _sk_load_8888_hsw LABEL PROC
   DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
   DB  76,3,8                              ; add           (%rax),%r9
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,104                             ; jne           30d1 <_sk_load_8888_hsw+0x7d>
+  DB  117,104                             ; jne           33c5 <_sk_load_8888_hsw+0x7d>
   DB  196,193,126,111,25                  ; vmovdqu       (%r9),%ymm3
   DB  184,255,0,0,0                       ; mov           $0xff,%eax
   DB  197,249,110,192                     ; vmovd         %eax,%xmm0
@@ -3145,7 +3292,7 @@ _sk_load_8888_hsw LABEL PROC
   DB  196,225,249,110,192                 ; vmovq         %rax,%xmm0
   DB  196,226,125,33,192                  ; vpmovsxbd     %xmm0,%ymm0
   DB  196,194,125,140,25                  ; vpmaskmovd    (%r9),%ymm0,%ymm3
-  DB  233,116,255,255,255                 ; jmpq          306e <_sk_load_8888_hsw+0x1a>
+  DB  233,116,255,255,255                 ; jmpq          3362 <_sk_load_8888_hsw+0x1a>
 
 PUBLIC _sk_gather_8888_hsw
 _sk_gather_8888_hsw LABEL PROC
@@ -3205,7 +3352,7 @@ _sk_store_8888_hsw LABEL PROC
   DB  196,65,45,235,192                   ; vpor          %ymm8,%ymm10,%ymm8
   DB  196,65,53,235,192                   ; vpor          %ymm8,%ymm9,%ymm8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,12                              ; jne           31f4 <_sk_store_8888_hsw+0x74>
+  DB  117,12                              ; jne           34e8 <_sk_store_8888_hsw+0x74>
   DB  196,65,126,127,1                    ; vmovdqu       %ymm8,(%r9)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,137,193                          ; mov           %r8,%rcx
@@ -3218,14 +3365,14 @@ _sk_store_8888_hsw LABEL PROC
   DB  196,97,249,110,200                  ; vmovq         %rax,%xmm9
   DB  196,66,125,33,201                   ; vpmovsxbd     %xmm9,%ymm9
   DB  196,66,53,142,1                     ; vpmaskmovd    %ymm8,%ymm9,(%r9)
-  DB  235,211                             ; jmp           31ed <_sk_store_8888_hsw+0x6d>
+  DB  235,211                             ; jmp           34e1 <_sk_store_8888_hsw+0x6d>
 
 PUBLIC _sk_load_f16_hsw
 _sk_load_f16_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,97                              ; jne           3285 <_sk_load_f16_hsw+0x6b>
+  DB  117,97                              ; jne           3579 <_sk_load_f16_hsw+0x6b>
   DB  197,121,16,4,248                    ; vmovupd       (%rax,%rdi,8),%xmm8
   DB  197,249,16,84,248,16                ; vmovupd       0x10(%rax,%rdi,8),%xmm2
   DB  197,249,16,92,248,32                ; vmovupd       0x20(%rax,%rdi,8),%xmm3
@@ -3251,29 +3398,29 @@ _sk_load_f16_hsw LABEL PROC
   DB  197,123,16,4,248                    ; vmovsd        (%rax,%rdi,8),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,79                              ; je            32e4 <_sk_load_f16_hsw+0xca>
+  DB  116,79                              ; je            35d8 <_sk_load_f16_hsw+0xca>
   DB  197,57,22,68,248,8                  ; vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,67                              ; jb            32e4 <_sk_load_f16_hsw+0xca>
+  DB  114,67                              ; jb            35d8 <_sk_load_f16_hsw+0xca>
   DB  197,251,16,84,248,16                ; vmovsd        0x10(%rax,%rdi,8),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,68                              ; je            32f1 <_sk_load_f16_hsw+0xd7>
+  DB  116,68                              ; je            35e5 <_sk_load_f16_hsw+0xd7>
   DB  197,233,22,84,248,24                ; vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,56                              ; jb            32f1 <_sk_load_f16_hsw+0xd7>
+  DB  114,56                              ; jb            35e5 <_sk_load_f16_hsw+0xd7>
   DB  197,251,16,92,248,32                ; vmovsd        0x20(%rax,%rdi,8),%xmm3
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,114,255,255,255              ; je            323b <_sk_load_f16_hsw+0x21>
+  DB  15,132,114,255,255,255              ; je            352f <_sk_load_f16_hsw+0x21>
   DB  197,225,22,92,248,40                ; vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,98,255,255,255               ; jb            323b <_sk_load_f16_hsw+0x21>
+  DB  15,130,98,255,255,255               ; jb            352f <_sk_load_f16_hsw+0x21>
   DB  197,122,126,76,248,48               ; vmovq         0x30(%rax,%rdi,8),%xmm9
-  DB  233,87,255,255,255                  ; jmpq          323b <_sk_load_f16_hsw+0x21>
+  DB  233,87,255,255,255                  ; jmpq          352f <_sk_load_f16_hsw+0x21>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,74,255,255,255                  ; jmpq          323b <_sk_load_f16_hsw+0x21>
+  DB  233,74,255,255,255                  ; jmpq          352f <_sk_load_f16_hsw+0x21>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,65,255,255,255                  ; jmpq          323b <_sk_load_f16_hsw+0x21>
+  DB  233,65,255,255,255                  ; jmpq          352f <_sk_load_f16_hsw+0x21>
 
 PUBLIC _sk_gather_f16_hsw
 _sk_gather_f16_hsw LABEL PROC
@@ -3327,7 +3474,7 @@ _sk_store_f16_hsw LABEL PROC
   DB  196,65,57,98,205                    ; vpunpckldq    %xmm13,%xmm8,%xmm9
   DB  196,65,57,106,197                   ; vpunpckhdq    %xmm13,%xmm8,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,27                              ; jne           33e9 <_sk_store_f16_hsw+0x65>
+  DB  117,27                              ; jne           36dd <_sk_store_f16_hsw+0x65>
   DB  197,120,17,28,248                   ; vmovups       %xmm11,(%rax,%rdi,8)
   DB  197,120,17,84,248,16                ; vmovups       %xmm10,0x10(%rax,%rdi,8)
   DB  197,120,17,76,248,32                ; vmovups       %xmm9,0x20(%rax,%rdi,8)
@@ -3336,22 +3483,22 @@ _sk_store_f16_hsw LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  197,121,214,28,248                  ; vmovq         %xmm11,(%rax,%rdi,8)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,241                             ; je            33e5 <_sk_store_f16_hsw+0x61>
+  DB  116,241                             ; je            36d9 <_sk_store_f16_hsw+0x61>
   DB  197,121,23,92,248,8                 ; vmovhpd       %xmm11,0x8(%rax,%rdi,8)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,229                             ; jb            33e5 <_sk_store_f16_hsw+0x61>
+  DB  114,229                             ; jb            36d9 <_sk_store_f16_hsw+0x61>
   DB  197,121,214,84,248,16               ; vmovq         %xmm10,0x10(%rax,%rdi,8)
-  DB  116,221                             ; je            33e5 <_sk_store_f16_hsw+0x61>
+  DB  116,221                             ; je            36d9 <_sk_store_f16_hsw+0x61>
   DB  197,121,23,84,248,24                ; vmovhpd       %xmm10,0x18(%rax,%rdi,8)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,209                             ; jb            33e5 <_sk_store_f16_hsw+0x61>
+  DB  114,209                             ; jb            36d9 <_sk_store_f16_hsw+0x61>
   DB  197,121,214,76,248,32               ; vmovq         %xmm9,0x20(%rax,%rdi,8)
-  DB  116,201                             ; je            33e5 <_sk_store_f16_hsw+0x61>
+  DB  116,201                             ; je            36d9 <_sk_store_f16_hsw+0x61>
   DB  197,121,23,76,248,40                ; vmovhpd       %xmm9,0x28(%rax,%rdi,8)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,189                             ; jb            33e5 <_sk_store_f16_hsw+0x61>
+  DB  114,189                             ; jb            36d9 <_sk_store_f16_hsw+0x61>
   DB  197,121,214,68,248,48               ; vmovq         %xmm8,0x30(%rax,%rdi,8)
-  DB  235,181                             ; jmp           33e5 <_sk_store_f16_hsw+0x61>
+  DB  235,181                             ; jmp           36d9 <_sk_store_f16_hsw+0x61>
 
 PUBLIC _sk_load_u16_be_hsw
 _sk_load_u16_be_hsw LABEL PROC
@@ -3359,7 +3506,7 @@ _sk_load_u16_be_hsw LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  72,141,4,189,0,0,0,0                ; lea           0x0(,%rdi,4),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,205,0,0,0                    ; jne           3513 <_sk_load_u16_be_hsw+0xe3>
+  DB  15,133,205,0,0,0                    ; jne           3807 <_sk_load_u16_be_hsw+0xe3>
   DB  196,65,121,16,4,64                  ; vmovupd       (%r8,%rax,2),%xmm8
   DB  196,193,121,16,84,64,16             ; vmovupd       0x10(%r8,%rax,2),%xmm2
   DB  196,193,121,16,92,64,32             ; vmovupd       0x20(%r8,%rax,2),%xmm3
@@ -3408,29 +3555,29 @@ _sk_load_u16_be_hsw LABEL PROC
   DB  196,65,123,16,4,64                  ; vmovsd        (%r8,%rax,2),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,85                              ; je            3579 <_sk_load_u16_be_hsw+0x149>
+  DB  116,85                              ; je            386d <_sk_load_u16_be_hsw+0x149>
   DB  196,65,57,22,68,64,8                ; vmovhpd       0x8(%r8,%rax,2),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,72                              ; jb            3579 <_sk_load_u16_be_hsw+0x149>
+  DB  114,72                              ; jb            386d <_sk_load_u16_be_hsw+0x149>
   DB  196,193,123,16,84,64,16             ; vmovsd        0x10(%r8,%rax,2),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,72                              ; je            3586 <_sk_load_u16_be_hsw+0x156>
+  DB  116,72                              ; je            387a <_sk_load_u16_be_hsw+0x156>
   DB  196,193,105,22,84,64,24             ; vmovhpd       0x18(%r8,%rax,2),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,59                              ; jb            3586 <_sk_load_u16_be_hsw+0x156>
+  DB  114,59                              ; jb            387a <_sk_load_u16_be_hsw+0x156>
   DB  196,193,123,16,92,64,32             ; vmovsd        0x20(%r8,%rax,2),%xmm3
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,5,255,255,255                ; je            3461 <_sk_load_u16_be_hsw+0x31>
+  DB  15,132,5,255,255,255                ; je            3755 <_sk_load_u16_be_hsw+0x31>
   DB  196,193,97,22,92,64,40              ; vmovhpd       0x28(%r8,%rax,2),%xmm3,%xmm3
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,244,254,255,255              ; jb            3461 <_sk_load_u16_be_hsw+0x31>
+  DB  15,130,244,254,255,255              ; jb            3755 <_sk_load_u16_be_hsw+0x31>
   DB  196,65,122,126,76,64,48             ; vmovq         0x30(%r8,%rax,2),%xmm9
-  DB  233,232,254,255,255                 ; jmpq          3461 <_sk_load_u16_be_hsw+0x31>
+  DB  233,232,254,255,255                 ; jmpq          3755 <_sk_load_u16_be_hsw+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,219,254,255,255                 ; jmpq          3461 <_sk_load_u16_be_hsw+0x31>
+  DB  233,219,254,255,255                 ; jmpq          3755 <_sk_load_u16_be_hsw+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,210,254,255,255                 ; jmpq          3461 <_sk_load_u16_be_hsw+0x31>
+  DB  233,210,254,255,255                 ; jmpq          3755 <_sk_load_u16_be_hsw+0x31>
 
 PUBLIC _sk_load_rgb_u16_be_hsw
 _sk_load_rgb_u16_be_hsw LABEL PROC
@@ -3438,7 +3585,7 @@ _sk_load_rgb_u16_be_hsw LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  72,141,4,127                        ; lea           (%rdi,%rdi,2),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,211,0,0,0                    ; jne           3674 <_sk_load_rgb_u16_be_hsw+0xe5>
+  DB  15,133,211,0,0,0                    ; jne           3968 <_sk_load_rgb_u16_be_hsw+0xe5>
   DB  196,193,122,111,4,64                ; vmovdqu       (%r8,%rax,2),%xmm0
   DB  196,193,122,111,84,64,12            ; vmovdqu       0xc(%r8,%rax,2),%xmm2
   DB  196,193,122,111,76,64,24            ; vmovdqu       0x18(%r8,%rax,2),%xmm1
@@ -3488,36 +3635,36 @@ _sk_load_rgb_u16_be_hsw LABEL PROC
   DB  196,193,121,110,4,64                ; vmovd         (%r8,%rax,2),%xmm0
   DB  196,193,121,196,68,64,4,2           ; vpinsrw       $0x2,0x4(%r8,%rax,2),%xmm0,%xmm0
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  117,5                               ; jne           368d <_sk_load_rgb_u16_be_hsw+0xfe>
-  DB  233,72,255,255,255                  ; jmpq          35d5 <_sk_load_rgb_u16_be_hsw+0x46>
+  DB  117,5                               ; jne           3981 <_sk_load_rgb_u16_be_hsw+0xfe>
+  DB  233,72,255,255,255                  ; jmpq          38c9 <_sk_load_rgb_u16_be_hsw+0x46>
   DB  196,193,121,110,76,64,6             ; vmovd         0x6(%r8,%rax,2),%xmm1
   DB  196,65,113,196,68,64,10,2           ; vpinsrw       $0x2,0xa(%r8,%rax,2),%xmm1,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,26                              ; jb            36bc <_sk_load_rgb_u16_be_hsw+0x12d>
+  DB  114,26                              ; jb            39b0 <_sk_load_rgb_u16_be_hsw+0x12d>
   DB  196,193,121,110,76,64,12            ; vmovd         0xc(%r8,%rax,2),%xmm1
   DB  196,193,113,196,84,64,16,2          ; vpinsrw       $0x2,0x10(%r8,%rax,2),%xmm1,%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  117,10                              ; jne           36c1 <_sk_load_rgb_u16_be_hsw+0x132>
-  DB  233,25,255,255,255                  ; jmpq          35d5 <_sk_load_rgb_u16_be_hsw+0x46>
-  DB  233,20,255,255,255                  ; jmpq          35d5 <_sk_load_rgb_u16_be_hsw+0x46>
+  DB  117,10                              ; jne           39b5 <_sk_load_rgb_u16_be_hsw+0x132>
+  DB  233,25,255,255,255                  ; jmpq          38c9 <_sk_load_rgb_u16_be_hsw+0x46>
+  DB  233,20,255,255,255                  ; jmpq          38c9 <_sk_load_rgb_u16_be_hsw+0x46>
   DB  196,193,121,110,76,64,18            ; vmovd         0x12(%r8,%rax,2),%xmm1
   DB  196,65,113,196,76,64,22,2           ; vpinsrw       $0x2,0x16(%r8,%rax,2),%xmm1,%xmm9
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,26                              ; jb            36f0 <_sk_load_rgb_u16_be_hsw+0x161>
+  DB  114,26                              ; jb            39e4 <_sk_load_rgb_u16_be_hsw+0x161>
   DB  196,193,121,110,76,64,24            ; vmovd         0x18(%r8,%rax,2),%xmm1
   DB  196,193,113,196,76,64,28,2          ; vpinsrw       $0x2,0x1c(%r8,%rax,2),%xmm1,%xmm1
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  117,10                              ; jne           36f5 <_sk_load_rgb_u16_be_hsw+0x166>
-  DB  233,229,254,255,255                 ; jmpq          35d5 <_sk_load_rgb_u16_be_hsw+0x46>
-  DB  233,224,254,255,255                 ; jmpq          35d5 <_sk_load_rgb_u16_be_hsw+0x46>
+  DB  117,10                              ; jne           39e9 <_sk_load_rgb_u16_be_hsw+0x166>
+  DB  233,229,254,255,255                 ; jmpq          38c9 <_sk_load_rgb_u16_be_hsw+0x46>
+  DB  233,224,254,255,255                 ; jmpq          38c9 <_sk_load_rgb_u16_be_hsw+0x46>
   DB  196,193,121,110,92,64,30            ; vmovd         0x1e(%r8,%rax,2),%xmm3
   DB  196,65,97,196,92,64,34,2            ; vpinsrw       $0x2,0x22(%r8,%rax,2),%xmm3,%xmm11
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,20                              ; jb            371e <_sk_load_rgb_u16_be_hsw+0x18f>
+  DB  114,20                              ; jb            3a12 <_sk_load_rgb_u16_be_hsw+0x18f>
   DB  196,193,121,110,92,64,36            ; vmovd         0x24(%r8,%rax,2),%xmm3
   DB  196,193,97,196,92,64,40,2           ; vpinsrw       $0x2,0x28(%r8,%rax,2),%xmm3,%xmm3
-  DB  233,183,254,255,255                 ; jmpq          35d5 <_sk_load_rgb_u16_be_hsw+0x46>
-  DB  233,178,254,255,255                 ; jmpq          35d5 <_sk_load_rgb_u16_be_hsw+0x46>
+  DB  233,183,254,255,255                 ; jmpq          38c9 <_sk_load_rgb_u16_be_hsw+0x46>
+  DB  233,178,254,255,255                 ; jmpq          38c9 <_sk_load_rgb_u16_be_hsw+0x46>
 
 PUBLIC _sk_store_u16_be_hsw
 _sk_store_u16_be_hsw LABEL PROC
@@ -3564,7 +3711,7 @@ _sk_store_u16_be_hsw LABEL PROC
   DB  196,65,17,98,200                    ; vpunpckldq    %xmm8,%xmm13,%xmm9
   DB  196,65,17,106,192                   ; vpunpckhdq    %xmm8,%xmm13,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,31                              ; jne           381e <_sk_store_u16_be_hsw+0xfb>
+  DB  117,31                              ; jne           3b12 <_sk_store_u16_be_hsw+0xfb>
   DB  196,1,120,17,28,72                  ; vmovups       %xmm11,(%r8,%r9,2)
   DB  196,1,120,17,84,72,16               ; vmovups       %xmm10,0x10(%r8,%r9,2)
   DB  196,1,120,17,76,72,32               ; vmovups       %xmm9,0x20(%r8,%r9,2)
@@ -3573,31 +3720,31 @@ _sk_store_u16_be_hsw LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  196,1,121,214,28,72                 ; vmovq         %xmm11,(%r8,%r9,2)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            381a <_sk_store_u16_be_hsw+0xf7>
+  DB  116,240                             ; je            3b0e <_sk_store_u16_be_hsw+0xf7>
   DB  196,1,121,23,92,72,8                ; vmovhpd       %xmm11,0x8(%r8,%r9,2)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            381a <_sk_store_u16_be_hsw+0xf7>
+  DB  114,227                             ; jb            3b0e <_sk_store_u16_be_hsw+0xf7>
   DB  196,1,121,214,84,72,16              ; vmovq         %xmm10,0x10(%r8,%r9,2)
-  DB  116,218                             ; je            381a <_sk_store_u16_be_hsw+0xf7>
+  DB  116,218                             ; je            3b0e <_sk_store_u16_be_hsw+0xf7>
   DB  196,1,121,23,84,72,24               ; vmovhpd       %xmm10,0x18(%r8,%r9,2)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            381a <_sk_store_u16_be_hsw+0xf7>
+  DB  114,205                             ; jb            3b0e <_sk_store_u16_be_hsw+0xf7>
   DB  196,1,121,214,76,72,32              ; vmovq         %xmm9,0x20(%r8,%r9,2)
-  DB  116,196                             ; je            381a <_sk_store_u16_be_hsw+0xf7>
+  DB  116,196                             ; je            3b0e <_sk_store_u16_be_hsw+0xf7>
   DB  196,1,121,23,76,72,40               ; vmovhpd       %xmm9,0x28(%r8,%r9,2)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,183                             ; jb            381a <_sk_store_u16_be_hsw+0xf7>
+  DB  114,183                             ; jb            3b0e <_sk_store_u16_be_hsw+0xf7>
   DB  196,1,121,214,68,72,48              ; vmovq         %xmm8,0x30(%r8,%r9,2)
-  DB  235,174                             ; jmp           381a <_sk_store_u16_be_hsw+0xf7>
+  DB  235,174                             ; jmp           3b0e <_sk_store_u16_be_hsw+0xf7>
 
 PUBLIC _sk_load_f32_hsw
 _sk_load_f32_hsw LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  119,110                             ; ja            38e2 <_sk_load_f32_hsw+0x76>
+  DB  119,110                             ; ja            3bd6 <_sk_load_f32_hsw+0x76>
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
-  DB  76,141,21,134,0,0,0                 ; lea           0x86(%rip),%r10        # 390c <_sk_load_f32_hsw+0xa0>
+  DB  76,141,21,134,0,0,0                 ; lea           0x86(%rip),%r10        # 3c00 <_sk_load_f32_hsw+0xa0>
   DB  73,99,4,138                         ; movslq        (%r10,%rcx,4),%rax
   DB  76,1,208                            ; add           %r10,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -3654,7 +3801,7 @@ _sk_store_f32_hsw LABEL PROC
   DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
   DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           3999 <_sk_store_f32_hsw+0x6d>
+  DB  117,55                              ; jne           3c8d <_sk_store_f32_hsw+0x6d>
   DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -3667,22 +3814,22 @@ _sk_store_f32_hsw LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            3995 <_sk_store_f32_hsw+0x69>
+  DB  116,240                             ; je            3c89 <_sk_store_f32_hsw+0x69>
   DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            3995 <_sk_store_f32_hsw+0x69>
+  DB  114,227                             ; jb            3c89 <_sk_store_f32_hsw+0x69>
   DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            3995 <_sk_store_f32_hsw+0x69>
+  DB  116,218                             ; je            3c89 <_sk_store_f32_hsw+0x69>
   DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            3995 <_sk_store_f32_hsw+0x69>
+  DB  114,205                             ; jb            3c89 <_sk_store_f32_hsw+0x69>
   DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            3995 <_sk_store_f32_hsw+0x69>
+  DB  116,195                             ; je            3c89 <_sk_store_f32_hsw+0x69>
   DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            3995 <_sk_store_f32_hsw+0x69>
+  DB  114,181                             ; jb            3c89 <_sk_store_f32_hsw+0x69>
   DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           3995 <_sk_store_f32_hsw+0x69>
+  DB  235,171                             ; jmp           3c89 <_sk_store_f32_hsw+0x69>
 
 PUBLIC _sk_clamp_x_hsw
 _sk_clamp_x_hsw LABEL PROC
@@ -3923,7 +4070,7 @@ _sk_linear_gradient_hsw LABEL PROC
   DB  196,98,125,24,72,28                 ; vbroadcastss  0x1c(%rax),%ymm9
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  15,132,143,0,0,0                    ; je            3e25 <_sk_linear_gradient_hsw+0xb5>
+  DB  15,132,143,0,0,0                    ; je            4119 <_sk_linear_gradient_hsw+0xb5>
   DB  72,139,64,8                         ; mov           0x8(%rax),%rax
   DB  72,131,192,32                       ; add           $0x20,%rax
   DB  196,65,28,87,228                    ; vxorps        %ymm12,%ymm12,%ymm12
@@ -3950,8 +4097,8 @@ _sk_linear_gradient_hsw LABEL PROC
   DB  196,67,13,74,201,208                ; vblendvps     %ymm13,%ymm9,%ymm14,%ymm9
   DB  72,131,192,36                       ; add           $0x24,%rax
   DB  73,255,200                          ; dec           %r8
-  DB  117,140                             ; jne           3daf <_sk_linear_gradient_hsw+0x3f>
-  DB  235,17                              ; jmp           3e36 <_sk_linear_gradient_hsw+0xc6>
+  DB  117,140                             ; jne           40a3 <_sk_linear_gradient_hsw+0x3f>
+  DB  235,17                              ; jmp           412a <_sk_linear_gradient_hsw+0xc6>
   DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
   DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
   DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
@@ -5375,83 +5522,272 @@ _sk_to_srgb_avx LABEL PROC
 
 PUBLIC _sk_from_2dot2_avx
 _sk_from_2dot2_avx LABEL PROC
-  DB  197,124,82,192                      ; vrsqrtps      %ymm0,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,200                   ; vrsqrtps      %ymm8,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  197,252,89,192                      ; vmulps        %ymm0,%ymm0,%ymm0
-  DB  196,65,60,89,208                    ; vmulps        %ymm8,%ymm8,%ymm10
-  DB  196,65,60,89,194                    ; vmulps        %ymm10,%ymm8,%ymm8
-  DB  196,193,124,89,192                  ; vmulps        %ymm8,%ymm0,%ymm0
-  DB  197,180,89,192                      ; vmulps        %ymm0,%ymm9,%ymm0
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,193,124,95,192                  ; vmaxps        %ymm8,%ymm0,%ymm0
-  DB  197,124,82,201                      ; vrsqrtps      %ymm1,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  196,65,124,82,210                   ; vrsqrtps      %ymm10,%ymm10
-  DB  197,244,89,201                      ; vmulps        %ymm1,%ymm1,%ymm1
-  DB  196,65,52,89,217                    ; vmulps        %ymm9,%ymm9,%ymm11
-  DB  196,65,52,89,203                    ; vmulps        %ymm11,%ymm9,%ymm9
-  DB  196,193,116,89,201                  ; vmulps        %ymm9,%ymm1,%ymm1
-  DB  197,172,89,201                      ; vmulps        %ymm1,%ymm10,%ymm1
-  DB  196,193,116,95,200                  ; vmaxps        %ymm8,%ymm1,%ymm1
-  DB  197,124,82,202                      ; vrsqrtps      %ymm2,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  196,65,124,82,210                   ; vrsqrtps      %ymm10,%ymm10
-  DB  197,236,89,210                      ; vmulps        %ymm2,%ymm2,%ymm2
-  DB  196,65,52,89,217                    ; vmulps        %ymm9,%ymm9,%ymm11
-  DB  196,65,52,89,203                    ; vmulps        %ymm11,%ymm9,%ymm9
-  DB  196,193,108,89,209                  ; vmulps        %ymm9,%ymm2,%ymm2
-  DB  197,172,89,210                      ; vmulps        %ymm2,%ymm10,%ymm2
-  DB  196,193,108,95,208                  ; vmaxps        %ymm8,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,129,236,216,0,0,0                ; sub           $0xd8,%rsp
+  DB  197,252,17,188,36,160,0,0,0         ; vmovups       %ymm7,0xa0(%rsp)
+  DB  197,252,17,180,36,128,0,0,0         ; vmovups       %ymm6,0x80(%rsp)
+  DB  197,252,17,108,36,96                ; vmovups       %ymm5,0x60(%rsp)
+  DB  197,252,17,100,36,64                ; vmovups       %ymm4,0x40(%rsp)
+  DB  197,252,17,92,36,32                 ; vmovups       %ymm3,0x20(%rsp)
+  DB  197,252,17,20,36                    ; vmovups       %ymm2,(%rsp)
+  DB  197,252,40,241                      ; vmovaps       %ymm1,%ymm6
+  DB  65,184,205,204,12,64                ; mov           $0x400ccccd,%r8d
+  DB  197,252,91,200                      ; vcvtdq2ps     %ymm0,%ymm1
+  DB  184,0,0,0,52                        ; mov           $0x34000000,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,194,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm8
+  DB  196,193,116,89,200                  ; vmulps        %ymm8,%ymm1,%ymm1
+  DB  184,255,255,127,0                   ; mov           $0x7fffff,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  197,249,112,210,0                   ; vpshufd       $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,202,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
+  DB  197,180,84,192                      ; vandps        %ymm0,%ymm9,%ymm0
+  DB  184,0,0,0,63                        ; mov           $0x3f000000,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  197,249,112,210,0                   ; vpshufd       $0x0,%xmm2,%xmm2
+  DB  196,227,109,24,234,1                ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm5
+  DB  197,252,86,197                      ; vorps         %ymm5,%ymm0,%ymm0
+  DB  184,119,115,248,66                  ; mov           $0x42f87377,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,210,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm10
+  DB  196,193,116,92,202                  ; vsubps        %ymm10,%ymm1,%ymm1
+  DB  184,117,191,191,63                  ; mov           $0x3fbfbf75,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,218,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm11
+  DB  196,193,124,89,211                  ; vmulps        %ymm11,%ymm0,%ymm2
+  DB  197,244,92,202                      ; vsubps        %ymm2,%ymm1,%ymm1
+  DB  184,163,233,220,63                  ; mov           $0x3fdce9a3,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,226,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm12
+  DB  184,249,68,180,62                   ; mov           $0x3eb444f9,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,234,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm13
+  DB  196,193,124,88,197                  ; vaddps        %ymm13,%ymm0,%ymm0
+  DB  197,156,94,192                      ; vdivps        %ymm0,%ymm12,%ymm0
+  DB  197,244,92,192                      ; vsubps        %ymm0,%ymm1,%ymm0
+  DB  196,193,121,110,200                 ; vmovd         %r8d,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,99,117,24,241,1                 ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm14
+  DB  197,140,89,192                      ; vmulps        %ymm0,%ymm14,%ymm0
+  DB  196,227,125,8,200,1                 ; vroundps      $0x1,%ymm0,%ymm1
+  DB  197,252,92,225                      ; vsubps        %ymm1,%ymm0,%ymm4
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  184,81,140,242,66                   ; mov           $0x42f28c51,%eax
+  DB  197,249,110,200                     ; vmovd         %eax,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,99,117,24,249,1                 ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm15
+  DB  197,132,88,192                      ; vaddps        %ymm0,%ymm15,%ymm0
+  DB  184,141,188,190,63                  ; mov           $0x3fbebc8d,%eax
+  DB  197,249,110,200                     ; vmovd         %eax,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,227,117,24,217,1                ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm3
+  DB  197,228,89,204                      ; vmulps        %ymm4,%ymm3,%ymm1
+  DB  197,252,92,209                      ; vsubps        %ymm1,%ymm0,%ymm2
+  DB  184,254,210,221,65                  ; mov           $0x41ddd2fe,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,227,121,4,192,0                 ; vpermilps     $0x0,%xmm0,%xmm0
+  DB  196,227,125,24,200,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm1
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,227,121,4,192,0                 ; vpermilps     $0x0,%xmm0,%xmm0
+  DB  196,227,125,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  DB  197,252,92,228                      ; vsubps        %ymm4,%ymm0,%ymm4
+  DB  197,244,94,228                      ; vdivps        %ymm4,%ymm1,%ymm4
+  DB  197,236,88,228                      ; vaddps        %ymm4,%ymm2,%ymm4
+  DB  197,252,91,214                      ; vcvtdq2ps     %ymm6,%ymm2
+  DB  196,193,108,89,208                  ; vmulps        %ymm8,%ymm2,%ymm2
+  DB  197,180,84,246                      ; vandps        %ymm6,%ymm9,%ymm6
+  DB  197,204,86,245                      ; vorps         %ymm5,%ymm6,%ymm6
+  DB  196,193,108,92,210                  ; vsubps        %ymm10,%ymm2,%ymm2
+  DB  196,193,76,89,251                   ; vmulps        %ymm11,%ymm6,%ymm7
+  DB  197,236,92,215                      ; vsubps        %ymm7,%ymm2,%ymm2
+  DB  196,193,76,88,245                   ; vaddps        %ymm13,%ymm6,%ymm6
+  DB  197,156,94,246                      ; vdivps        %ymm6,%ymm12,%ymm6
+  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
+  DB  197,140,89,210                      ; vmulps        %ymm2,%ymm14,%ymm2
+  DB  196,227,125,8,242,1                 ; vroundps      $0x1,%ymm2,%ymm6
+  DB  197,236,92,246                      ; vsubps        %ymm6,%ymm2,%ymm6
+  DB  197,132,88,210                      ; vaddps        %ymm2,%ymm15,%ymm2
+  DB  197,228,89,254                      ; vmulps        %ymm6,%ymm3,%ymm7
+  DB  197,236,92,215                      ; vsubps        %ymm7,%ymm2,%ymm2
+  DB  197,252,92,246                      ; vsubps        %ymm6,%ymm0,%ymm6
+  DB  197,244,94,246                      ; vdivps        %ymm6,%ymm1,%ymm6
+  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
+  DB  197,252,16,60,36                    ; vmovups       (%rsp),%ymm7
+  DB  197,252,91,247                      ; vcvtdq2ps     %ymm7,%ymm6
+  DB  196,193,76,89,240                   ; vmulps        %ymm8,%ymm6,%ymm6
+  DB  197,180,84,255                      ; vandps        %ymm7,%ymm9,%ymm7
+  DB  197,196,86,237                      ; vorps         %ymm5,%ymm7,%ymm5
+  DB  196,193,76,92,242                   ; vsubps        %ymm10,%ymm6,%ymm6
+  DB  196,193,84,89,251                   ; vmulps        %ymm11,%ymm5,%ymm7
+  DB  197,204,92,247                      ; vsubps        %ymm7,%ymm6,%ymm6
+  DB  196,193,84,88,237                   ; vaddps        %ymm13,%ymm5,%ymm5
+  DB  197,156,94,237                      ; vdivps        %ymm5,%ymm12,%ymm5
+  DB  197,204,92,237                      ; vsubps        %ymm5,%ymm6,%ymm5
+  DB  197,140,89,237                      ; vmulps        %ymm5,%ymm14,%ymm5
+  DB  196,227,125,8,245,1                 ; vroundps      $0x1,%ymm5,%ymm6
+  DB  197,212,92,246                      ; vsubps        %ymm6,%ymm5,%ymm6
+  DB  197,132,88,237                      ; vaddps        %ymm5,%ymm15,%ymm5
+  DB  197,228,89,222                      ; vmulps        %ymm6,%ymm3,%ymm3
+  DB  197,212,92,219                      ; vsubps        %ymm3,%ymm5,%ymm3
+  DB  197,252,92,198                      ; vsubps        %ymm6,%ymm0,%ymm0
+  DB  197,244,94,192                      ; vdivps        %ymm0,%ymm1,%ymm0
+  DB  197,228,88,192                      ; vaddps        %ymm0,%ymm3,%ymm0
+  DB  196,193,121,110,200                 ; vmovd         %r8d,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,227,117,24,201,1                ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
+  DB  197,244,89,220                      ; vmulps        %ymm4,%ymm1,%ymm3
+  DB  197,244,89,210                      ; vmulps        %ymm2,%ymm1,%ymm2
+  DB  197,244,89,224                      ; vmulps        %ymm0,%ymm1,%ymm4
+  DB  197,253,91,195                      ; vcvtps2dq     %ymm3,%ymm0
+  DB  197,253,91,202                      ; vcvtps2dq     %ymm2,%ymm1
+  DB  197,253,91,212                      ; vcvtps2dq     %ymm4,%ymm2
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  197,252,16,92,36,32                 ; vmovups       0x20(%rsp),%ymm3
+  DB  197,252,16,100,36,64                ; vmovups       0x40(%rsp),%ymm4
+  DB  197,252,16,108,36,96                ; vmovups       0x60(%rsp),%ymm5
+  DB  197,252,16,180,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm6
+  DB  197,252,16,188,36,160,0,0,0         ; vmovups       0xa0(%rsp),%ymm7
+  DB  72,129,196,216,0,0,0                ; add           $0xd8,%rsp
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_to_2dot2_avx
 _sk_to_2dot2_avx LABEL PROC
-  DB  197,252,82,192                      ; vrsqrtps      %ymm0,%ymm0
-  DB  197,124,82,192                      ; vrsqrtps      %ymm0,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,192                   ; vrsqrtps      %ymm8,%ymm8
-  DB  196,65,124,82,200                   ; vrsqrtps      %ymm8,%ymm9
-  DB  197,252,83,192                      ; vrcpps        %ymm0,%ymm0
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  196,65,124,83,193                   ; vrcpps        %ymm9,%ymm8
-  DB  196,193,124,89,192                  ; vmulps        %ymm8,%ymm0,%ymm0
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,193,124,95,192                  ; vmaxps        %ymm8,%ymm0,%ymm0
-  DB  197,252,82,201                      ; vrsqrtps      %ymm1,%ymm1
-  DB  197,124,82,201                      ; vrsqrtps      %ymm1,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  197,252,83,201                      ; vrcpps        %ymm1,%ymm1
-  DB  197,180,89,201                      ; vmulps        %ymm1,%ymm9,%ymm1
-  DB  196,65,124,83,202                   ; vrcpps        %ymm10,%ymm9
-  DB  196,193,116,89,201                  ; vmulps        %ymm9,%ymm1,%ymm1
-  DB  196,193,116,95,200                  ; vmaxps        %ymm8,%ymm1,%ymm1
-  DB  197,252,82,210                      ; vrsqrtps      %ymm2,%ymm2
-  DB  197,124,82,202                      ; vrsqrtps      %ymm2,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,124,82,209                   ; vrsqrtps      %ymm9,%ymm10
-  DB  197,252,83,210                      ; vrcpps        %ymm2,%ymm2
-  DB  197,180,89,210                      ; vmulps        %ymm2,%ymm9,%ymm2
-  DB  196,65,124,83,202                   ; vrcpps        %ymm10,%ymm9
-  DB  196,193,108,89,209                  ; vmulps        %ymm9,%ymm2,%ymm2
-  DB  196,193,108,95,208                  ; vmaxps        %ymm8,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  72,129,236,216,0,0,0                ; sub           $0xd8,%rsp
+  DB  197,252,17,188,36,160,0,0,0         ; vmovups       %ymm7,0xa0(%rsp)
+  DB  197,252,17,180,36,128,0,0,0         ; vmovups       %ymm6,0x80(%rsp)
+  DB  197,252,17,108,36,96                ; vmovups       %ymm5,0x60(%rsp)
+  DB  197,252,17,100,36,64                ; vmovups       %ymm4,0x40(%rsp)
+  DB  197,252,17,92,36,32                 ; vmovups       %ymm3,0x20(%rsp)
+  DB  197,252,17,20,36                    ; vmovups       %ymm2,(%rsp)
+  DB  197,252,40,241                      ; vmovaps       %ymm1,%ymm6
+  DB  65,184,46,186,232,62                ; mov           $0x3ee8ba2e,%r8d
+  DB  197,252,91,200                      ; vcvtdq2ps     %ymm0,%ymm1
+  DB  184,0,0,0,52                        ; mov           $0x34000000,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,194,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm8
+  DB  196,193,116,89,200                  ; vmulps        %ymm8,%ymm1,%ymm1
+  DB  184,255,255,127,0                   ; mov           $0x7fffff,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  197,249,112,210,0                   ; vpshufd       $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,202,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm9
+  DB  197,180,84,192                      ; vandps        %ymm0,%ymm9,%ymm0
+  DB  184,0,0,0,63                        ; mov           $0x3f000000,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  197,249,112,210,0                   ; vpshufd       $0x0,%xmm2,%xmm2
+  DB  196,227,109,24,234,1                ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm5
+  DB  197,252,86,197                      ; vorps         %ymm5,%ymm0,%ymm0
+  DB  184,119,115,248,66                  ; mov           $0x42f87377,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,210,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm10
+  DB  196,193,116,92,202                  ; vsubps        %ymm10,%ymm1,%ymm1
+  DB  184,117,191,191,63                  ; mov           $0x3fbfbf75,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,218,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm11
+  DB  196,193,124,89,211                  ; vmulps        %ymm11,%ymm0,%ymm2
+  DB  197,244,92,202                      ; vsubps        %ymm2,%ymm1,%ymm1
+  DB  184,163,233,220,63                  ; mov           $0x3fdce9a3,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,226,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm12
+  DB  184,249,68,180,62                   ; mov           $0x3eb444f9,%eax
+  DB  197,249,110,208                     ; vmovd         %eax,%xmm2
+  DB  196,227,121,4,210,0                 ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,99,109,24,234,1                 ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm13
+  DB  196,193,124,88,197                  ; vaddps        %ymm13,%ymm0,%ymm0
+  DB  197,156,94,192                      ; vdivps        %ymm0,%ymm12,%ymm0
+  DB  197,244,92,192                      ; vsubps        %ymm0,%ymm1,%ymm0
+  DB  196,193,121,110,200                 ; vmovd         %r8d,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,99,117,24,241,1                 ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm14
+  DB  197,140,89,192                      ; vmulps        %ymm0,%ymm14,%ymm0
+  DB  196,227,125,8,200,1                 ; vroundps      $0x1,%ymm0,%ymm1
+  DB  197,252,92,225                      ; vsubps        %ymm1,%ymm0,%ymm4
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  184,81,140,242,66                   ; mov           $0x42f28c51,%eax
+  DB  197,249,110,200                     ; vmovd         %eax,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,99,117,24,249,1                 ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm15
+  DB  197,132,88,192                      ; vaddps        %ymm0,%ymm15,%ymm0
+  DB  184,141,188,190,63                  ; mov           $0x3fbebc8d,%eax
+  DB  197,249,110,200                     ; vmovd         %eax,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,227,117,24,217,1                ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm3
+  DB  197,228,89,204                      ; vmulps        %ymm4,%ymm3,%ymm1
+  DB  197,252,92,209                      ; vsubps        %ymm1,%ymm0,%ymm2
+  DB  184,254,210,221,65                  ; mov           $0x41ddd2fe,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,227,121,4,192,0                 ; vpermilps     $0x0,%xmm0,%xmm0
+  DB  196,227,125,24,200,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm1
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  197,249,110,192                     ; vmovd         %eax,%xmm0
+  DB  196,227,121,4,192,0                 ; vpermilps     $0x0,%xmm0,%xmm0
+  DB  196,227,125,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  DB  197,252,92,228                      ; vsubps        %ymm4,%ymm0,%ymm4
+  DB  197,244,94,228                      ; vdivps        %ymm4,%ymm1,%ymm4
+  DB  197,236,88,228                      ; vaddps        %ymm4,%ymm2,%ymm4
+  DB  197,252,91,214                      ; vcvtdq2ps     %ymm6,%ymm2
+  DB  196,193,108,89,208                  ; vmulps        %ymm8,%ymm2,%ymm2
+  DB  197,180,84,246                      ; vandps        %ymm6,%ymm9,%ymm6
+  DB  197,204,86,245                      ; vorps         %ymm5,%ymm6,%ymm6
+  DB  196,193,108,92,210                  ; vsubps        %ymm10,%ymm2,%ymm2
+  DB  196,193,76,89,251                   ; vmulps        %ymm11,%ymm6,%ymm7
+  DB  197,236,92,215                      ; vsubps        %ymm7,%ymm2,%ymm2
+  DB  196,193,76,88,245                   ; vaddps        %ymm13,%ymm6,%ymm6
+  DB  197,156,94,246                      ; vdivps        %ymm6,%ymm12,%ymm6
+  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
+  DB  197,140,89,210                      ; vmulps        %ymm2,%ymm14,%ymm2
+  DB  196,227,125,8,242,1                 ; vroundps      $0x1,%ymm2,%ymm6
+  DB  197,236,92,246                      ; vsubps        %ymm6,%ymm2,%ymm6
+  DB  197,132,88,210                      ; vaddps        %ymm2,%ymm15,%ymm2
+  DB  197,228,89,254                      ; vmulps        %ymm6,%ymm3,%ymm7
+  DB  197,236,92,215                      ; vsubps        %ymm7,%ymm2,%ymm2
+  DB  197,252,92,246                      ; vsubps        %ymm6,%ymm0,%ymm6
+  DB  197,244,94,246                      ; vdivps        %ymm6,%ymm1,%ymm6
+  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
+  DB  197,252,16,60,36                    ; vmovups       (%rsp),%ymm7
+  DB  197,252,91,247                      ; vcvtdq2ps     %ymm7,%ymm6
+  DB  196,193,76,89,240                   ; vmulps        %ymm8,%ymm6,%ymm6
+  DB  197,180,84,255                      ; vandps        %ymm7,%ymm9,%ymm7
+  DB  197,196,86,237                      ; vorps         %ymm5,%ymm7,%ymm5
+  DB  196,193,76,92,242                   ; vsubps        %ymm10,%ymm6,%ymm6
+  DB  196,193,84,89,251                   ; vmulps        %ymm11,%ymm5,%ymm7
+  DB  197,204,92,247                      ; vsubps        %ymm7,%ymm6,%ymm6
+  DB  196,193,84,88,237                   ; vaddps        %ymm13,%ymm5,%ymm5
+  DB  197,156,94,237                      ; vdivps        %ymm5,%ymm12,%ymm5
+  DB  197,204,92,237                      ; vsubps        %ymm5,%ymm6,%ymm5
+  DB  197,140,89,237                      ; vmulps        %ymm5,%ymm14,%ymm5
+  DB  196,227,125,8,245,1                 ; vroundps      $0x1,%ymm5,%ymm6
+  DB  197,212,92,246                      ; vsubps        %ymm6,%ymm5,%ymm6
+  DB  197,132,88,237                      ; vaddps        %ymm5,%ymm15,%ymm5
+  DB  197,228,89,222                      ; vmulps        %ymm6,%ymm3,%ymm3
+  DB  197,212,92,219                      ; vsubps        %ymm3,%ymm5,%ymm3
+  DB  197,252,92,198                      ; vsubps        %ymm6,%ymm0,%ymm0
+  DB  197,244,94,192                      ; vdivps        %ymm0,%ymm1,%ymm0
+  DB  197,228,88,192                      ; vaddps        %ymm0,%ymm3,%ymm0
+  DB  196,193,121,110,200                 ; vmovd         %r8d,%xmm1
+  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
+  DB  196,227,117,24,201,1                ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
+  DB  197,244,89,220                      ; vmulps        %ymm4,%ymm1,%ymm3
+  DB  197,244,89,210                      ; vmulps        %ymm2,%ymm1,%ymm2
+  DB  197,244,89,224                      ; vmulps        %ymm0,%ymm1,%ymm4
+  DB  197,253,91,195                      ; vcvtps2dq     %ymm3,%ymm0
+  DB  197,253,91,202                      ; vcvtps2dq     %ymm2,%ymm1
+  DB  197,253,91,212                      ; vcvtps2dq     %ymm4,%ymm2
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  197,252,16,92,36,32                 ; vmovups       0x20(%rsp),%ymm3
+  DB  197,252,16,100,36,64                ; vmovups       0x40(%rsp),%ymm4
+  DB  197,252,16,108,36,96                ; vmovups       0x60(%rsp),%ymm5
+  DB  197,252,16,180,36,128,0,0,0         ; vmovups       0x80(%rsp),%ymm6
+  DB  197,252,16,188,36,160,0,0,0         ; vmovups       0xa0(%rsp),%ymm7
+  DB  72,129,196,216,0,0,0                ; add           $0xd8,%rsp
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_rgb_to_hsl_avx
@@ -5655,7 +5991,7 @@ _sk_scale_u8_avx LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,80                              ; jne           14ff <_sk_scale_u8_avx+0x60>
+  DB  117,80                              ; jne           18bb <_sk_scale_u8_avx+0x60>
   DB  197,122,126,0                       ; vmovq         (%rax),%xmm8
   DB  196,66,121,49,200                   ; vpmovzxbd     %xmm8,%xmm9
   DB  196,67,121,4,192,229                ; vpermilps     $0xe5,%xmm8,%xmm8
@@ -5683,9 +6019,9 @@ _sk_scale_u8_avx LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           1507 <_sk_scale_u8_avx+0x68>
+  DB  117,234                             ; jne           18c3 <_sk_scale_u8_avx+0x68>
   DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  235,143                             ; jmp           14b3 <_sk_scale_u8_avx+0x14>
+  DB  235,143                             ; jmp           186f <_sk_scale_u8_avx+0x14>
 
 PUBLIC _sk_lerp_1_float_avx
 _sk_lerp_1_float_avx LABEL PROC
@@ -5713,7 +6049,7 @@ _sk_lerp_u8_avx LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,116                             ; jne           15e7 <_sk_lerp_u8_avx+0x84>
+  DB  117,116                             ; jne           19a3 <_sk_lerp_u8_avx+0x84>
   DB  197,122,126,0                       ; vmovq         (%rax),%xmm8
   DB  196,66,121,49,200                   ; vpmovzxbd     %xmm8,%xmm9
   DB  196,67,121,4,192,229                ; vpermilps     $0xe5,%xmm8,%xmm8
@@ -5749,16 +6085,16 @@ _sk_lerp_u8_avx LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           15ef <_sk_lerp_u8_avx+0x8c>
+  DB  117,234                             ; jne           19ab <_sk_lerp_u8_avx+0x8c>
   DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  233,104,255,255,255                 ; jmpq          1577 <_sk_lerp_u8_avx+0x14>
+  DB  233,104,255,255,255                 ; jmpq          1933 <_sk_lerp_u8_avx+0x14>
 
 PUBLIC _sk_lerp_565_avx
 _sk_lerp_565_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,250,0,0,0                    ; jne           1717 <_sk_lerp_565_avx+0x108>
+  DB  15,133,250,0,0,0                    ; jne           1ad3 <_sk_lerp_565_avx+0x108>
   DB  196,65,122,111,4,122                ; vmovdqu       (%r10,%rdi,2),%xmm8
   DB  197,225,239,219                     ; vpxor         %xmm3,%xmm3,%xmm3
   DB  197,185,105,219                     ; vpunpckhwd    %xmm3,%xmm8,%xmm3
@@ -5817,9 +6153,9 @@ _sk_lerp_565_avx LABEL PROC
   DB  196,65,57,239,192                   ; vpxor         %xmm8,%xmm8,%xmm8
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,243,254,255,255              ; ja            1623 <_sk_lerp_565_avx+0x14>
+  DB  15,135,243,254,255,255              ; ja            19df <_sk_lerp_565_avx+0x14>
   DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  76,141,13,73,0,0,0                  ; lea           0x49(%rip),%r9        # 1784 <_sk_lerp_565_avx+0x175>
+  DB  76,141,13,73,0,0,0                  ; lea           0x49(%rip),%r9        # 1b40 <_sk_lerp_565_avx+0x175>
   DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
   DB  76,1,200                            ; add           %r9,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -5831,7 +6167,7 @@ _sk_lerp_565_avx LABEL PROC
   DB  196,65,57,196,68,122,4,2            ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
   DB  196,65,57,196,68,122,2,1            ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
   DB  196,65,57,196,4,122,0               ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
-  DB  233,159,254,255,255                 ; jmpq          1623 <_sk_lerp_565_avx+0x14>
+  DB  233,159,254,255,255                 ; jmpq          19df <_sk_lerp_565_avx+0x14>
   DB  244                                 ; hlt
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -5867,7 +6203,7 @@ _sk_load_tables_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,56,2,0,0                     ; jne           19f0 <_sk_load_tables_avx+0x250>
+  DB  15,133,56,2,0,0                     ; jne           1dac <_sk_load_tables_avx+0x250>
   DB  196,65,124,16,4,184                 ; vmovups       (%r8,%rdi,4),%ymm8
   DB  187,255,0,0,0                       ; mov           $0xff,%ebx
   DB  197,249,110,195                     ; vmovd         %ebx,%xmm0
@@ -5986,9 +6322,9 @@ _sk_load_tables_avx LABEL PROC
   DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
   DB  254,203                             ; dec           %bl
   DB  128,251,6                           ; cmp           $0x6,%bl
-  DB  15,135,185,253,255,255              ; ja            17be <_sk_load_tables_avx+0x1e>
+  DB  15,135,185,253,255,255              ; ja            1b7a <_sk_load_tables_avx+0x1e>
   DB  15,182,219                          ; movzbl        %bl,%ebx
-  DB  76,141,13,137,0,0,0                 ; lea           0x89(%rip),%r9        # 1a98 <_sk_load_tables_avx+0x2f8>
+  DB  76,141,13,137,0,0,0                 ; lea           0x89(%rip),%r9        # 1e54 <_sk_load_tables_avx+0x2f8>
   DB  73,99,28,153                        ; movslq        (%r9,%rbx,4),%rbx
   DB  76,1,203                            ; add           %r9,%rbx
   DB  255,227                             ; jmpq          *%rbx
@@ -6011,7 +6347,7 @@ _sk_load_tables_avx LABEL PROC
   DB  196,99,61,12,192,15                 ; vblendps      $0xf,%ymm0,%ymm8,%ymm8
   DB  196,195,57,34,4,184,0               ; vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
   DB  196,99,61,12,192,15                 ; vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  DB  233,38,253,255,255                  ; jmpq          17be <_sk_load_tables_avx+0x1e>
+  DB  233,38,253,255,255                  ; jmpq          1b7a <_sk_load_tables_avx+0x1e>
   DB  238                                 ; out           %al,(%dx)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -6037,7 +6373,7 @@ _sk_load_tables_u16_be_avx LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,165,2,0,0                    ; jne           1d6f <_sk_load_tables_u16_be_avx+0x2bb>
+  DB  15,133,165,2,0,0                    ; jne           212b <_sk_load_tables_u16_be_avx+0x2bb>
   DB  196,1,121,16,4,72                   ; vmovupd       (%r8,%r9,2),%xmm8
   DB  196,129,121,16,84,72,16             ; vmovupd       0x10(%r8,%r9,2),%xmm2
   DB  196,129,121,16,92,72,32             ; vmovupd       0x20(%r8,%r9,2),%xmm3
@@ -6181,29 +6517,29 @@ _sk_load_tables_u16_be_avx LABEL PROC
   DB  196,1,123,16,4,72                   ; vmovsd        (%r8,%r9,2),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,85                              ; je            1dd5 <_sk_load_tables_u16_be_avx+0x321>
+  DB  116,85                              ; je            2191 <_sk_load_tables_u16_be_avx+0x321>
   DB  196,1,57,22,68,72,8                 ; vmovhpd       0x8(%r8,%r9,2),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,72                              ; jb            1dd5 <_sk_load_tables_u16_be_avx+0x321>
+  DB  114,72                              ; jb            2191 <_sk_load_tables_u16_be_avx+0x321>
   DB  196,129,123,16,84,72,16             ; vmovsd        0x10(%r8,%r9,2),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,72                              ; je            1de2 <_sk_load_tables_u16_be_avx+0x32e>
+  DB  116,72                              ; je            219e <_sk_load_tables_u16_be_avx+0x32e>
   DB  196,129,105,22,84,72,24             ; vmovhpd       0x18(%r8,%r9,2),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,59                              ; jb            1de2 <_sk_load_tables_u16_be_avx+0x32e>
+  DB  114,59                              ; jb            219e <_sk_load_tables_u16_be_avx+0x32e>
   DB  196,129,123,16,92,72,32             ; vmovsd        0x20(%r8,%r9,2),%xmm3
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,45,253,255,255               ; je            1ae5 <_sk_load_tables_u16_be_avx+0x31>
+  DB  15,132,45,253,255,255               ; je            1ea1 <_sk_load_tables_u16_be_avx+0x31>
   DB  196,129,97,22,92,72,40              ; vmovhpd       0x28(%r8,%r9,2),%xmm3,%xmm3
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,28,253,255,255               ; jb            1ae5 <_sk_load_tables_u16_be_avx+0x31>
+  DB  15,130,28,253,255,255               ; jb            1ea1 <_sk_load_tables_u16_be_avx+0x31>
   DB  196,1,122,126,76,72,48              ; vmovq         0x30(%r8,%r9,2),%xmm9
-  DB  233,16,253,255,255                  ; jmpq          1ae5 <_sk_load_tables_u16_be_avx+0x31>
+  DB  233,16,253,255,255                  ; jmpq          1ea1 <_sk_load_tables_u16_be_avx+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,3,253,255,255                   ; jmpq          1ae5 <_sk_load_tables_u16_be_avx+0x31>
+  DB  233,3,253,255,255                   ; jmpq          1ea1 <_sk_load_tables_u16_be_avx+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,250,252,255,255                 ; jmpq          1ae5 <_sk_load_tables_u16_be_avx+0x31>
+  DB  233,250,252,255,255                 ; jmpq          1ea1 <_sk_load_tables_u16_be_avx+0x31>
 
 PUBLIC _sk_load_tables_rgb_u16_be_avx
 _sk_load_tables_rgb_u16_be_avx LABEL PROC
@@ -6211,7 +6547,7 @@ _sk_load_tables_rgb_u16_be_avx LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  76,141,12,127                       ; lea           (%rdi,%rdi,2),%r9
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,146,2,0,0                    ; jne           208f <_sk_load_tables_rgb_u16_be_avx+0x2a4>
+  DB  15,133,146,2,0,0                    ; jne           244b <_sk_load_tables_rgb_u16_be_avx+0x2a4>
   DB  196,129,122,111,4,72                ; vmovdqu       (%r8,%r9,2),%xmm0
   DB  196,129,122,111,84,72,12            ; vmovdqu       0xc(%r8,%r9,2),%xmm2
   DB  196,129,122,111,76,72,24            ; vmovdqu       0x18(%r8,%r9,2),%xmm1
@@ -6351,36 +6687,36 @@ _sk_load_tables_rgb_u16_be_avx LABEL PROC
   DB  196,129,121,110,4,72                ; vmovd         (%r8,%r9,2),%xmm0
   DB  196,129,121,196,68,72,4,2           ; vpinsrw       $0x2,0x4(%r8,%r9,2),%xmm0,%xmm0
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  117,5                               ; jne           20a8 <_sk_load_tables_rgb_u16_be_avx+0x2bd>
-  DB  233,137,253,255,255                 ; jmpq          1e31 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  DB  117,5                               ; jne           2464 <_sk_load_tables_rgb_u16_be_avx+0x2bd>
+  DB  233,137,253,255,255                 ; jmpq          21ed <_sk_load_tables_rgb_u16_be_avx+0x46>
   DB  196,129,121,110,76,72,6             ; vmovd         0x6(%r8,%r9,2),%xmm1
   DB  196,1,113,196,68,72,10,2            ; vpinsrw       $0x2,0xa(%r8,%r9,2),%xmm1,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,26                              ; jb            20d7 <_sk_load_tables_rgb_u16_be_avx+0x2ec>
+  DB  114,26                              ; jb            2493 <_sk_load_tables_rgb_u16_be_avx+0x2ec>
   DB  196,129,121,110,76,72,12            ; vmovd         0xc(%r8,%r9,2),%xmm1
   DB  196,129,113,196,84,72,16,2          ; vpinsrw       $0x2,0x10(%r8,%r9,2),%xmm1,%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  117,10                              ; jne           20dc <_sk_load_tables_rgb_u16_be_avx+0x2f1>
-  DB  233,90,253,255,255                  ; jmpq          1e31 <_sk_load_tables_rgb_u16_be_avx+0x46>
-  DB  233,85,253,255,255                  ; jmpq          1e31 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  DB  117,10                              ; jne           2498 <_sk_load_tables_rgb_u16_be_avx+0x2f1>
+  DB  233,90,253,255,255                  ; jmpq          21ed <_sk_load_tables_rgb_u16_be_avx+0x46>
+  DB  233,85,253,255,255                  ; jmpq          21ed <_sk_load_tables_rgb_u16_be_avx+0x46>
   DB  196,129,121,110,76,72,18            ; vmovd         0x12(%r8,%r9,2),%xmm1
   DB  196,1,113,196,76,72,22,2            ; vpinsrw       $0x2,0x16(%r8,%r9,2),%xmm1,%xmm9
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,26                              ; jb            210b <_sk_load_tables_rgb_u16_be_avx+0x320>
+  DB  114,26                              ; jb            24c7 <_sk_load_tables_rgb_u16_be_avx+0x320>
   DB  196,129,121,110,76,72,24            ; vmovd         0x18(%r8,%r9,2),%xmm1
   DB  196,129,113,196,76,72,28,2          ; vpinsrw       $0x2,0x1c(%r8,%r9,2),%xmm1,%xmm1
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  117,10                              ; jne           2110 <_sk_load_tables_rgb_u16_be_avx+0x325>
-  DB  233,38,253,255,255                  ; jmpq          1e31 <_sk_load_tables_rgb_u16_be_avx+0x46>
-  DB  233,33,253,255,255                  ; jmpq          1e31 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  DB  117,10                              ; jne           24cc <_sk_load_tables_rgb_u16_be_avx+0x325>
+  DB  233,38,253,255,255                  ; jmpq          21ed <_sk_load_tables_rgb_u16_be_avx+0x46>
+  DB  233,33,253,255,255                  ; jmpq          21ed <_sk_load_tables_rgb_u16_be_avx+0x46>
   DB  196,129,121,110,92,72,30            ; vmovd         0x1e(%r8,%r9,2),%xmm3
   DB  196,1,97,196,92,72,34,2             ; vpinsrw       $0x2,0x22(%r8,%r9,2),%xmm3,%xmm11
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,20                              ; jb            2139 <_sk_load_tables_rgb_u16_be_avx+0x34e>
+  DB  114,20                              ; jb            24f5 <_sk_load_tables_rgb_u16_be_avx+0x34e>
   DB  196,129,121,110,92,72,36            ; vmovd         0x24(%r8,%r9,2),%xmm3
   DB  196,129,97,196,92,72,40,2           ; vpinsrw       $0x2,0x28(%r8,%r9,2),%xmm3,%xmm3
-  DB  233,248,252,255,255                 ; jmpq          1e31 <_sk_load_tables_rgb_u16_be_avx+0x46>
-  DB  233,243,252,255,255                 ; jmpq          1e31 <_sk_load_tables_rgb_u16_be_avx+0x46>
+  DB  233,248,252,255,255                 ; jmpq          21ed <_sk_load_tables_rgb_u16_be_avx+0x46>
+  DB  233,243,252,255,255                 ; jmpq          21ed <_sk_load_tables_rgb_u16_be_avx+0x46>
 
 PUBLIC _sk_byte_tables_avx
 _sk_byte_tables_avx LABEL PROC
@@ -7344,7 +7680,7 @@ _sk_load_a8_avx LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,74                              ; jne           32d0 <_sk_load_a8_avx+0x5a>
+  DB  117,74                              ; jne           368c <_sk_load_a8_avx+0x5a>
   DB  197,250,126,0                       ; vmovq         (%rax),%xmm0
   DB  196,226,121,49,200                  ; vpmovzxbd     %xmm0,%xmm1
   DB  196,227,121,4,192,229               ; vpermilps     $0xe5,%xmm0,%xmm0
@@ -7371,9 +7707,9 @@ _sk_load_a8_avx LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           32d8 <_sk_load_a8_avx+0x62>
+  DB  117,234                             ; jne           3694 <_sk_load_a8_avx+0x62>
   DB  196,193,249,110,193                 ; vmovq         %r9,%xmm0
-  DB  235,149                             ; jmp           328a <_sk_load_a8_avx+0x14>
+  DB  235,149                             ; jmp           3646 <_sk_load_a8_avx+0x14>
 
 PUBLIC _sk_gather_a8_avx
 _sk_gather_a8_avx LABEL PROC
@@ -7450,7 +7786,7 @@ _sk_store_a8_avx LABEL PROC
   DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
   DB  196,65,57,103,192                   ; vpackuswb     %xmm8,%xmm8,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           3431 <_sk_store_a8_avx+0x42>
+  DB  117,10                              ; jne           37ed <_sk_store_a8_avx+0x42>
   DB  196,65,123,17,4,57                  ; vmovsd        %xmm8,(%r9,%rdi,1)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -7458,10 +7794,10 @@ _sk_store_a8_avx LABEL PROC
   DB  65,128,224,7                        ; and           $0x7,%r8b
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            342d <_sk_store_a8_avx+0x3e>
+  DB  119,236                             ; ja            37e9 <_sk_store_a8_avx+0x3e>
   DB  196,66,121,48,192                   ; vpmovzxbw     %xmm8,%xmm8
   DB  65,15,182,192                       ; movzbl        %r8b,%eax
-  DB  76,141,5,67,0,0,0                   ; lea           0x43(%rip),%r8        # 3494 <_sk_store_a8_avx+0xa5>
+  DB  76,141,5,67,0,0,0                   ; lea           0x43(%rip),%r8        # 3850 <_sk_store_a8_avx+0xa5>
   DB  73,99,4,128                         ; movslq        (%r8,%rax,4),%rax
   DB  76,1,192                            ; add           %r8,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -7472,7 +7808,7 @@ _sk_store_a8_avx LABEL PROC
   DB  196,67,121,20,68,57,2,4             ; vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
   DB  196,67,121,20,68,57,1,2             ; vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
   DB  196,67,121,20,4,57,0                ; vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  DB  235,154                             ; jmp           342d <_sk_store_a8_avx+0x3e>
+  DB  235,154                             ; jmp           37e9 <_sk_store_a8_avx+0x3e>
   DB  144                                 ; nop
   DB  246,255                             ; idiv          %bh
   DB  255                                 ; (bad)
@@ -7504,7 +7840,7 @@ _sk_load_g8_avx LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  72,1,248                            ; add           %rdi,%rax
   DB  77,133,192                          ; test          %r8,%r8
-  DB  117,91                              ; jne           351b <_sk_load_g8_avx+0x6b>
+  DB  117,91                              ; jne           38d7 <_sk_load_g8_avx+0x6b>
   DB  197,250,126,0                       ; vmovq         (%rax),%xmm0
   DB  196,226,121,49,200                  ; vpmovzxbd     %xmm0,%xmm1
   DB  196,227,121,4,192,229               ; vpermilps     $0xe5,%xmm0,%xmm0
@@ -7534,9 +7870,9 @@ _sk_load_g8_avx LABEL PROC
   DB  77,9,217                            ; or            %r11,%r9
   DB  72,131,193,8                        ; add           $0x8,%rcx
   DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           3523 <_sk_load_g8_avx+0x73>
+  DB  117,234                             ; jne           38df <_sk_load_g8_avx+0x73>
   DB  196,193,249,110,193                 ; vmovq         %r9,%xmm0
-  DB  235,132                             ; jmp           34c4 <_sk_load_g8_avx+0x14>
+  DB  235,132                             ; jmp           3880 <_sk_load_g8_avx+0x14>
 
 PUBLIC _sk_gather_g8_avx
 _sk_gather_g8_avx LABEL PROC
@@ -7607,9 +7943,9 @@ _sk_gather_i8_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  73,137,192                          ; mov           %rax,%r8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  116,5                               ; je            365a <_sk_gather_i8_avx+0xf>
+  DB  116,5                               ; je            3a16 <_sk_gather_i8_avx+0xf>
   DB  76,137,192                          ; mov           %r8,%rax
-  DB  235,2                               ; jmp           365c <_sk_gather_i8_avx+0x11>
+  DB  235,2                               ; jmp           3a18 <_sk_gather_i8_avx+0x11>
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  65,87                               ; push          %r15
   DB  65,86                               ; push          %r14
@@ -7712,7 +8048,7 @@ _sk_load_565_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,209,0,0,0                    ; jne           38f6 <_sk_load_565_avx+0xdf>
+  DB  15,133,209,0,0,0                    ; jne           3cb2 <_sk_load_565_avx+0xdf>
   DB  196,193,122,111,4,122               ; vmovdqu       (%r10,%rdi,2),%xmm0
   DB  197,241,239,201                     ; vpxor         %xmm1,%xmm1,%xmm1
   DB  197,249,105,201                     ; vpunpckhwd    %xmm1,%xmm0,%xmm1
@@ -7762,9 +8098,9 @@ _sk_load_565_avx LABEL PROC
   DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,29,255,255,255               ; ja            382b <_sk_load_565_avx+0x14>
+  DB  15,135,29,255,255,255               ; ja            3be7 <_sk_load_565_avx+0x14>
   DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  76,141,13,75,0,0,0                  ; lea           0x4b(%rip),%r9        # 3964 <_sk_load_565_avx+0x14d>
+  DB  76,141,13,75,0,0,0                  ; lea           0x4b(%rip),%r9        # 3d20 <_sk_load_565_avx+0x14d>
   DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
   DB  76,1,200                            ; add           %r9,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -7776,7 +8112,7 @@ _sk_load_565_avx LABEL PROC
   DB  196,193,121,196,68,122,4,2          ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,68,122,2,1          ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,4,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  DB  233,201,254,255,255                 ; jmpq          382b <_sk_load_565_avx+0x14>
+  DB  233,201,254,255,255                 ; jmpq          3be7 <_sk_load_565_avx+0x14>
   DB  102,144                             ; xchg          %ax,%ax
   DB  242,255                             ; repnz         (bad)
   DB  255                                 ; (bad)
@@ -7929,7 +8265,7 @@ _sk_store_565_avx LABEL PROC
   DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
   DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           3baf <_sk_store_565_avx+0x9e>
+  DB  117,10                              ; jne           3f6b <_sk_store_565_avx+0x9e>
   DB  196,65,122,127,4,121                ; vmovdqu       %xmm8,(%r9,%rdi,2)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -7937,9 +8273,9 @@ _sk_store_565_avx LABEL PROC
   DB  65,128,224,7                        ; and           $0x7,%r8b
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            3bab <_sk_store_565_avx+0x9a>
+  DB  119,236                             ; ja            3f67 <_sk_store_565_avx+0x9a>
   DB  65,15,182,192                       ; movzbl        %r8b,%eax
-  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 3c0c <_sk_store_565_avx+0xfb>
+  DB  76,141,5,66,0,0,0                   ; lea           0x42(%rip),%r8        # 3fc8 <_sk_store_565_avx+0xfb>
   DB  73,99,4,128                         ; movslq        (%r8,%rax,4),%rax
   DB  76,1,192                            ; add           %r8,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -7950,7 +8286,7 @@ _sk_store_565_avx LABEL PROC
   DB  196,67,121,21,68,121,4,2            ; vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   DB  196,67,121,21,68,121,2,1            ; vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   DB  196,67,121,21,4,121,0               ; vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  DB  235,159                             ; jmp           3bab <_sk_store_565_avx+0x9a>
+  DB  235,159                             ; jmp           3f67 <_sk_store_565_avx+0x9a>
   DB  247,255                             ; idiv          %edi
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -7979,7 +8315,7 @@ _sk_load_4444_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,245,0,0,0                    ; jne           3d2b <_sk_load_4444_avx+0x103>
+  DB  15,133,245,0,0,0                    ; jne           40e7 <_sk_load_4444_avx+0x103>
   DB  196,193,122,111,4,122               ; vmovdqu       (%r10,%rdi,2),%xmm0
   DB  197,241,239,201                     ; vpxor         %xmm1,%xmm1,%xmm1
   DB  197,249,105,201                     ; vpunpckhwd    %xmm1,%xmm0,%xmm1
@@ -8036,9 +8372,9 @@ _sk_load_4444_avx LABEL PROC
   DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,249,254,255,255              ; ja            3c3c <_sk_load_4444_avx+0x14>
+  DB  15,135,249,254,255,255              ; ja            3ff8 <_sk_load_4444_avx+0x14>
   DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  76,141,13,74,0,0,0                  ; lea           0x4a(%rip),%r9        # 3d98 <_sk_load_4444_avx+0x170>
+  DB  76,141,13,74,0,0,0                  ; lea           0x4a(%rip),%r9        # 4154 <_sk_load_4444_avx+0x170>
   DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
   DB  76,1,200                            ; add           %r9,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -8050,12 +8386,12 @@ _sk_load_4444_avx LABEL PROC
   DB  196,193,121,196,68,122,4,2          ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,68,122,2,1          ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
   DB  196,193,121,196,4,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  DB  233,165,254,255,255                 ; jmpq          3c3c <_sk_load_4444_avx+0x14>
+  DB  233,165,254,255,255                 ; jmpq          3ff8 <_sk_load_4444_avx+0x14>
   DB  144                                 ; nop
   DB  243,255                             ; repz          (bad)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
-  DB  235,255                             ; jmp           3d9d <_sk_load_4444_avx+0x175>
+  DB  235,255                             ; jmp           4159 <_sk_load_4444_avx+0x175>
   DB  255                                 ; (bad)
   DB  255,227                             ; jmpq          *%rbx
   DB  255                                 ; (bad)
@@ -8212,7 +8548,7 @@ _sk_store_4444_avx LABEL PROC
   DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
   DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           4018 <_sk_store_4444_avx+0xaf>
+  DB  117,10                              ; jne           43d4 <_sk_store_4444_avx+0xaf>
   DB  196,65,122,127,4,121                ; vmovdqu       %xmm8,(%r9,%rdi,2)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -8220,9 +8556,9 @@ _sk_store_4444_avx LABEL PROC
   DB  65,128,224,7                        ; and           $0x7,%r8b
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            4014 <_sk_store_4444_avx+0xab>
+  DB  119,236                             ; ja            43d0 <_sk_store_4444_avx+0xab>
   DB  65,15,182,192                       ; movzbl        %r8b,%eax
-  DB  76,141,5,69,0,0,0                   ; lea           0x45(%rip),%r8        # 4078 <_sk_store_4444_avx+0x10f>
+  DB  76,141,5,69,0,0,0                   ; lea           0x45(%rip),%r8        # 4434 <_sk_store_4444_avx+0x10f>
   DB  73,99,4,128                         ; movslq        (%r8,%rax,4),%rax
   DB  76,1,192                            ; add           %r8,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -8233,7 +8569,7 @@ _sk_store_4444_avx LABEL PROC
   DB  196,67,121,21,68,121,4,2            ; vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
   DB  196,67,121,21,68,121,2,1            ; vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
   DB  196,67,121,21,4,121,0               ; vpextrw       $0x0,%xmm8,(%r9,%rdi,2)
-  DB  235,159                             ; jmp           4014 <_sk_store_4444_avx+0xab>
+  DB  235,159                             ; jmp           43d0 <_sk_store_4444_avx+0xab>
   DB  15,31,0                             ; nopl          (%rax)
   DB  244                                 ; hlt
   DB  255                                 ; (bad)
@@ -8264,7 +8600,7 @@ _sk_load_8888_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,157,0,0,0                    ; jne           413f <_sk_load_8888_avx+0xab>
+  DB  15,133,157,0,0,0                    ; jne           44fb <_sk_load_8888_avx+0xab>
   DB  196,65,124,16,12,186                ; vmovups       (%r10,%rdi,4),%ymm9
   DB  184,255,0,0,0                       ; mov           $0xff,%eax
   DB  197,249,110,192                     ; vmovd         %eax,%xmm0
@@ -8302,9 +8638,9 @@ _sk_load_8888_avx LABEL PROC
   DB  196,65,52,87,201                    ; vxorps        %ymm9,%ymm9,%ymm9
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,80,255,255,255               ; ja            40a8 <_sk_load_8888_avx+0x14>
+  DB  15,135,80,255,255,255               ; ja            4464 <_sk_load_8888_avx+0x14>
   DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  76,141,13,137,0,0,0                 ; lea           0x89(%rip),%r9        # 41ec <_sk_load_8888_avx+0x158>
+  DB  76,141,13,137,0,0,0                 ; lea           0x89(%rip),%r9        # 45a8 <_sk_load_8888_avx+0x158>
   DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
   DB  76,1,200                            ; add           %r9,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -8327,7 +8663,7 @@ _sk_load_8888_avx LABEL PROC
   DB  196,99,53,12,200,15                 ; vblendps      $0xf,%ymm0,%ymm9,%ymm9
   DB  196,195,49,34,4,186,0               ; vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
   DB  196,99,53,12,200,15                 ; vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  DB  233,188,254,255,255                 ; jmpq          40a8 <_sk_load_8888_avx+0x14>
+  DB  233,188,254,255,255                 ; jmpq          4464 <_sk_load_8888_avx+0x14>
   DB  238                                 ; out           %al,(%dx)
   DB  255                                 ; (bad)
   DB  255                                 ; (bad)
@@ -8453,7 +8789,7 @@ _sk_store_8888_avx LABEL PROC
   DB  196,65,45,86,192                    ; vorpd         %ymm8,%ymm10,%ymm8
   DB  196,65,53,86,192                    ; vorpd         %ymm8,%ymm9,%ymm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           43ed <_sk_store_8888_avx+0xa4>
+  DB  117,10                              ; jne           47a9 <_sk_store_8888_avx+0xa4>
   DB  196,65,124,17,4,185                 ; vmovups       %ymm8,(%r9,%rdi,4)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
@@ -8461,9 +8797,9 @@ _sk_store_8888_avx LABEL PROC
   DB  65,128,224,7                        ; and           $0x7,%r8b
   DB  65,254,200                          ; dec           %r8b
   DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            43e9 <_sk_store_8888_avx+0xa0>
+  DB  119,236                             ; ja            47a5 <_sk_store_8888_avx+0xa0>
   DB  65,15,182,192                       ; movzbl        %r8b,%eax
-  DB  76,141,5,84,0,0,0                   ; lea           0x54(%rip),%r8        # 445c <_sk_store_8888_avx+0x113>
+  DB  76,141,5,84,0,0,0                   ; lea           0x54(%rip),%r8        # 4818 <_sk_store_8888_avx+0x113>
   DB  73,99,4,128                         ; movslq        (%r8,%rax,4),%rax
   DB  76,1,192                            ; add           %r8,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -8477,7 +8813,7 @@ _sk_store_8888_avx LABEL PROC
   DB  196,67,121,22,68,185,8,2            ; vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
   DB  196,67,121,22,68,185,4,1            ; vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
   DB  196,65,121,126,4,185                ; vmovd         %xmm8,(%r9,%rdi,4)
-  DB  235,143                             ; jmp           43e9 <_sk_store_8888_avx+0xa0>
+  DB  235,143                             ; jmp           47a5 <_sk_store_8888_avx+0xa0>
   DB  102,144                             ; xchg          %ax,%ax
   DB  246,255                             ; idiv          %bh
   DB  255                                 ; (bad)
@@ -8511,7 +8847,7 @@ _sk_load_f16_avx LABEL PROC
   DB  197,252,17,124,36,64                ; vmovups       %ymm7,0x40(%rsp)
   DB  197,252,17,116,36,32                ; vmovups       %ymm6,0x20(%rsp)
   DB  197,252,17,44,36                    ; vmovups       %ymm5,(%rsp)
-  DB  15,133,49,2,0,0                     ; jne           46cc <_sk_load_f16_avx+0x254>
+  DB  15,133,49,2,0,0                     ; jne           4a88 <_sk_load_f16_avx+0x254>
   DB  197,121,16,4,248                    ; vmovupd       (%rax,%rdi,8),%xmm8
   DB  197,249,16,84,248,16                ; vmovupd       0x10(%rax,%rdi,8),%xmm2
   DB  197,249,16,76,248,32                ; vmovupd       0x20(%rax,%rdi,8),%xmm1
@@ -8629,29 +8965,29 @@ _sk_load_f16_avx LABEL PROC
   DB  197,123,16,4,248                    ; vmovsd        (%rax,%rdi,8),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,79                              ; je            472b <_sk_load_f16_avx+0x2b3>
+  DB  116,79                              ; je            4ae7 <_sk_load_f16_avx+0x2b3>
   DB  197,57,22,68,248,8                  ; vmovhpd       0x8(%rax,%rdi,8),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,67                              ; jb            472b <_sk_load_f16_avx+0x2b3>
+  DB  114,67                              ; jb            4ae7 <_sk_load_f16_avx+0x2b3>
   DB  197,251,16,84,248,16                ; vmovsd        0x10(%rax,%rdi,8),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,68                              ; je            4738 <_sk_load_f16_avx+0x2c0>
+  DB  116,68                              ; je            4af4 <_sk_load_f16_avx+0x2c0>
   DB  197,233,22,84,248,24                ; vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,56                              ; jb            4738 <_sk_load_f16_avx+0x2c0>
+  DB  114,56                              ; jb            4af4 <_sk_load_f16_avx+0x2c0>
   DB  197,251,16,76,248,32                ; vmovsd        0x20(%rax,%rdi,8),%xmm1
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,162,253,255,255              ; je            44b2 <_sk_load_f16_avx+0x3a>
+  DB  15,132,162,253,255,255              ; je            486e <_sk_load_f16_avx+0x3a>
   DB  197,241,22,76,248,40                ; vmovhpd       0x28(%rax,%rdi,8),%xmm1,%xmm1
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,146,253,255,255              ; jb            44b2 <_sk_load_f16_avx+0x3a>
+  DB  15,130,146,253,255,255              ; jb            486e <_sk_load_f16_avx+0x3a>
   DB  197,122,126,76,248,48               ; vmovq         0x30(%rax,%rdi,8),%xmm9
-  DB  233,135,253,255,255                 ; jmpq          44b2 <_sk_load_f16_avx+0x3a>
+  DB  233,135,253,255,255                 ; jmpq          486e <_sk_load_f16_avx+0x3a>
   DB  197,241,87,201                      ; vxorpd        %xmm1,%xmm1,%xmm1
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,122,253,255,255                 ; jmpq          44b2 <_sk_load_f16_avx+0x3a>
+  DB  233,122,253,255,255                 ; jmpq          486e <_sk_load_f16_avx+0x3a>
   DB  197,241,87,201                      ; vxorpd        %xmm1,%xmm1,%xmm1
-  DB  233,113,253,255,255                 ; jmpq          44b2 <_sk_load_f16_avx+0x3a>
+  DB  233,113,253,255,255                 ; jmpq          486e <_sk_load_f16_avx+0x3a>
 
 PUBLIC _sk_gather_f16_avx
 _sk_gather_f16_avx LABEL PROC
@@ -8924,7 +9260,7 @@ _sk_store_f16_avx LABEL PROC
   DB  197,113,98,202                      ; vpunpckldq    %xmm2,%xmm1,%xmm9
   DB  197,113,106,194                     ; vpunpckhdq    %xmm2,%xmm1,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,79                              ; jne           4ca2 <_sk_store_f16_avx+0x271>
+  DB  117,79                              ; jne           505e <_sk_store_f16_avx+0x271>
   DB  196,65,120,17,28,248                ; vmovups       %xmm11,(%r8,%rdi,8)
   DB  196,65,120,17,84,248,16             ; vmovups       %xmm10,0x10(%r8,%rdi,8)
   DB  196,65,120,17,76,248,32             ; vmovups       %xmm9,0x20(%r8,%rdi,8)
@@ -8940,22 +9276,22 @@ _sk_store_f16_avx LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,214,28,248               ; vmovq         %xmm11,(%r8,%rdi,8)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,192                             ; je            4c6e <_sk_store_f16_avx+0x23d>
+  DB  116,192                             ; je            502a <_sk_store_f16_avx+0x23d>
   DB  196,65,121,23,92,248,8              ; vmovhpd       %xmm11,0x8(%r8,%rdi,8)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,179                             ; jb            4c6e <_sk_store_f16_avx+0x23d>
+  DB  114,179                             ; jb            502a <_sk_store_f16_avx+0x23d>
   DB  196,65,121,214,84,248,16            ; vmovq         %xmm10,0x10(%r8,%rdi,8)
-  DB  116,170                             ; je            4c6e <_sk_store_f16_avx+0x23d>
+  DB  116,170                             ; je            502a <_sk_store_f16_avx+0x23d>
   DB  196,65,121,23,84,248,24             ; vmovhpd       %xmm10,0x18(%r8,%rdi,8)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,157                             ; jb            4c6e <_sk_store_f16_avx+0x23d>
+  DB  114,157                             ; jb            502a <_sk_store_f16_avx+0x23d>
   DB  196,65,121,214,76,248,32            ; vmovq         %xmm9,0x20(%r8,%rdi,8)
-  DB  116,148                             ; je            4c6e <_sk_store_f16_avx+0x23d>
+  DB  116,148                             ; je            502a <_sk_store_f16_avx+0x23d>
   DB  196,65,121,23,76,248,40             ; vmovhpd       %xmm9,0x28(%r8,%rdi,8)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,135                             ; jb            4c6e <_sk_store_f16_avx+0x23d>
+  DB  114,135                             ; jb            502a <_sk_store_f16_avx+0x23d>
   DB  196,65,121,214,68,248,48            ; vmovq         %xmm8,0x30(%r8,%rdi,8)
-  DB  233,123,255,255,255                 ; jmpq          4c6e <_sk_store_f16_avx+0x23d>
+  DB  233,123,255,255,255                 ; jmpq          502a <_sk_store_f16_avx+0x23d>
 
 PUBLIC _sk_load_u16_be_avx
 _sk_load_u16_be_avx LABEL PROC
@@ -8963,7 +9299,7 @@ _sk_load_u16_be_avx LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  72,141,4,189,0,0,0,0                ; lea           0x0(,%rdi,4),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,5,1,0,0                      ; jne           4e0e <_sk_load_u16_be_avx+0x11b>
+  DB  15,133,5,1,0,0                      ; jne           51ca <_sk_load_u16_be_avx+0x11b>
   DB  196,65,121,16,4,64                  ; vmovupd       (%r8,%rax,2),%xmm8
   DB  196,193,121,16,84,64,16             ; vmovupd       0x10(%r8,%rax,2),%xmm2
   DB  196,193,121,16,92,64,32             ; vmovupd       0x20(%r8,%rax,2),%xmm3
@@ -9022,29 +9358,29 @@ _sk_load_u16_be_avx LABEL PROC
   DB  196,65,123,16,4,64                  ; vmovsd        (%r8,%rax,2),%xmm8
   DB  196,65,49,239,201                   ; vpxor         %xmm9,%xmm9,%xmm9
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,85                              ; je            4e74 <_sk_load_u16_be_avx+0x181>
+  DB  116,85                              ; je            5230 <_sk_load_u16_be_avx+0x181>
   DB  196,65,57,22,68,64,8                ; vmovhpd       0x8(%r8,%rax,2),%xmm8,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,72                              ; jb            4e74 <_sk_load_u16_be_avx+0x181>
+  DB  114,72                              ; jb            5230 <_sk_load_u16_be_avx+0x181>
   DB  196,193,123,16,84,64,16             ; vmovsd        0x10(%r8,%rax,2),%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  116,72                              ; je            4e81 <_sk_load_u16_be_avx+0x18e>
+  DB  116,72                              ; je            523d <_sk_load_u16_be_avx+0x18e>
   DB  196,193,105,22,84,64,24             ; vmovhpd       0x18(%r8,%rax,2),%xmm2,%xmm2
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,59                              ; jb            4e81 <_sk_load_u16_be_avx+0x18e>
+  DB  114,59                              ; jb            523d <_sk_load_u16_be_avx+0x18e>
   DB  196,193,123,16,92,64,32             ; vmovsd        0x20(%r8,%rax,2),%xmm3
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  15,132,205,254,255,255              ; je            4d24 <_sk_load_u16_be_avx+0x31>
+  DB  15,132,205,254,255,255              ; je            50e0 <_sk_load_u16_be_avx+0x31>
   DB  196,193,97,22,92,64,40              ; vmovhpd       0x28(%r8,%rax,2),%xmm3,%xmm3
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,188,254,255,255              ; jb            4d24 <_sk_load_u16_be_avx+0x31>
+  DB  15,130,188,254,255,255              ; jb            50e0 <_sk_load_u16_be_avx+0x31>
   DB  196,65,122,126,76,64,48             ; vmovq         0x30(%r8,%rax,2),%xmm9
-  DB  233,176,254,255,255                 ; jmpq          4d24 <_sk_load_u16_be_avx+0x31>
+  DB  233,176,254,255,255                 ; jmpq          50e0 <_sk_load_u16_be_avx+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
   DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,163,254,255,255                 ; jmpq          4d24 <_sk_load_u16_be_avx+0x31>
+  DB  233,163,254,255,255                 ; jmpq          50e0 <_sk_load_u16_be_avx+0x31>
   DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,154,254,255,255                 ; jmpq          4d24 <_sk_load_u16_be_avx+0x31>
+  DB  233,154,254,255,255                 ; jmpq          50e0 <_sk_load_u16_be_avx+0x31>
 
 PUBLIC _sk_load_rgb_u16_be_avx
 _sk_load_rgb_u16_be_avx LABEL PROC
@@ -9052,7 +9388,7 @@ _sk_load_rgb_u16_be_avx LABEL PROC
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  72,141,4,127                        ; lea           (%rdi,%rdi,2),%rax
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,8,1,0,0                      ; jne           4fa4 <_sk_load_rgb_u16_be_avx+0x11a>
+  DB  15,133,8,1,0,0                      ; jne           5360 <_sk_load_rgb_u16_be_avx+0x11a>
   DB  196,193,122,111,4,64                ; vmovdqu       (%r8,%rax,2),%xmm0
   DB  196,193,122,111,84,64,12            ; vmovdqu       0xc(%r8,%rax,2),%xmm2
   DB  196,193,122,111,76,64,24            ; vmovdqu       0x18(%r8,%rax,2),%xmm1
@@ -9111,36 +9447,36 @@ _sk_load_rgb_u16_be_avx LABEL PROC
   DB  196,193,121,110,4,64                ; vmovd         (%r8,%rax,2),%xmm0
   DB  196,193,121,196,68,64,4,2           ; vpinsrw       $0x2,0x4(%r8,%rax,2),%xmm0,%xmm0
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  117,5                               ; jne           4fbd <_sk_load_rgb_u16_be_avx+0x133>
-  DB  233,19,255,255,255                  ; jmpq          4ed0 <_sk_load_rgb_u16_be_avx+0x46>
+  DB  117,5                               ; jne           5379 <_sk_load_rgb_u16_be_avx+0x133>
+  DB  233,19,255,255,255                  ; jmpq          528c <_sk_load_rgb_u16_be_avx+0x46>
   DB  196,193,121,110,76,64,6             ; vmovd         0x6(%r8,%rax,2),%xmm1
   DB  196,65,113,196,68,64,10,2           ; vpinsrw       $0x2,0xa(%r8,%rax,2),%xmm1,%xmm8
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,26                              ; jb            4fec <_sk_load_rgb_u16_be_avx+0x162>
+  DB  114,26                              ; jb            53a8 <_sk_load_rgb_u16_be_avx+0x162>
   DB  196,193,121,110,76,64,12            ; vmovd         0xc(%r8,%rax,2),%xmm1
   DB  196,193,113,196,84,64,16,2          ; vpinsrw       $0x2,0x10(%r8,%rax,2),%xmm1,%xmm2
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  117,10                              ; jne           4ff1 <_sk_load_rgb_u16_be_avx+0x167>
-  DB  233,228,254,255,255                 ; jmpq          4ed0 <_sk_load_rgb_u16_be_avx+0x46>
-  DB  233,223,254,255,255                 ; jmpq          4ed0 <_sk_load_rgb_u16_be_avx+0x46>
+  DB  117,10                              ; jne           53ad <_sk_load_rgb_u16_be_avx+0x167>
+  DB  233,228,254,255,255                 ; jmpq          528c <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,223,254,255,255                 ; jmpq          528c <_sk_load_rgb_u16_be_avx+0x46>
   DB  196,193,121,110,76,64,18            ; vmovd         0x12(%r8,%rax,2),%xmm1
   DB  196,65,113,196,76,64,22,2           ; vpinsrw       $0x2,0x16(%r8,%rax,2),%xmm1,%xmm9
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,26                              ; jb            5020 <_sk_load_rgb_u16_be_avx+0x196>
+  DB  114,26                              ; jb            53dc <_sk_load_rgb_u16_be_avx+0x196>
   DB  196,193,121,110,76,64,24            ; vmovd         0x18(%r8,%rax,2),%xmm1
   DB  196,193,113,196,76,64,28,2          ; vpinsrw       $0x2,0x1c(%r8,%rax,2),%xmm1,%xmm1
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  117,10                              ; jne           5025 <_sk_load_rgb_u16_be_avx+0x19b>
-  DB  233,176,254,255,255                 ; jmpq          4ed0 <_sk_load_rgb_u16_be_avx+0x46>
-  DB  233,171,254,255,255                 ; jmpq          4ed0 <_sk_load_rgb_u16_be_avx+0x46>
+  DB  117,10                              ; jne           53e1 <_sk_load_rgb_u16_be_avx+0x19b>
+  DB  233,176,254,255,255                 ; jmpq          528c <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,171,254,255,255                 ; jmpq          528c <_sk_load_rgb_u16_be_avx+0x46>
   DB  196,193,121,110,92,64,30            ; vmovd         0x1e(%r8,%rax,2),%xmm3
   DB  196,65,97,196,92,64,34,2            ; vpinsrw       $0x2,0x22(%r8,%rax,2),%xmm3,%xmm11
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,20                              ; jb            504e <_sk_load_rgb_u16_be_avx+0x1c4>
+  DB  114,20                              ; jb            540a <_sk_load_rgb_u16_be_avx+0x1c4>
   DB  196,193,121,110,92,64,36            ; vmovd         0x24(%r8,%rax,2),%xmm3
   DB  196,193,97,196,92,64,40,2           ; vpinsrw       $0x2,0x28(%r8,%rax,2),%xmm3,%xmm3
-  DB  233,130,254,255,255                 ; jmpq          4ed0 <_sk_load_rgb_u16_be_avx+0x46>
-  DB  233,125,254,255,255                 ; jmpq          4ed0 <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,130,254,255,255                 ; jmpq          528c <_sk_load_rgb_u16_be_avx+0x46>
+  DB  233,125,254,255,255                 ; jmpq          528c <_sk_load_rgb_u16_be_avx+0x46>
 
 PUBLIC _sk_store_u16_be_avx
 _sk_store_u16_be_avx LABEL PROC
@@ -9188,7 +9524,7 @@ _sk_store_u16_be_avx LABEL PROC
   DB  196,65,17,98,200                    ; vpunpckldq    %xmm8,%xmm13,%xmm9
   DB  196,65,17,106,192                   ; vpunpckhdq    %xmm8,%xmm13,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,31                              ; jne           5155 <_sk_store_u16_be_avx+0x102>
+  DB  117,31                              ; jne           5511 <_sk_store_u16_be_avx+0x102>
   DB  196,1,120,17,28,72                  ; vmovups       %xmm11,(%r8,%r9,2)
   DB  196,1,120,17,84,72,16               ; vmovups       %xmm10,0x10(%r8,%r9,2)
   DB  196,1,120,17,76,72,32               ; vmovups       %xmm9,0x20(%r8,%r9,2)
@@ -9197,31 +9533,31 @@ _sk_store_u16_be_avx LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  196,1,121,214,28,72                 ; vmovq         %xmm11,(%r8,%r9,2)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            5151 <_sk_store_u16_be_avx+0xfe>
+  DB  116,240                             ; je            550d <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,23,92,72,8                ; vmovhpd       %xmm11,0x8(%r8,%r9,2)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            5151 <_sk_store_u16_be_avx+0xfe>
+  DB  114,227                             ; jb            550d <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,214,84,72,16              ; vmovq         %xmm10,0x10(%r8,%r9,2)
-  DB  116,218                             ; je            5151 <_sk_store_u16_be_avx+0xfe>
+  DB  116,218                             ; je            550d <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,23,84,72,24               ; vmovhpd       %xmm10,0x18(%r8,%r9,2)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            5151 <_sk_store_u16_be_avx+0xfe>
+  DB  114,205                             ; jb            550d <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,214,76,72,32              ; vmovq         %xmm9,0x20(%r8,%r9,2)
-  DB  116,196                             ; je            5151 <_sk_store_u16_be_avx+0xfe>
+  DB  116,196                             ; je            550d <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,23,76,72,40               ; vmovhpd       %xmm9,0x28(%r8,%r9,2)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,183                             ; jb            5151 <_sk_store_u16_be_avx+0xfe>
+  DB  114,183                             ; jb            550d <_sk_store_u16_be_avx+0xfe>
   DB  196,1,121,214,68,72,48              ; vmovq         %xmm8,0x30(%r8,%r9,2)
-  DB  235,174                             ; jmp           5151 <_sk_store_u16_be_avx+0xfe>
+  DB  235,174                             ; jmp           550d <_sk_store_u16_be_avx+0xfe>
 
 PUBLIC _sk_load_f32_avx
 _sk_load_f32_avx LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  119,110                             ; ja            5219 <_sk_load_f32_avx+0x76>
+  DB  119,110                             ; ja            55d5 <_sk_load_f32_avx+0x76>
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
-  DB  76,141,21,135,0,0,0                 ; lea           0x87(%rip),%r10        # 5244 <_sk_load_f32_avx+0xa1>
+  DB  76,141,21,135,0,0,0                 ; lea           0x87(%rip),%r10        # 5600 <_sk_load_f32_avx+0xa1>
   DB  73,99,4,138                         ; movslq        (%r10,%rcx,4),%rax
   DB  76,1,208                            ; add           %r10,%rax
   DB  255,224                             ; jmpq          *%rax
@@ -9280,7 +9616,7 @@ _sk_store_f32_avx LABEL PROC
   DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
   DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           52d1 <_sk_store_f32_avx+0x6d>
+  DB  117,55                              ; jne           568d <_sk_store_f32_avx+0x6d>
   DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -9293,22 +9629,22 @@ _sk_store_f32_avx LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            52cd <_sk_store_f32_avx+0x69>
+  DB  116,240                             ; je            5689 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            52cd <_sk_store_f32_avx+0x69>
+  DB  114,227                             ; jb            5689 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            52cd <_sk_store_f32_avx+0x69>
+  DB  116,218                             ; je            5689 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            52cd <_sk_store_f32_avx+0x69>
+  DB  114,205                             ; jb            5689 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            52cd <_sk_store_f32_avx+0x69>
+  DB  116,195                             ; je            5689 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            52cd <_sk_store_f32_avx+0x69>
+  DB  114,181                             ; jb            5689 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           52cd <_sk_store_f32_avx+0x69>
+  DB  235,171                             ; jmp           5689 <_sk_store_f32_avx+0x69>
 
 PUBLIC _sk_clamp_x_avx
 _sk_clamp_x_avx LABEL PROC
@@ -9612,7 +9948,7 @@ _sk_linear_gradient_avx LABEL PROC
   DB  196,226,125,24,88,28                ; vbroadcastss  0x1c(%rax),%ymm3
   DB  76,139,0                            ; mov           (%rax),%r8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  15,132,146,0,0,0                    ; je            5885 <_sk_linear_gradient_avx+0xb8>
+  DB  15,132,146,0,0,0                    ; je            5c41 <_sk_linear_gradient_avx+0xb8>
   DB  72,139,64,8                         ; mov           0x8(%rax),%rax
   DB  72,131,192,32                       ; add           $0x20,%rax
   DB  196,65,28,87,228                    ; vxorps        %ymm12,%ymm12,%ymm12
@@ -9639,8 +9975,8 @@ _sk_linear_gradient_avx LABEL PROC
   DB  196,227,13,74,219,208               ; vblendvps     %ymm13,%ymm3,%ymm14,%ymm3
   DB  72,131,192,36                       ; add           $0x24,%rax
   DB  73,255,200                          ; dec           %r8
-  DB  117,140                             ; jne           580f <_sk_linear_gradient_avx+0x42>
-  DB  235,20                              ; jmp           5899 <_sk_linear_gradient_avx+0xcc>
+  DB  117,140                             ; jne           5bcb <_sk_linear_gradient_avx+0x42>
+  DB  235,20                              ; jmp           5c55 <_sk_linear_gradient_avx+0xcc>
   DB  196,65,36,87,219                    ; vxorps        %ymm11,%ymm11,%ymm11
   DB  196,65,44,87,210                    ; vxorps        %ymm10,%ymm10,%ymm10
   DB  196,65,52,87,201                    ; vxorps        %ymm9,%ymm9,%ymm9
@@ -11327,89 +11663,276 @@ _sk_to_srgb_sse41 LABEL PROC
 
 PUBLIC _sk_from_2dot2_sse41
 _sk_from_2dot2_sse41 LABEL PROC
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  65,15,82,192                        ; rsqrtps       %xmm8,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  68,15,82,200                        ; rsqrtps       %xmm0,%xmm9
-  DB  65,15,82,193                        ; rsqrtps       %xmm9,%xmm0
-  DB  68,15,82,208                        ; rsqrtps       %xmm0,%xmm10
-  DB  69,15,89,192                        ; mulps         %xmm8,%xmm8
-  DB  65,15,40,193                        ; movaps        %xmm9,%xmm0
-  DB  15,89,192                           ; mulps         %xmm0,%xmm0
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,89,194                        ; mulps         %xmm10,%xmm0
-  DB  69,15,87,210                        ; xorps         %xmm10,%xmm10
-  DB  65,15,95,194                        ; maxps         %xmm10,%xmm0
-  DB  68,15,82,193                        ; rsqrtps       %xmm1,%xmm8
-  DB  69,15,82,192                        ; rsqrtps       %xmm8,%xmm8
-  DB  69,15,82,192                        ; rsqrtps       %xmm8,%xmm8
-  DB  69,15,82,200                        ; rsqrtps       %xmm8,%xmm9
-  DB  69,15,82,193                        ; rsqrtps       %xmm9,%xmm8
-  DB  69,15,82,216                        ; rsqrtps       %xmm8,%xmm11
-  DB  15,89,201                           ; mulps         %xmm1,%xmm1
-  DB  69,15,40,193                        ; movaps        %xmm9,%xmm8
-  DB  69,15,89,192                        ; mulps         %xmm8,%xmm8
-  DB  69,15,89,193                        ; mulps         %xmm9,%xmm8
-  DB  68,15,89,193                        ; mulps         %xmm1,%xmm8
-  DB  69,15,89,195                        ; mulps         %xmm11,%xmm8
-  DB  69,15,95,194                        ; maxps         %xmm10,%xmm8
-  DB  15,82,202                           ; rsqrtps       %xmm2,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  68,15,82,217                        ; rsqrtps       %xmm1,%xmm11
-  DB  65,15,82,203                        ; rsqrtps       %xmm11,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,89,210                           ; mulps         %xmm2,%xmm2
-  DB  69,15,40,203                        ; movaps        %xmm11,%xmm9
-  DB  69,15,89,201                        ; mulps         %xmm9,%xmm9
-  DB  69,15,89,203                        ; mulps         %xmm11,%xmm9
-  DB  68,15,89,202                        ; mulps         %xmm2,%xmm9
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  69,15,95,202                        ; maxps         %xmm10,%xmm9
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,200                        ; movaps        %xmm8,%xmm1
-  DB  65,15,40,209                        ; movaps        %xmm9,%xmm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_to_2dot2_sse41
-_sk_to_2dot2_sse41 LABEL PROC
-  DB  68,15,82,192                        ; rsqrtps       %xmm0,%xmm8
-  DB  65,15,82,192                        ; rsqrtps       %xmm8,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  68,15,82,200                        ; rsqrtps       %xmm0,%xmm9
-  DB  69,15,83,192                        ; rcpps         %xmm8,%xmm8
-  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
-  DB  65,15,83,193                        ; rcpps         %xmm9,%xmm0
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  65,15,95,192                        ; maxps         %xmm8,%xmm0
-  DB  68,15,82,201                        ; rsqrtps       %xmm1,%xmm9
-  DB  65,15,82,201                        ; rsqrtps       %xmm9,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  68,15,82,209                        ; rsqrtps       %xmm1,%xmm10
-  DB  69,15,83,201                        ; rcpps         %xmm9,%xmm9
+  DB  72,131,236,120                      ; sub           $0x78,%rsp
+  DB  15,41,124,36,96                     ; movaps        %xmm7,0x60(%rsp)
+  DB  15,41,116,36,80                     ; movaps        %xmm6,0x50(%rsp)
+  DB  15,41,108,36,64                     ; movaps        %xmm5,0x40(%rsp)
+  DB  15,41,100,36,48                     ; movaps        %xmm4,0x30(%rsp)
+  DB  15,41,92,36,32                      ; movaps        %xmm3,0x20(%rsp)
+  DB  15,41,84,36,16                      ; movaps        %xmm2,0x10(%rsp)
+  DB  15,40,209                           ; movaps        %xmm1,%xmm2
+  DB  184,205,204,12,64                   ; mov           $0x400ccccd,%eax
+  DB  15,91,216                           ; cvtdq2ps      %xmm0,%xmm3
+  DB  185,0,0,0,52                        ; mov           $0x34000000,%ecx
+  DB  102,68,15,110,209                   ; movd          %ecx,%xmm10
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
+  DB  65,15,89,218                        ; mulps         %xmm10,%xmm3
+  DB  185,255,255,127,0                   ; mov           $0x7fffff,%ecx
+  DB  102,15,110,201                      ; movd          %ecx,%xmm1
+  DB  102,68,15,112,193,0                 ; pshufd        $0x0,%xmm1,%xmm8
+  DB  65,15,84,192                        ; andps         %xmm8,%xmm0
+  DB  185,0,0,0,63                        ; mov           $0x3f000000,%ecx
+  DB  102,15,110,201                      ; movd          %ecx,%xmm1
+  DB  102,15,112,201,0                    ; pshufd        $0x0,%xmm1,%xmm1
+  DB  15,86,193                           ; orps          %xmm1,%xmm0
+  DB  15,40,241                           ; movaps        %xmm1,%xmm6
+  DB  15,41,52,36                         ; movaps        %xmm6,(%rsp)
+  DB  185,119,115,248,66                  ; mov           $0x42f87377,%ecx
+  DB  102,68,15,110,217                   ; movd          %ecx,%xmm11
+  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
+  DB  65,15,92,219                        ; subps         %xmm11,%xmm3
+  DB  185,117,191,191,63                  ; mov           $0x3fbfbf75,%ecx
+  DB  102,68,15,110,225                   ; movd          %ecx,%xmm12
+  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
+  DB  15,40,200                           ; movaps        %xmm0,%xmm1
+  DB  65,15,89,204                        ; mulps         %xmm12,%xmm1
+  DB  15,92,217                           ; subps         %xmm1,%xmm3
+  DB  185,163,233,220,63                  ; mov           $0x3fdce9a3,%ecx
+  DB  102,68,15,110,233                   ; movd          %ecx,%xmm13
+  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
+  DB  185,249,68,180,62                   ; mov           $0x3eb444f9,%ecx
+  DB  102,68,15,110,241                   ; movd          %ecx,%xmm14
+  DB  69,15,198,246,0                     ; shufps        $0x0,%xmm14,%xmm14
+  DB  65,15,88,198                        ; addps         %xmm14,%xmm0
+  DB  65,15,40,205                        ; movaps        %xmm13,%xmm1
+  DB  15,94,200                           ; divps         %xmm0,%xmm1
+  DB  15,92,217                           ; subps         %xmm1,%xmm3
+  DB  102,68,15,110,248                   ; movd          %eax,%xmm15
+  DB  69,15,198,255,0                     ; shufps        $0x0,%xmm15,%xmm15
+  DB  65,15,89,223                        ; mulps         %xmm15,%xmm3
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  185,81,140,242,66                   ; mov           $0x42f28c51,%ecx
+  DB  102,15,110,225                      ; movd          %ecx,%xmm4
+  DB  15,198,228,0                        ; shufps        $0x0,%xmm4,%xmm4
+  DB  15,40,204                           ; movaps        %xmm4,%xmm1
+  DB  15,88,203                           ; addps         %xmm3,%xmm1
+  DB  102,15,58,8,195,1                   ; roundps       $0x1,%xmm3,%xmm0
+  DB  15,92,216                           ; subps         %xmm0,%xmm3
+  DB  185,141,188,190,63                  ; mov           $0x3fbebc8d,%ecx
+  DB  102,68,15,110,201                   ; movd          %ecx,%xmm9
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
+  DB  65,15,40,193                        ; movaps        %xmm9,%xmm0
+  DB  15,89,195                           ; mulps         %xmm3,%xmm0
+  DB  15,92,200                           ; subps         %xmm0,%xmm1
+  DB  185,254,210,221,65                  ; mov           $0x41ddd2fe,%ecx
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  102,15,110,248                      ; movd          %eax,%xmm7
+  DB  15,198,255,0                        ; shufps        $0x0,%xmm7,%xmm7
+  DB  15,40,239                           ; movaps        %xmm7,%xmm5
+  DB  15,92,235                           ; subps         %xmm3,%xmm5
+  DB  102,15,110,193                      ; movd          %ecx,%xmm0
+  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,94,221                           ; divps         %xmm5,%xmm3
+  DB  15,88,217                           ; addps         %xmm1,%xmm3
+  DB  15,91,202                           ; cvtdq2ps      %xmm2,%xmm1
+  DB  65,15,89,202                        ; mulps         %xmm10,%xmm1
+  DB  65,15,84,208                        ; andps         %xmm8,%xmm2
+  DB  15,86,214                           ; orps          %xmm6,%xmm2
+  DB  65,15,92,203                        ; subps         %xmm11,%xmm1
+  DB  15,40,234                           ; movaps        %xmm2,%xmm5
+  DB  65,15,89,236                        ; mulps         %xmm12,%xmm5
+  DB  15,92,205                           ; subps         %xmm5,%xmm1
+  DB  65,15,88,214                        ; addps         %xmm14,%xmm2
+  DB  65,15,40,237                        ; movaps        %xmm13,%xmm5
+  DB  15,94,234                           ; divps         %xmm2,%xmm5
+  DB  15,92,205                           ; subps         %xmm5,%xmm1
+  DB  65,15,89,207                        ; mulps         %xmm15,%xmm1
+  DB  15,40,236                           ; movaps        %xmm4,%xmm5
+  DB  15,88,233                           ; addps         %xmm1,%xmm5
+  DB  102,15,58,8,209,1                   ; roundps       $0x1,%xmm1,%xmm2
+  DB  15,92,202                           ; subps         %xmm2,%xmm1
+  DB  65,15,40,209                        ; movaps        %xmm9,%xmm2
+  DB  15,89,209                           ; mulps         %xmm1,%xmm2
+  DB  15,92,234                           ; subps         %xmm2,%xmm5
+  DB  15,40,247                           ; movaps        %xmm7,%xmm6
+  DB  15,92,241                           ; subps         %xmm1,%xmm6
+  DB  15,40,208                           ; movaps        %xmm0,%xmm2
+  DB  15,94,214                           ; divps         %xmm6,%xmm2
+  DB  15,88,213                           ; addps         %xmm5,%xmm2
+  DB  15,40,108,36,16                     ; movaps        0x10(%rsp),%xmm5
+  DB  15,91,205                           ; cvtdq2ps      %xmm5,%xmm1
+  DB  65,15,89,202                        ; mulps         %xmm10,%xmm1
+  DB  68,15,84,197                        ; andps         %xmm5,%xmm8
+  DB  68,15,86,4,36                       ; orps          (%rsp),%xmm8
+  DB  65,15,92,203                        ; subps         %xmm11,%xmm1
+  DB  69,15,89,224                        ; mulps         %xmm8,%xmm12
+  DB  65,15,92,204                        ; subps         %xmm12,%xmm1
+  DB  69,15,88,198                        ; addps         %xmm14,%xmm8
+  DB  69,15,94,232                        ; divps         %xmm8,%xmm13
+  DB  65,15,92,205                        ; subps         %xmm13,%xmm1
+  DB  65,15,89,207                        ; mulps         %xmm15,%xmm1
+  DB  102,15,58,8,233,1                   ; roundps       $0x1,%xmm1,%xmm5
+  DB  15,88,225                           ; addps         %xmm1,%xmm4
+  DB  15,92,205                           ; subps         %xmm5,%xmm1
   DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  65,15,83,202                        ; rcpps         %xmm10,%xmm1
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  65,15,95,200                        ; maxps         %xmm8,%xmm1
-  DB  68,15,82,202                        ; rsqrtps       %xmm2,%xmm9
-  DB  65,15,82,209                        ; rsqrtps       %xmm9,%xmm2
-  DB  15,82,210                           ; rsqrtps       %xmm2,%xmm2
-  DB  15,82,210                           ; rsqrtps       %xmm2,%xmm2
-  DB  15,82,210                           ; rsqrtps       %xmm2,%xmm2
-  DB  68,15,82,210                        ; rsqrtps       %xmm2,%xmm10
-  DB  69,15,83,201                        ; rcpps         %xmm9,%xmm9
-  DB  68,15,89,202                        ; mulps         %xmm2,%xmm9
-  DB  65,15,83,210                        ; rcpps         %xmm10,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  65,15,95,208                        ; maxps         %xmm8,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  65,15,92,225                        ; subps         %xmm9,%xmm4
+  DB  15,92,249                           ; subps         %xmm1,%xmm7
+  DB  15,94,199                           ; divps         %xmm7,%xmm0
+  DB  15,88,196                           ; addps         %xmm4,%xmm0
+  DB  102,65,15,110,200                   ; movd          %r8d,%xmm1
+  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
+  DB  15,89,217                           ; mulps         %xmm1,%xmm3
+  DB  15,89,209                           ; mulps         %xmm1,%xmm2
+  DB  15,89,193                           ; mulps         %xmm1,%xmm0
+  DB  102,15,91,219                       ; cvtps2dq      %xmm3,%xmm3
+  DB  102,15,91,202                       ; cvtps2dq      %xmm2,%xmm1
+  DB  102,15,91,208                       ; cvtps2dq      %xmm0,%xmm2
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  102,15,40,195                       ; movapd        %xmm3,%xmm0
+  DB  15,40,92,36,32                      ; movaps        0x20(%rsp),%xmm3
+  DB  15,40,100,36,48                     ; movaps        0x30(%rsp),%xmm4
+  DB  15,40,108,36,64                     ; movaps        0x40(%rsp),%xmm5
+  DB  15,40,116,36,80                     ; movaps        0x50(%rsp),%xmm6
+  DB  15,40,124,36,96                     ; movaps        0x60(%rsp),%xmm7
+  DB  72,131,196,120                      ; add           $0x78,%rsp
+  DB  255,224                             ; jmpq          *%rax
+
+PUBLIC _sk_to_2dot2_sse41
+_sk_to_2dot2_sse41 LABEL PROC
+  DB  72,131,236,120                      ; sub           $0x78,%rsp
+  DB  15,41,124,36,96                     ; movaps        %xmm7,0x60(%rsp)
+  DB  15,41,116,36,80                     ; movaps        %xmm6,0x50(%rsp)
+  DB  15,41,108,36,64                     ; movaps        %xmm5,0x40(%rsp)
+  DB  15,41,100,36,48                     ; movaps        %xmm4,0x30(%rsp)
+  DB  15,41,92,36,32                      ; movaps        %xmm3,0x20(%rsp)
+  DB  15,41,84,36,16                      ; movaps        %xmm2,0x10(%rsp)
+  DB  15,40,209                           ; movaps        %xmm1,%xmm2
+  DB  184,46,186,232,62                   ; mov           $0x3ee8ba2e,%eax
+  DB  15,91,216                           ; cvtdq2ps      %xmm0,%xmm3
+  DB  185,0,0,0,52                        ; mov           $0x34000000,%ecx
+  DB  102,68,15,110,209                   ; movd          %ecx,%xmm10
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
+  DB  65,15,89,218                        ; mulps         %xmm10,%xmm3
+  DB  185,255,255,127,0                   ; mov           $0x7fffff,%ecx
+  DB  102,15,110,201                      ; movd          %ecx,%xmm1
+  DB  102,68,15,112,193,0                 ; pshufd        $0x0,%xmm1,%xmm8
+  DB  65,15,84,192                        ; andps         %xmm8,%xmm0
+  DB  185,0,0,0,63                        ; mov           $0x3f000000,%ecx
+  DB  102,15,110,201                      ; movd          %ecx,%xmm1
+  DB  102,15,112,201,0                    ; pshufd        $0x0,%xmm1,%xmm1
+  DB  15,86,193                           ; orps          %xmm1,%xmm0
+  DB  15,40,241                           ; movaps        %xmm1,%xmm6
+  DB  15,41,52,36                         ; movaps        %xmm6,(%rsp)
+  DB  185,119,115,248,66                  ; mov           $0x42f87377,%ecx
+  DB  102,68,15,110,217                   ; movd          %ecx,%xmm11
+  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
+  DB  65,15,92,219                        ; subps         %xmm11,%xmm3
+  DB  185,117,191,191,63                  ; mov           $0x3fbfbf75,%ecx
+  DB  102,68,15,110,225                   ; movd          %ecx,%xmm12
+  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
+  DB  15,40,200                           ; movaps        %xmm0,%xmm1
+  DB  65,15,89,204                        ; mulps         %xmm12,%xmm1
+  DB  15,92,217                           ; subps         %xmm1,%xmm3
+  DB  185,163,233,220,63                  ; mov           $0x3fdce9a3,%ecx
+  DB  102,68,15,110,233                   ; movd          %ecx,%xmm13
+  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
+  DB  185,249,68,180,62                   ; mov           $0x3eb444f9,%ecx
+  DB  102,68,15,110,241                   ; movd          %ecx,%xmm14
+  DB  69,15,198,246,0                     ; shufps        $0x0,%xmm14,%xmm14
+  DB  65,15,88,198                        ; addps         %xmm14,%xmm0
+  DB  65,15,40,205                        ; movaps        %xmm13,%xmm1
+  DB  15,94,200                           ; divps         %xmm0,%xmm1
+  DB  15,92,217                           ; subps         %xmm1,%xmm3
+  DB  102,68,15,110,248                   ; movd          %eax,%xmm15
+  DB  69,15,198,255,0                     ; shufps        $0x0,%xmm15,%xmm15
+  DB  65,15,89,223                        ; mulps         %xmm15,%xmm3
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  185,81,140,242,66                   ; mov           $0x42f28c51,%ecx
+  DB  102,15,110,225                      ; movd          %ecx,%xmm4
+  DB  15,198,228,0                        ; shufps        $0x0,%xmm4,%xmm4
+  DB  15,40,204                           ; movaps        %xmm4,%xmm1
+  DB  15,88,203                           ; addps         %xmm3,%xmm1
+  DB  102,15,58,8,195,1                   ; roundps       $0x1,%xmm3,%xmm0
+  DB  15,92,216                           ; subps         %xmm0,%xmm3
+  DB  185,141,188,190,63                  ; mov           $0x3fbebc8d,%ecx
+  DB  102,68,15,110,201                   ; movd          %ecx,%xmm9
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
+  DB  65,15,40,193                        ; movaps        %xmm9,%xmm0
+  DB  15,89,195                           ; mulps         %xmm3,%xmm0
+  DB  15,92,200                           ; subps         %xmm0,%xmm1
+  DB  185,254,210,221,65                  ; mov           $0x41ddd2fe,%ecx
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  102,15,110,248                      ; movd          %eax,%xmm7
+  DB  15,198,255,0                        ; shufps        $0x0,%xmm7,%xmm7
+  DB  15,40,239                           ; movaps        %xmm7,%xmm5
+  DB  15,92,235                           ; subps         %xmm3,%xmm5
+  DB  102,15,110,193                      ; movd          %ecx,%xmm0
+  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,94,221                           ; divps         %xmm5,%xmm3
+  DB  15,88,217                           ; addps         %xmm1,%xmm3
+  DB  15,91,202                           ; cvtdq2ps      %xmm2,%xmm1
+  DB  65,15,89,202                        ; mulps         %xmm10,%xmm1
+  DB  65,15,84,208                        ; andps         %xmm8,%xmm2
+  DB  15,86,214                           ; orps          %xmm6,%xmm2
+  DB  65,15,92,203                        ; subps         %xmm11,%xmm1
+  DB  15,40,234                           ; movaps        %xmm2,%xmm5
+  DB  65,15,89,236                        ; mulps         %xmm12,%xmm5
+  DB  15,92,205                           ; subps         %xmm5,%xmm1
+  DB  65,15,88,214                        ; addps         %xmm14,%xmm2
+  DB  65,15,40,237                        ; movaps        %xmm13,%xmm5
+  DB  15,94,234                           ; divps         %xmm2,%xmm5
+  DB  15,92,205                           ; subps         %xmm5,%xmm1
+  DB  65,15,89,207                        ; mulps         %xmm15,%xmm1
+  DB  15,40,236                           ; movaps        %xmm4,%xmm5
+  DB  15,88,233                           ; addps         %xmm1,%xmm5
+  DB  102,15,58,8,209,1                   ; roundps       $0x1,%xmm1,%xmm2
+  DB  15,92,202                           ; subps         %xmm2,%xmm1
+  DB  65,15,40,209                        ; movaps        %xmm9,%xmm2
+  DB  15,89,209                           ; mulps         %xmm1,%xmm2
+  DB  15,92,234                           ; subps         %xmm2,%xmm5
+  DB  15,40,247                           ; movaps        %xmm7,%xmm6
+  DB  15,92,241                           ; subps         %xmm1,%xmm6
+  DB  15,40,208                           ; movaps        %xmm0,%xmm2
+  DB  15,94,214                           ; divps         %xmm6,%xmm2
+  DB  15,88,213                           ; addps         %xmm5,%xmm2
+  DB  15,40,108,36,16                     ; movaps        0x10(%rsp),%xmm5
+  DB  15,91,205                           ; cvtdq2ps      %xmm5,%xmm1
+  DB  65,15,89,202                        ; mulps         %xmm10,%xmm1
+  DB  68,15,84,197                        ; andps         %xmm5,%xmm8
+  DB  68,15,86,4,36                       ; orps          (%rsp),%xmm8
+  DB  65,15,92,203                        ; subps         %xmm11,%xmm1
+  DB  69,15,89,224                        ; mulps         %xmm8,%xmm12
+  DB  65,15,92,204                        ; subps         %xmm12,%xmm1
+  DB  69,15,88,198                        ; addps         %xmm14,%xmm8
+  DB  69,15,94,232                        ; divps         %xmm8,%xmm13
+  DB  65,15,92,205                        ; subps         %xmm13,%xmm1
+  DB  65,15,89,207                        ; mulps         %xmm15,%xmm1
+  DB  102,15,58,8,233,1                   ; roundps       $0x1,%xmm1,%xmm5
+  DB  15,88,225                           ; addps         %xmm1,%xmm4
+  DB  15,92,205                           ; subps         %xmm5,%xmm1
+  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
+  DB  65,15,92,225                        ; subps         %xmm9,%xmm4
+  DB  15,92,249                           ; subps         %xmm1,%xmm7
+  DB  15,94,199                           ; divps         %xmm7,%xmm0
+  DB  15,88,196                           ; addps         %xmm4,%xmm0
+  DB  102,65,15,110,200                   ; movd          %r8d,%xmm1
+  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
+  DB  15,89,217                           ; mulps         %xmm1,%xmm3
+  DB  15,89,209                           ; mulps         %xmm1,%xmm2
+  DB  15,89,193                           ; mulps         %xmm1,%xmm0
+  DB  102,15,91,219                       ; cvtps2dq      %xmm3,%xmm3
+  DB  102,15,91,202                       ; cvtps2dq      %xmm2,%xmm1
+  DB  102,15,91,208                       ; cvtps2dq      %xmm0,%xmm2
+  DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  102,15,40,195                       ; movapd        %xmm3,%xmm0
+  DB  15,40,92,36,32                      ; movaps        0x20(%rsp),%xmm3
+  DB  15,40,100,36,48                     ; movaps        0x30(%rsp),%xmm4
+  DB  15,40,108,36,64                     ; movaps        0x40(%rsp),%xmm5
+  DB  15,40,116,36,80                     ; movaps        0x50(%rsp),%xmm6
+  DB  15,40,124,36,96                     ; movaps        0x60(%rsp),%xmm7
+  DB  72,131,196,120                      ; add           $0x78,%rsp
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_rgb_to_hsl_sse41
@@ -12819,9 +13342,9 @@ _sk_gather_i8_sse41 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  73,137,192                          ; mov           %rax,%r8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  116,5                               ; je            28a1 <_sk_gather_i8_sse41+0xf>
+  DB  116,5                               ; je            2baf <_sk_gather_i8_sse41+0xf>
   DB  76,137,192                          ; mov           %r8,%rax
-  DB  235,2                               ; jmp           28a3 <_sk_gather_i8_sse41+0x11>
+  DB  235,2                               ; jmp           2bb1 <_sk_gather_i8_sse41+0x11>
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  243,15,91,201                       ; cvttps2dq     %xmm1,%xmm1
@@ -13996,7 +14519,7 @@ _sk_linear_gradient_sse41 LABEL PROC
   DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
   DB  72,139,8                            ; mov           (%rax),%rcx
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,132,4,1,0,0                      ; je            3c92 <_sk_linear_gradient_sse41+0x13e>
+  DB  15,132,4,1,0,0                      ; je            3fa0 <_sk_linear_gradient_sse41+0x13e>
   DB  72,131,236,88                       ; sub           $0x58,%rsp
   DB  15,41,36,36                         ; movaps        %xmm4,(%rsp)
   DB  15,41,108,36,16                     ; movaps        %xmm5,0x10(%rsp)
@@ -14047,13 +14570,13 @@ _sk_linear_gradient_sse41 LABEL PROC
   DB  15,40,196                           ; movaps        %xmm4,%xmm0
   DB  72,131,192,36                       ; add           $0x24,%rax
   DB  72,255,201                          ; dec           %rcx
-  DB  15,133,65,255,255,255               ; jne           3bba <_sk_linear_gradient_sse41+0x66>
+  DB  15,133,65,255,255,255               ; jne           3ec8 <_sk_linear_gradient_sse41+0x66>
   DB  15,40,124,36,48                     ; movaps        0x30(%rsp),%xmm7
   DB  15,40,116,36,32                     ; movaps        0x20(%rsp),%xmm6
   DB  15,40,108,36,16                     ; movaps        0x10(%rsp),%xmm5
   DB  15,40,36,36                         ; movaps        (%rsp),%xmm4
   DB  72,131,196,88                       ; add           $0x58,%rsp
-  DB  235,13                              ; jmp           3c9f <_sk_linear_gradient_sse41+0x14b>
+  DB  235,13                              ; jmp           3fad <_sk_linear_gradient_sse41+0x14b>
   DB  15,87,201                           ; xorps         %xmm1,%xmm1
   DB  15,87,210                           ; xorps         %xmm2,%xmm2
   DB  15,87,219                           ; xorps         %xmm3,%xmm3
@@ -15744,89 +16267,320 @@ _sk_to_srgb_sse2 LABEL PROC
 
 PUBLIC _sk_from_2dot2_sse2
 _sk_from_2dot2_sse2 LABEL PROC
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  65,15,82,192                        ; rsqrtps       %xmm8,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  68,15,82,200                        ; rsqrtps       %xmm0,%xmm9
-  DB  65,15,82,193                        ; rsqrtps       %xmm9,%xmm0
-  DB  68,15,82,208                        ; rsqrtps       %xmm0,%xmm10
-  DB  69,15,89,192                        ; mulps         %xmm8,%xmm8
-  DB  65,15,40,193                        ; movaps        %xmm9,%xmm0
-  DB  15,89,192                           ; mulps         %xmm0,%xmm0
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,89,194                        ; mulps         %xmm10,%xmm0
-  DB  69,15,87,210                        ; xorps         %xmm10,%xmm10
-  DB  65,15,95,194                        ; maxps         %xmm10,%xmm0
-  DB  68,15,82,193                        ; rsqrtps       %xmm1,%xmm8
-  DB  69,15,82,192                        ; rsqrtps       %xmm8,%xmm8
-  DB  69,15,82,192                        ; rsqrtps       %xmm8,%xmm8
-  DB  69,15,82,200                        ; rsqrtps       %xmm8,%xmm9
-  DB  69,15,82,193                        ; rsqrtps       %xmm9,%xmm8
-  DB  69,15,82,216                        ; rsqrtps       %xmm8,%xmm11
-  DB  15,89,201                           ; mulps         %xmm1,%xmm1
-  DB  69,15,40,193                        ; movaps        %xmm9,%xmm8
-  DB  69,15,89,192                        ; mulps         %xmm8,%xmm8
-  DB  69,15,89,193                        ; mulps         %xmm9,%xmm8
-  DB  68,15,89,193                        ; mulps         %xmm1,%xmm8
-  DB  69,15,89,195                        ; mulps         %xmm11,%xmm8
-  DB  69,15,95,194                        ; maxps         %xmm10,%xmm8
-  DB  15,82,202                           ; rsqrtps       %xmm2,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  68,15,82,217                        ; rsqrtps       %xmm1,%xmm11
-  DB  65,15,82,203                        ; rsqrtps       %xmm11,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,89,210                           ; mulps         %xmm2,%xmm2
-  DB  69,15,40,203                        ; movaps        %xmm11,%xmm9
-  DB  69,15,89,201                        ; mulps         %xmm9,%xmm9
-  DB  69,15,89,203                        ; mulps         %xmm11,%xmm9
-  DB  68,15,89,202                        ; mulps         %xmm2,%xmm9
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  69,15,95,202                        ; maxps         %xmm10,%xmm9
+  DB  72,129,236,152,0,0,0                ; sub           $0x98,%rsp
+  DB  15,41,188,36,128,0,0,0              ; movaps        %xmm7,0x80(%rsp)
+  DB  15,41,116,36,112                    ; movaps        %xmm6,0x70(%rsp)
+  DB  15,41,108,36,96                     ; movaps        %xmm5,0x60(%rsp)
+  DB  15,41,100,36,80                     ; movaps        %xmm4,0x50(%rsp)
+  DB  15,41,92,36,64                      ; movaps        %xmm3,0x40(%rsp)
+  DB  15,41,84,36,48                      ; movaps        %xmm2,0x30(%rsp)
+  DB  15,40,208                           ; movaps        %xmm0,%xmm2
+  DB  184,205,204,12,64                   ; mov           $0x400ccccd,%eax
+  DB  15,91,194                           ; cvtdq2ps      %xmm2,%xmm0
+  DB  185,0,0,0,52                        ; mov           $0x34000000,%ecx
+  DB  102,15,110,217                      ; movd          %ecx,%xmm3
+  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
+  DB  15,89,195                           ; mulps         %xmm3,%xmm0
+  DB  68,15,40,219                        ; movaps        %xmm3,%xmm11
+  DB  68,15,41,92,36,16                   ; movaps        %xmm11,0x10(%rsp)
+  DB  185,255,255,127,0                   ; mov           $0x7fffff,%ecx
+  DB  102,15,110,217                      ; movd          %ecx,%xmm3
+  DB  102,68,15,112,195,0                 ; pshufd        $0x0,%xmm3,%xmm8
+  DB  65,15,84,208                        ; andps         %xmm8,%xmm2
+  DB  185,0,0,0,63                        ; mov           $0x3f000000,%ecx
+  DB  102,15,110,217                      ; movd          %ecx,%xmm3
+  DB  102,15,112,219,0                    ; pshufd        $0x0,%xmm3,%xmm3
+  DB  102,15,127,92,36,32                 ; movdqa        %xmm3,0x20(%rsp)
+  DB  15,86,211                           ; orps          %xmm3,%xmm2
+  DB  185,119,115,248,66                  ; mov           $0x42f87377,%ecx
+  DB  102,15,110,233                      ; movd          %ecx,%xmm5
+  DB  15,198,237,0                        ; shufps        $0x0,%xmm5,%xmm5
+  DB  15,92,197                           ; subps         %xmm5,%xmm0
+  DB  15,41,44,36                         ; movaps        %xmm5,(%rsp)
+  DB  185,117,191,191,63                  ; mov           $0x3fbfbf75,%ecx
+  DB  102,68,15,110,225                   ; movd          %ecx,%xmm12
+  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
+  DB  15,40,218                           ; movaps        %xmm2,%xmm3
+  DB  65,15,89,220                        ; mulps         %xmm12,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  185,163,233,220,63                  ; mov           $0x3fdce9a3,%ecx
+  DB  102,68,15,110,233                   ; movd          %ecx,%xmm13
+  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
+  DB  185,249,68,180,62                   ; mov           $0x3eb444f9,%ecx
+  DB  102,68,15,110,241                   ; movd          %ecx,%xmm14
+  DB  69,15,198,246,0                     ; shufps        $0x0,%xmm14,%xmm14
+  DB  65,15,88,214                        ; addps         %xmm14,%xmm2
+  DB  65,15,40,221                        ; movaps        %xmm13,%xmm3
+  DB  15,94,218                           ; divps         %xmm2,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  102,68,15,110,248                   ; movd          %eax,%xmm15
+  DB  69,15,198,255,0                     ; shufps        $0x0,%xmm15,%xmm15
+  DB  65,15,89,199                        ; mulps         %xmm15,%xmm0
+  DB  243,15,91,208                       ; cvttps2dq     %xmm0,%xmm2
+  DB  15,91,210                           ; cvtdq2ps      %xmm2,%xmm2
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,194,218,1                        ; cmpltps       %xmm2,%xmm3
+  DB  184,0,0,128,63                      ; mov           $0x3f800000,%eax
+  DB  102,68,15,110,208                   ; movd          %eax,%xmm10
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
+  DB  65,15,84,218                        ; andps         %xmm10,%xmm3
+  DB  15,92,211                           ; subps         %xmm3,%xmm2
+  DB  15,40,224                           ; movaps        %xmm0,%xmm4
+  DB  15,92,226                           ; subps         %xmm2,%xmm4
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  185,81,140,242,66                   ; mov           $0x42f28c51,%ecx
+  DB  102,68,15,110,201                   ; movd          %ecx,%xmm9
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
+  DB  185,141,188,190,63                  ; mov           $0x3fbebc8d,%ecx
+  DB  102,15,110,249                      ; movd          %ecx,%xmm7
+  DB  15,198,255,0                        ; shufps        $0x0,%xmm7,%xmm7
+  DB  15,40,215                           ; movaps        %xmm7,%xmm2
+  DB  15,89,212                           ; mulps         %xmm4,%xmm2
+  DB  15,92,194                           ; subps         %xmm2,%xmm0
+  DB  185,254,210,221,65                  ; mov           $0x41ddd2fe,%ecx
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  102,15,110,240                      ; movd          %eax,%xmm6
+  DB  15,198,246,0                        ; shufps        $0x0,%xmm6,%xmm6
+  DB  15,40,222                           ; movaps        %xmm6,%xmm3
+  DB  15,92,220                           ; subps         %xmm4,%xmm3
+  DB  102,15,110,209                      ; movd          %ecx,%xmm2
+  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
+  DB  15,40,226                           ; movaps        %xmm2,%xmm4
+  DB  15,94,227                           ; divps         %xmm3,%xmm4
+  DB  15,88,224                           ; addps         %xmm0,%xmm4
+  DB  15,91,193                           ; cvtdq2ps      %xmm1,%xmm0
+  DB  65,15,89,195                        ; mulps         %xmm11,%xmm0
+  DB  65,15,84,200                        ; andps         %xmm8,%xmm1
+  DB  68,15,40,92,36,32                   ; movaps        0x20(%rsp),%xmm11
+  DB  65,15,86,203                        ; orps          %xmm11,%xmm1
+  DB  15,92,197                           ; subps         %xmm5,%xmm0
+  DB  15,40,217                           ; movaps        %xmm1,%xmm3
+  DB  65,15,89,220                        ; mulps         %xmm12,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  65,15,88,206                        ; addps         %xmm14,%xmm1
+  DB  65,15,40,221                        ; movaps        %xmm13,%xmm3
+  DB  15,94,217                           ; divps         %xmm1,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  65,15,89,199                        ; mulps         %xmm15,%xmm0
+  DB  243,15,91,200                       ; cvttps2dq     %xmm0,%xmm1
+  DB  15,91,201                           ; cvtdq2ps      %xmm1,%xmm1
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,194,217,1                        ; cmpltps       %xmm1,%xmm3
+  DB  65,15,84,218                        ; andps         %xmm10,%xmm3
+  DB  15,92,203                           ; subps         %xmm3,%xmm1
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,92,217                           ; subps         %xmm1,%xmm3
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
+  DB  15,40,207                           ; movaps        %xmm7,%xmm1
+  DB  15,89,203                           ; mulps         %xmm3,%xmm1
+  DB  15,92,193                           ; subps         %xmm1,%xmm0
+  DB  15,40,238                           ; movaps        %xmm6,%xmm5
+  DB  15,92,235                           ; subps         %xmm3,%xmm5
+  DB  15,40,202                           ; movaps        %xmm2,%xmm1
+  DB  15,94,205                           ; divps         %xmm5,%xmm1
+  DB  15,88,200                           ; addps         %xmm0,%xmm1
+  DB  15,40,92,36,48                      ; movaps        0x30(%rsp),%xmm3
+  DB  15,91,195                           ; cvtdq2ps      %xmm3,%xmm0
+  DB  15,89,68,36,16                      ; mulps         0x10(%rsp),%xmm0
+  DB  68,15,84,195                        ; andps         %xmm3,%xmm8
+  DB  69,15,86,195                        ; orps          %xmm11,%xmm8
+  DB  15,92,4,36                          ; subps         (%rsp),%xmm0
+  DB  69,15,89,224                        ; mulps         %xmm8,%xmm12
+  DB  65,15,92,196                        ; subps         %xmm12,%xmm0
+  DB  69,15,88,198                        ; addps         %xmm14,%xmm8
+  DB  69,15,94,232                        ; divps         %xmm8,%xmm13
+  DB  65,15,92,197                        ; subps         %xmm13,%xmm0
+  DB  65,15,89,199                        ; mulps         %xmm15,%xmm0
+  DB  243,15,91,216                       ; cvttps2dq     %xmm0,%xmm3
+  DB  15,91,219                           ; cvtdq2ps      %xmm3,%xmm3
+  DB  15,40,232                           ; movaps        %xmm0,%xmm5
+  DB  15,194,235,1                        ; cmpltps       %xmm3,%xmm5
+  DB  65,15,84,234                        ; andps         %xmm10,%xmm5
+  DB  15,92,221                           ; subps         %xmm5,%xmm3
+  DB  15,40,232                           ; movaps        %xmm0,%xmm5
+  DB  15,92,235                           ; subps         %xmm3,%xmm5
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
+  DB  15,89,253                           ; mulps         %xmm5,%xmm7
+  DB  15,92,199                           ; subps         %xmm7,%xmm0
+  DB  15,92,245                           ; subps         %xmm5,%xmm6
+  DB  15,94,214                           ; divps         %xmm6,%xmm2
+  DB  15,88,208                           ; addps         %xmm0,%xmm2
+  DB  102,65,15,110,192                   ; movd          %r8d,%xmm0
+  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
+  DB  15,89,224                           ; mulps         %xmm0,%xmm4
+  DB  15,89,200                           ; mulps         %xmm0,%xmm1
+  DB  15,89,208                           ; mulps         %xmm0,%xmm2
+  DB  102,15,91,220                       ; cvtps2dq      %xmm4,%xmm3
+  DB  102,15,91,201                       ; cvtps2dq      %xmm1,%xmm1
+  DB  102,15,91,210                       ; cvtps2dq      %xmm2,%xmm2
   DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,200                        ; movaps        %xmm8,%xmm1
-  DB  65,15,40,209                        ; movaps        %xmm9,%xmm2
+  DB  102,15,40,195                       ; movapd        %xmm3,%xmm0
+  DB  15,40,92,36,64                      ; movaps        0x40(%rsp),%xmm3
+  DB  15,40,100,36,80                     ; movaps        0x50(%rsp),%xmm4
+  DB  15,40,108,36,96                     ; movaps        0x60(%rsp),%xmm5
+  DB  15,40,116,36,112                    ; movaps        0x70(%rsp),%xmm6
+  DB  15,40,188,36,128,0,0,0              ; movaps        0x80(%rsp),%xmm7
+  DB  72,129,196,152,0,0,0                ; add           $0x98,%rsp
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_to_2dot2_sse2
 _sk_to_2dot2_sse2 LABEL PROC
-  DB  68,15,82,192                        ; rsqrtps       %xmm0,%xmm8
-  DB  65,15,82,192                        ; rsqrtps       %xmm8,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  68,15,82,200                        ; rsqrtps       %xmm0,%xmm9
-  DB  69,15,83,192                        ; rcpps         %xmm8,%xmm8
-  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
-  DB  65,15,83,193                        ; rcpps         %xmm9,%xmm0
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  65,15,95,192                        ; maxps         %xmm8,%xmm0
-  DB  68,15,82,201                        ; rsqrtps       %xmm1,%xmm9
-  DB  65,15,82,201                        ; rsqrtps       %xmm9,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  15,82,201                           ; rsqrtps       %xmm1,%xmm1
-  DB  68,15,82,209                        ; rsqrtps       %xmm1,%xmm10
-  DB  69,15,83,201                        ; rcpps         %xmm9,%xmm9
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  65,15,83,202                        ; rcpps         %xmm10,%xmm1
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  65,15,95,200                        ; maxps         %xmm8,%xmm1
-  DB  68,15,82,202                        ; rsqrtps       %xmm2,%xmm9
-  DB  65,15,82,209                        ; rsqrtps       %xmm9,%xmm2
-  DB  15,82,210                           ; rsqrtps       %xmm2,%xmm2
-  DB  15,82,210                           ; rsqrtps       %xmm2,%xmm2
-  DB  15,82,210                           ; rsqrtps       %xmm2,%xmm2
-  DB  68,15,82,210                        ; rsqrtps       %xmm2,%xmm10
-  DB  69,15,83,201                        ; rcpps         %xmm9,%xmm9
-  DB  68,15,89,202                        ; mulps         %xmm2,%xmm9
-  DB  65,15,83,210                        ; rcpps         %xmm10,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  65,15,95,208                        ; maxps         %xmm8,%xmm2
+  DB  72,129,236,152,0,0,0                ; sub           $0x98,%rsp
+  DB  15,41,188,36,128,0,0,0              ; movaps        %xmm7,0x80(%rsp)
+  DB  15,41,116,36,112                    ; movaps        %xmm6,0x70(%rsp)
+  DB  15,41,108,36,96                     ; movaps        %xmm5,0x60(%rsp)
+  DB  15,41,100,36,80                     ; movaps        %xmm4,0x50(%rsp)
+  DB  15,41,92,36,64                      ; movaps        %xmm3,0x40(%rsp)
+  DB  15,41,84,36,48                      ; movaps        %xmm2,0x30(%rsp)
+  DB  15,40,208                           ; movaps        %xmm0,%xmm2
+  DB  184,46,186,232,62                   ; mov           $0x3ee8ba2e,%eax
+  DB  15,91,194                           ; cvtdq2ps      %xmm2,%xmm0
+  DB  185,0,0,0,52                        ; mov           $0x34000000,%ecx
+  DB  102,15,110,217                      ; movd          %ecx,%xmm3
+  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
+  DB  15,89,195                           ; mulps         %xmm3,%xmm0
+  DB  68,15,40,219                        ; movaps        %xmm3,%xmm11
+  DB  68,15,41,92,36,16                   ; movaps        %xmm11,0x10(%rsp)
+  DB  185,255,255,127,0                   ; mov           $0x7fffff,%ecx
+  DB  102,15,110,217                      ; movd          %ecx,%xmm3
+  DB  102,68,15,112,195,0                 ; pshufd        $0x0,%xmm3,%xmm8
+  DB  65,15,84,208                        ; andps         %xmm8,%xmm2
+  DB  185,0,0,0,63                        ; mov           $0x3f000000,%ecx
+  DB  102,15,110,217                      ; movd          %ecx,%xmm3
+  DB  102,15,112,219,0                    ; pshufd        $0x0,%xmm3,%xmm3
+  DB  102,15,127,92,36,32                 ; movdqa        %xmm3,0x20(%rsp)
+  DB  15,86,211                           ; orps          %xmm3,%xmm2
+  DB  185,119,115,248,66                  ; mov           $0x42f87377,%ecx
+  DB  102,15,110,233                      ; movd          %ecx,%xmm5
+  DB  15,198,237,0                        ; shufps        $0x0,%xmm5,%xmm5
+  DB  15,92,197                           ; subps         %xmm5,%xmm0
+  DB  15,41,44,36                         ; movaps        %xmm5,(%rsp)
+  DB  185,117,191,191,63                  ; mov           $0x3fbfbf75,%ecx
+  DB  102,68,15,110,225                   ; movd          %ecx,%xmm12
+  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
+  DB  15,40,218                           ; movaps        %xmm2,%xmm3
+  DB  65,15,89,220                        ; mulps         %xmm12,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  185,163,233,220,63                  ; mov           $0x3fdce9a3,%ecx
+  DB  102,68,15,110,233                   ; movd          %ecx,%xmm13
+  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
+  DB  185,249,68,180,62                   ; mov           $0x3eb444f9,%ecx
+  DB  102,68,15,110,241                   ; movd          %ecx,%xmm14
+  DB  69,15,198,246,0                     ; shufps        $0x0,%xmm14,%xmm14
+  DB  65,15,88,214                        ; addps         %xmm14,%xmm2
+  DB  65,15,40,221                        ; movaps        %xmm13,%xmm3
+  DB  15,94,218                           ; divps         %xmm2,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  102,68,15,110,248                   ; movd          %eax,%xmm15
+  DB  69,15,198,255,0                     ; shufps        $0x0,%xmm15,%xmm15
+  DB  65,15,89,199                        ; mulps         %xmm15,%xmm0
+  DB  243,15,91,208                       ; cvttps2dq     %xmm0,%xmm2
+  DB  15,91,210                           ; cvtdq2ps      %xmm2,%xmm2
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,194,218,1                        ; cmpltps       %xmm2,%xmm3
+  DB  184,0,0,128,63                      ; mov           $0x3f800000,%eax
+  DB  102,68,15,110,208                   ; movd          %eax,%xmm10
+  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
+  DB  65,15,84,218                        ; andps         %xmm10,%xmm3
+  DB  15,92,211                           ; subps         %xmm3,%xmm2
+  DB  15,40,224                           ; movaps        %xmm0,%xmm4
+  DB  15,92,226                           ; subps         %xmm2,%xmm4
+  DB  65,184,0,0,0,75                     ; mov           $0x4b000000,%r8d
+  DB  185,81,140,242,66                   ; mov           $0x42f28c51,%ecx
+  DB  102,68,15,110,201                   ; movd          %ecx,%xmm9
+  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
+  DB  185,141,188,190,63                  ; mov           $0x3fbebc8d,%ecx
+  DB  102,15,110,249                      ; movd          %ecx,%xmm7
+  DB  15,198,255,0                        ; shufps        $0x0,%xmm7,%xmm7
+  DB  15,40,215                           ; movaps        %xmm7,%xmm2
+  DB  15,89,212                           ; mulps         %xmm4,%xmm2
+  DB  15,92,194                           ; subps         %xmm2,%xmm0
+  DB  185,254,210,221,65                  ; mov           $0x41ddd2fe,%ecx
+  DB  184,248,245,154,64                  ; mov           $0x409af5f8,%eax
+  DB  102,15,110,240                      ; movd          %eax,%xmm6
+  DB  15,198,246,0                        ; shufps        $0x0,%xmm6,%xmm6
+  DB  15,40,222                           ; movaps        %xmm6,%xmm3
+  DB  15,92,220                           ; subps         %xmm4,%xmm3
+  DB  102,15,110,209                      ; movd          %ecx,%xmm2
+  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
+  DB  15,40,226                           ; movaps        %xmm2,%xmm4
+  DB  15,94,227                           ; divps         %xmm3,%xmm4
+  DB  15,88,224                           ; addps         %xmm0,%xmm4
+  DB  15,91,193                           ; cvtdq2ps      %xmm1,%xmm0
+  DB  65,15,89,195                        ; mulps         %xmm11,%xmm0
+  DB  65,15,84,200                        ; andps         %xmm8,%xmm1
+  DB  68,15,40,92,36,32                   ; movaps        0x20(%rsp),%xmm11
+  DB  65,15,86,203                        ; orps          %xmm11,%xmm1
+  DB  15,92,197                           ; subps         %xmm5,%xmm0
+  DB  15,40,217                           ; movaps        %xmm1,%xmm3
+  DB  65,15,89,220                        ; mulps         %xmm12,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  65,15,88,206                        ; addps         %xmm14,%xmm1
+  DB  65,15,40,221                        ; movaps        %xmm13,%xmm3
+  DB  15,94,217                           ; divps         %xmm1,%xmm3
+  DB  15,92,195                           ; subps         %xmm3,%xmm0
+  DB  65,15,89,199                        ; mulps         %xmm15,%xmm0
+  DB  243,15,91,200                       ; cvttps2dq     %xmm0,%xmm1
+  DB  15,91,201                           ; cvtdq2ps      %xmm1,%xmm1
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,194,217,1                        ; cmpltps       %xmm1,%xmm3
+  DB  65,15,84,218                        ; andps         %xmm10,%xmm3
+  DB  15,92,203                           ; subps         %xmm3,%xmm1
+  DB  15,40,216                           ; movaps        %xmm0,%xmm3
+  DB  15,92,217                           ; subps         %xmm1,%xmm3
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
+  DB  15,40,207                           ; movaps        %xmm7,%xmm1
+  DB  15,89,203                           ; mulps         %xmm3,%xmm1
+  DB  15,92,193                           ; subps         %xmm1,%xmm0
+  DB  15,40,238                           ; movaps        %xmm6,%xmm5
+  DB  15,92,235                           ; subps         %xmm3,%xmm5
+  DB  15,40,202                           ; movaps        %xmm2,%xmm1
+  DB  15,94,205                           ; divps         %xmm5,%xmm1
+  DB  15,88,200                           ; addps         %xmm0,%xmm1
+  DB  15,40,92,36,48                      ; movaps        0x30(%rsp),%xmm3
+  DB  15,91,195                           ; cvtdq2ps      %xmm3,%xmm0
+  DB  15,89,68,36,16                      ; mulps         0x10(%rsp),%xmm0
+  DB  68,15,84,195                        ; andps         %xmm3,%xmm8
+  DB  69,15,86,195                        ; orps          %xmm11,%xmm8
+  DB  15,92,4,36                          ; subps         (%rsp),%xmm0
+  DB  69,15,89,224                        ; mulps         %xmm8,%xmm12
+  DB  65,15,92,196                        ; subps         %xmm12,%xmm0
+  DB  69,15,88,198                        ; addps         %xmm14,%xmm8
+  DB  69,15,94,232                        ; divps         %xmm8,%xmm13
+  DB  65,15,92,197                        ; subps         %xmm13,%xmm0
+  DB  65,15,89,199                        ; mulps         %xmm15,%xmm0
+  DB  243,15,91,216                       ; cvttps2dq     %xmm0,%xmm3
+  DB  15,91,219                           ; cvtdq2ps      %xmm3,%xmm3
+  DB  15,40,232                           ; movaps        %xmm0,%xmm5
+  DB  15,194,235,1                        ; cmpltps       %xmm3,%xmm5
+  DB  65,15,84,234                        ; andps         %xmm10,%xmm5
+  DB  15,92,221                           ; subps         %xmm5,%xmm3
+  DB  15,40,232                           ; movaps        %xmm0,%xmm5
+  DB  15,92,235                           ; subps         %xmm3,%xmm5
+  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
+  DB  15,89,253                           ; mulps         %xmm5,%xmm7
+  DB  15,92,199                           ; subps         %xmm7,%xmm0
+  DB  15,92,245                           ; subps         %xmm5,%xmm6
+  DB  15,94,214                           ; divps         %xmm6,%xmm2
+  DB  15,88,208                           ; addps         %xmm0,%xmm2
+  DB  102,65,15,110,192                   ; movd          %r8d,%xmm0
+  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
+  DB  15,89,224                           ; mulps         %xmm0,%xmm4
+  DB  15,89,200                           ; mulps         %xmm0,%xmm1
+  DB  15,89,208                           ; mulps         %xmm0,%xmm2
+  DB  102,15,91,220                       ; cvtps2dq      %xmm4,%xmm3
+  DB  102,15,91,201                       ; cvtps2dq      %xmm1,%xmm1
+  DB  102,15,91,210                       ; cvtps2dq      %xmm2,%xmm2
   DB  72,173                              ; lods          %ds:(%rsi),%rax
+  DB  102,15,40,195                       ; movapd        %xmm3,%xmm0
+  DB  15,40,92,36,64                      ; movaps        0x40(%rsp),%xmm3
+  DB  15,40,100,36,80                     ; movaps        0x50(%rsp),%xmm4
+  DB  15,40,108,36,96                     ; movaps        0x60(%rsp),%xmm5
+  DB  15,40,116,36,112                    ; movaps        0x70(%rsp),%xmm6
+  DB  15,40,188,36,128,0,0,0              ; movaps        0x80(%rsp),%xmm7
+  DB  72,129,196,152,0,0,0                ; add           $0x98,%rsp
   DB  255,224                             ; jmpq          *%rax
 
 PUBLIC _sk_rgb_to_hsl_sse2
@@ -17388,9 +18142,9 @@ _sk_gather_i8_sse2 LABEL PROC
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  73,137,192                          ; mov           %rax,%r8
   DB  77,133,192                          ; test          %r8,%r8
-  DB  116,5                               ; je            2aa7 <_sk_gather_i8_sse2+0xf>
+  DB  116,5                               ; je            2e63 <_sk_gather_i8_sse2+0xf>
   DB  76,137,192                          ; mov           %r8,%rax
-  DB  235,2                               ; jmp           2aa9 <_sk_gather_i8_sse2+0x11>
+  DB  235,2                               ; jmp           2e65 <_sk_gather_i8_sse2+0x11>
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  76,139,16                           ; mov           (%rax),%r10
   DB  243,15,91,201                       ; cvttps2dq     %xmm1,%xmm1
@@ -18672,7 +19426,7 @@ _sk_linear_gradient_sse2 LABEL PROC
   DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
   DB  72,139,8                            ; mov           (%rax),%rcx
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,132,15,1,0,0                     ; je            4060 <_sk_linear_gradient_sse2+0x149>
+  DB  15,132,15,1,0,0                     ; je            441c <_sk_linear_gradient_sse2+0x149>
   DB  72,139,64,8                         ; mov           0x8(%rax),%rax
   DB  72,131,192,32                       ; add           $0x20,%rax
   DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
@@ -18733,8 +19487,8 @@ _sk_linear_gradient_sse2 LABEL PROC
   DB  69,15,86,231                        ; orps          %xmm15,%xmm12
   DB  72,131,192,36                       ; add           $0x24,%rax
   DB  72,255,201                          ; dec           %rcx
-  DB  15,133,8,255,255,255                ; jne           3f66 <_sk_linear_gradient_sse2+0x4f>
-  DB  235,13                              ; jmp           406d <_sk_linear_gradient_sse2+0x156>
+  DB  15,133,8,255,255,255                ; jne           4322 <_sk_linear_gradient_sse2+0x4f>
+  DB  235,13                              ; jmp           4429 <_sk_linear_gradient_sse2+0x156>
   DB  15,87,201                           ; xorps         %xmm1,%xmm1
   DB  15,87,210                           ; xorps         %xmm2,%xmm2
   DB  15,87,219                           ; xorps         %xmm3,%xmm3
index ea0e56f..26e2667 100644 (file)
@@ -480,32 +480,14 @@ STAGE(to_srgb) {
 }
 
 STAGE(from_2dot2) {
-    auto fn = [](F x) {
-        // x^(141/64) = x^(2.20312) is a great approximation of the true value, x^(2.2).
-        // (note: x^(35/16) = x^(2.1875) is an okay one as well and would be quicker)
-        F x16 = rsqrt(rsqrt(rsqrt(rsqrt(x)))),    // x^(1/16) = x^(4/64);
-          x64 = rsqrt(rsqrt(x16));                // x^(1/64)
-
-        // 141/64 = 128/64 + 12/64 + 1/64
-        return max((x*x) * (x16*x16*x16) * x64, 0);
-    };
-    r = fn(r);
-    g = fn(g);
-    b = fn(b);
+    r = approx_powf(r, C(2.2f));
+    g = approx_powf(g, C(2.2f));
+    b = approx_powf(b, C(2.2f));
 }
 STAGE(to_2dot2) {
-    auto fn = [](F x) {
-        // x^(29/64) is a very good approximation of the true value, x^(1/2.2).
-        F x2  = rsqrt(x),                         // x^(-1/2)
-          x32 = rsqrt(rsqrt(rsqrt(rsqrt(x2)))),   // x^(-1/32)
-          x64 = rsqrt(x32);                       // x^(+1/64)
-
-        // 29/64 = 32/64 - 2/64 - 1/64
-        return max(rcp(x2) * x32 * rcp(x64), 0);
-    };
-    r = fn(r);
-    g = fn(g);
-    b = fn(b);
+    r = approx_powf(r, C(1/2.2f));
+    g = approx_powf(g, C(1/2.2f));
+    b = approx_powf(b, C(1/2.2f));
 }
 
 STAGE(rgb_to_hsl) {
index 5810148..0185abc 100644 (file)
@@ -73,3 +73,36 @@ DEF_TEST(Parametric_inv_1dot8, r) { check_error(r, 1/510.0f, 1/1.8f); }
 DEF_TEST(Parametric_inv_2dot0, r) { check_error(r, 1/510.0f, 1/2.0f); }
 DEF_TEST(Parametric_inv_2dot2, r) { check_error(r, 1/510.0f, 1/2.2f); }
 DEF_TEST(Parametric_inv_2dot4, r) { check_error(r, 1/510.0f, 1/2.4f); }
+
+// As above, but runs a stock pipeline stage and checks it implements gamma within limit (max abs error).
+static void check_error(skiatest::Reporter* r, float limit,
+                        float gamma, SkRasterPipeline::StockStage stage) {
+
+    // We expect the gamma will only be applied to R,G,B, leaving A alone.
+    // So this isn't quite exhaustive, but it's pretty good.
+    float in[256], out[256];
+    for (int i = 0; i < 256; i++) {
+        in [i] = i / 255.0f;
+        out[i] = 0.0f;  // Not likely important.  Just being tidy.
+    }
+
+    const float* ip = in;
+    float*       op = out;
+
+    SkRasterPipeline p;
+    p.append(SkRasterPipeline::load_f32, &ip);
+    p.append(stage);
+    p.append(SkRasterPipeline::store_f32, &op);
+    p.run(0, 256/4);  // 256 floats at 4 floats per pixel (every 4th is treated as alpha below).
+
+    for (int i = 0; i < 256; i++) {
+        float want = powf(i/255.0f, (i%4) == 3 ? 1.0f
+                                               : gamma);
+        float err = fabsf(out[i] - want);
+        if (!(err <= limit)) {  // Negated on purpose: a NaN err (e.g. NaN output) must fail too.
+            ERRORF(r, "At %d, error was %g (got %g, want %g)", i, err, out[i], want);
+        }
+    }
+}
+DEF_TEST(from_2dot2, r) { check_error(r, 1/510.f, 2.2f,  SkRasterPipeline::from_2dot2); }
+DEF_TEST(  to_2dot2, r) { check_error(r, 1/510.f, 1/2.2f,SkRasterPipeline::  to_2dot2); }