SkJumper
authorMike Klein <mtklein@chromium.org>
Wed, 15 Feb 2017 18:31:12 +0000 (13:31 -0500)
committerSkia Commit-Bot <skia-commit-bot@chromium.org>
Thu, 16 Feb 2017 00:27:57 +0000 (00:27 +0000)
Change-Id: If9f73e712e429564fef58ccb838c212ec8d2e68c
Reviewed-on: https://skia-review.googlesource.com/8525
Commit-Queue: Mike Klein <mtklein@chromium.org>
Reviewed-by: Herb Derby <herb@google.com>
BUILD.gn
public.bzl
src/core/SkRasterPipeline.cpp
src/core/SkRasterPipeline.h
src/jumper/SkJumper.cpp [new file with mode: 0644]
src/jumper/SkJumper.h [new file with mode: 0644]
src/jumper/SkJumper_generated.h [new file with mode: 0644]
src/jumper/SkJumper_stages.cpp [new file with mode: 0644]
src/jumper/build_stages.py [new file with mode: 0755]

index 7a91297f38e7ce54cf26a0170de599ef54f71bd7..e47835055f798de21b498712227d546168ae4664 100644 (file)
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -29,6 +29,7 @@ declare_args() {
   skia_enable_android_framework_defines = false
   skia_enable_discrete_gpu = true
   skia_enable_effects = true
+  skia_enable_jumper = false
   skia_enable_gpu = true
   skia_enable_pdf = true
   skia_enable_tools = is_skia_standalone
@@ -493,6 +494,15 @@ optional("raw") {
   ]
 }
 
+optional("jumper") {
+  enabled = skia_enable_jumper
+  public_defines = [ "SK_JUMPER" ]
+  sources = [
+    "src/jumper/SkJumper.cpp",
+    "src/jumper/SkJumper_stages.cpp",
+  ]
+}
+
 optional("typeface_freetype") {
   enabled = skia_use_freetype
 
@@ -552,6 +562,7 @@ component("skia") {
     ":gpu",
     ":hsw",
     ":jpeg",
+    ":jumper",
     ":none",
     ":pdf",
     ":png",
index 9b0c1b520d64eb8ac97ef376a32a39c0d34e51d3..6d494f6b5521679f67fac91c7559568a5cab68a9 100644 (file)
@@ -106,6 +106,7 @@ BASE_SRCS_ALL = struct(
         "src/gpu/gl/mac/*",
         "src/gpu/gl/win/*",
         "src/images/*",
+        "src/jumper/*",
         "src/opts/**/*",
         "src/ports/**/*",
         "src/utils/android/**/*",
index 350c2f50b46215d991790210502095e1f53ef6ba..81b98677b2cf26d107e94768f72c17327a3d26fa 100644 (file)
@@ -22,6 +22,11 @@ void SkRasterPipeline::extend(const SkRasterPipeline& src) {
 
 void SkRasterPipeline::run(size_t x, size_t n) const {
     if (!fStages.empty()) {
+    #if defined(SK_JUMPER)
+        if (this->run_with_jumper(x, n)) {
+            return;
+        }
+    #endif
         SkOpts::run_pipeline(x,n, fStages.data(), SkToInt(fStages.size()));
     }
 }
index 9ab6667e5a5dc612b89d5826e672ac961664efc8..c5661a2e1dc41c2b326beb40dc60ffb21b7cd05f 100644 (file)
@@ -133,6 +133,8 @@ public:
     void append_from_srgb(SkAlphaType);
 
 private:
+    bool run_with_jumper(size_t x, size_t n) const;
+
     std::vector<Stage> fStages;
 };
 
diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp
new file mode 100644 (file)
index 0000000..a58a7d7
--- /dev/null
@@ -0,0 +1,205 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkCpu.h"
+#include "SkJumper.h"
+#include "SkJumper_generated.h"
+#include "SkRasterPipeline.h"
+#include "SkTemplates.h"
+
+// Stages expect these constants to be set to these values.
+// It's fine to rearrange and add new ones if you update SkJumper_constants.
+static const SkJumper_constants kConstants = {
+    1.0f, 0.5f, 255.0f, 1/255.0f, 0x000000ff,
+    {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f},
+    0.0025f, 0.6975f, 0.3000f, 1/12.92f, 0.055f,       // from_srgb
+    12.46f, 0.411192f, 0.689206f, -0.0988f, 0.0043f,   //   to_srgb
+    0x77800000, 0x07800000,                            // fp16 <-> fp32
+};
+
+using JumperStage = void(size_t, void**, const SkJumper_constants*);
+// Jumper stages actually pass around 8 floating point vectors too.
+// They're designed to work when those vectors start unintialized,
+// so we don't need to mention them here.
+
+#define STAGES(M)     \
+    M(seed_shader)    \
+    M(constant_color) \
+    M(clear)          \
+    M(plus_)          \
+    M(srcover)        \
+    M(dstover)        \
+    M(clamp_0)        \
+    M(clamp_1)        \
+    M(clamp_a)        \
+    M(swap)           \
+    M(move_src_dst)   \
+    M(move_dst_src)   \
+    M(premul)         \
+    M(unpremul)       \
+    M(from_srgb)      \
+    M(to_srgb)        \
+    M(scale_u8)       \
+    M(load_tables)    \
+    M(load_8888)      \
+    M(store_8888)     \
+    M(load_f16)       \
+    M(store_f16)      \
+    M(matrix_2x3)     \
+    M(matrix_3x4)     \
+    M(clamp_x)        \
+    M(clamp_y)        \
+    M(linear_gradient_2stops)
+
+// Declare the portable, single pixel stages that are linked into Skia from SkJumper_stages.o.
+extern "C" {
+    JumperStage sk_just_return;
+#define M(st) JumperStage sk_##st;
+    STAGES(M)
+#undef M
+}
+
+// Translate SkRasterPipeline's enum to pointers to our portable, single pixel stages.
+static void* portable_lookup(SkRasterPipeline::StockStage st) {
+    switch (st) {
+        default: return nullptr;
+    #define M(st) case SkRasterPipeline::st: return (void*)sk_##st;
+        STAGES(M)
+    #undef M
+    }
+}
+
+// The non-portable options are pre-compiled static data arrays pulled in from SkJumper_generated.h.
+#if defined(__aarch64__)
+    static void* aarch64_lookup(SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return nullptr;
+        #define M(st) case SkRasterPipeline::st: return (void*)aarch64_sk_##st;
+            STAGES(M)
+        #undef M
+        }
+    }
+#elif defined(__ARM_NEON__)
+    static void* armv7_lookup(SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return nullptr;
+        #define M(st) case SkRasterPipeline::st: return (void*)armv7_sk_##st;
+            STAGES(M)
+        #undef M
+        }
+    }
+#elif defined(__x86_64__) || defined(_M_X64)
+    static void* sse2_lookup(SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return nullptr;
+        #define M(st) case SkRasterPipeline::st: return (void*)sse2_sk_##st;
+            STAGES(M)
+        #undef M
+        }
+    }
+    static void* sse41_lookup(SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return nullptr;
+        #define M(st) case SkRasterPipeline::st: return (void*)sse41_sk_##st;
+            STAGES(M)
+        #undef M
+        }
+    }
+    static void* hsw_lookup(SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default: return nullptr;
+        #define M(st) case SkRasterPipeline::st: return (void*)hsw_sk_##st;
+            STAGES(M)
+        #undef M
+        }
+    }
+#endif
+
+bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
+    // We'll look for the best vector instruction set and stride we can use.
+    size_t stride                                 = 0;
+    void* (*lookup)(SkRasterPipeline::StockStage) = nullptr;
+    void* just_return                             = nullptr;
+
+#if defined(__aarch64__)
+    stride      = 4;
+    lookup      = aarch64_lookup;
+    just_return = (void*)aarch64_sk_just_return;
+
+#elif defined(__ARM_NEON__)
+    if (SkCpu::Supports(SkCpu::NEON|SkCpu::NEON_FMA|SkCpu::VFP_FP16)) {
+        stride      = 2;
+        lookup      = armv7_lookup;
+        just_return = (void*)armv7_sk_just_return;
+    }
+
+#elif defined(__x86_64__) || defined(_M_X64)
+    stride      = 4;
+    lookup      = sse2_lookup;
+    just_return = (void*)sse2_sk_just_return;
+    if (SkCpu::Supports(SkCpu::SSE41)) {
+        stride      = 4;
+        lookup      = sse41_lookup;
+        just_return = (void*)sse41_sk_just_return;
+    }
+    if (SkCpu::Supports(SkCpu::HSW)) {
+        stride      = 8;
+        lookup      = hsw_lookup;
+        just_return = (void*)hsw_sk_just_return;
+    }
+#endif
+
+    SkAutoSTMalloc<64, void*> program(2*fStages.size() + 1);
+
+    // If possible, build and run a program to run at full vector stride.
+    const size_t limit = x+n;
+
+    if (stride) {
+        void** ip = program.get();
+        for (auto&& st : fStages) {
+            auto fn = lookup(st.stage);
+            if (!fn) {
+                return false;
+            }
+            *ip++ = fn;
+            *ip++ = st.ctx;
+        }
+        *ip = (void*)just_return;
+
+        ip = program.get();
+        auto start = (JumperStage*)*ip++;
+        while (x + stride <= limit) {
+            start(x, ip, &kConstants);
+            x += stride;
+        }
+    }
+
+    // If there's any leftover, build and run stride=1 portable code.
+    if (x < limit) {
+        stride = 1;
+
+        void** ip = program.get();
+        for (auto&& st : fStages) {
+            auto fn = portable_lookup(st.stage);
+            if (!fn) {
+                return false;
+            }
+            *ip++ = fn;
+            *ip++ = st.ctx;
+        }
+        *ip = (void*)sk_just_return;
+
+        ip = program.get();
+        auto start = (JumperStage*)*ip++;
+        while (x + stride <= limit) {
+            start(x, ip, &kConstants);
+            x += stride;
+        }
+    }
+
+    return true;
+}
diff --git a/src/jumper/SkJumper.h b/src/jumper/SkJumper.h
new file mode 100644 (file)
index 0000000..f6088dd
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkJumper_DEFINED
+#define SkJumper_DEFINED
+
+// This file contains definitions shared by SkJumper.cpp (compiled normally as part of Skia)
+// and SkJumper_stages.cpp (compiled into Skia _and_ offline into SkJumper_generated.h).
+// Keep it simple!
+
+#include <stdint.h>
+
+// SkJumper Stages can use constant literals only if they end up baked into the instruction,
+// like bit shifts and rounding modes.  Any other constant values must be pulled from this struct
+// (except 0, ~0, and 0.0f, which always end up as some sort of xor or cmpeq instruction).
+//
+// This constraint makes it much easier to move and reorder the code for each Stage.
+
+struct SkJumper_constants {
+    float    _1;           //  1.0f
+    float    _0_5;         //  0.5f
+    float    _255;         //  255.0f
+    float    _1_255;       //  1/255.0f
+    uint32_t _0x000000ff;  //  0x000000ff
+
+    float    iota[8];      //  0,1,2,3,4,5,6,7
+
+    // from_srgb
+    float    _00025;       //  0.0025f
+    float    _06975;       //  0.6975f
+    float    _03000;       //  0.3000f
+    float    _1_1292;      //  1/12.92f
+    float    _0055;        //  0.055f
+
+    // to_srgb
+    float    _1246;        //  12.46f
+    float    _0411192;     //  0.411192f
+    float    _0689206;     //  0.689206f
+    float   n_00988;       // -0.0988f
+    float    _00043;       //  0.0043f
+
+    // fp16 <-> fp32
+    uint32_t _0x77800000;
+    uint32_t _0x07800000;
+};
+
+#endif//SkJumper_DEFINED
diff --git a/src/jumper/SkJumper_generated.h b/src/jumper/SkJumper_generated.h
new file mode 100644 (file)
index 0000000..9641b5f
--- /dev/null
@@ -0,0 +1,2776 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkJumper_generated_DEFINED
+#define SkJumper_generated_DEFINED
+
+// This file is generated semi-automatically with this command:
+//   $ src/jumper/build_stages.py
+
+static const unsigned int aarch64_sk_just_return[] = {
+    0xd65f03c0,                                 //  ret
+};
+static const unsigned int aarch64_sk_seed_shader[] = {
+    0xaa0203e9,                                 //  mov           x9, x2
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0x4ddfc922,                                 //  ld1r          {v2.4s}, [x9], #4
+    0x3cc14041,                                 //  ldur          q1, [x2,#20]
+    0x4e040c00,                                 //  dup           v0.4s, w0
+    0x4d40c903,                                 //  ld1r          {v3.4s}, [x8]
+    0x4d40c924,                                 //  ld1r          {v4.4s}, [x9]
+    0x4e21d800,                                 //  scvtf         v0.4s, v0.4s
+    0x6f00e405,                                 //  movi          v5.2d, #0x0
+    0x4e21d863,                                 //  scvtf         v3.4s, v3.4s
+    0x4e24d400,                                 //  fadd          v0.4s, v0.4s, v4.4s
+    0x4e20d420,                                 //  fadd          v0.4s, v1.4s, v0.4s
+    0x4e24d461,                                 //  fadd          v1.4s, v3.4s, v4.4s
+    0x6f00e403,                                 //  movi          v3.2d, #0x0
+    0x6f00e404,                                 //  movi          v4.2d, #0x0
+    0x6f00e406,                                 //  movi          v6.2d, #0x0
+    0x6f00e407,                                 //  movi          v7.2d, #0x0
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_constant_color[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0x3dc00103,                                 //  ldr           q3, [x8]
+    0x4e040460,                                 //  dup           v0.4s, v3.s[0]
+    0x4e0c0461,                                 //  dup           v1.4s, v3.s[1]
+    0x4e140462,                                 //  dup           v2.4s, v3.s[2]
+    0x4e1c0463,                                 //  dup           v3.4s, v3.s[3]
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_clear[] = {
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004028,                                 //  add           x8, x1, #0x10
+    0x6f00e400,                                 //  movi          v0.2d, #0x0
+    0x6f00e401,                                 //  movi          v1.2d, #0x0
+    0x6f00e402,                                 //  movi          v2.2d, #0x0
+    0x6f00e403,                                 //  movi          v3.2d, #0x0
+    0xaa0803e1,                                 //  mov           x1, x8
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_plus_[] = {
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x4e24d400,                                 //  fadd          v0.4s, v0.4s, v4.4s
+    0x4e25d421,                                 //  fadd          v1.4s, v1.4s, v5.4s
+    0x4e26d442,                                 //  fadd          v2.4s, v2.4s, v6.4s
+    0x4e27d463,                                 //  fadd          v3.4s, v3.4s, v7.4s
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_srcover[] = {
+    0x4d40c850,                                 //  ld1r          {v16.4s}, [x2]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x4ea3d610,                                 //  fsub          v16.4s, v16.4s, v3.4s
+    0x4e24ce00,                                 //  fmla          v0.4s, v16.4s, v4.4s
+    0x4e25ce01,                                 //  fmla          v1.4s, v16.4s, v5.4s
+    0x4e26ce02,                                 //  fmla          v2.4s, v16.4s, v6.4s
+    0x4e27ce03,                                 //  fmla          v3.4s, v16.4s, v7.4s
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_dstover[] = {
+    0x4d40c851,                                 //  ld1r          {v17.4s}, [x2]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x4ea41c90,                                 //  mov           v16.16b, v4.16b
+    0x4ea61cd2,                                 //  mov           v18.16b, v6.16b
+    0x4ea7d634,                                 //  fsub          v20.4s, v17.4s, v7.4s
+    0x4ea51cb1,                                 //  mov           v17.16b, v5.16b
+    0x4ea71cf3,                                 //  mov           v19.16b, v7.16b
+    0x4e20ce90,                                 //  fmla          v16.4s, v20.4s, v0.4s
+    0x4e21ce91,                                 //  fmla          v17.4s, v20.4s, v1.4s
+    0x4e22ce92,                                 //  fmla          v18.4s, v20.4s, v2.4s
+    0x4e23ce93,                                 //  fmla          v19.4s, v20.4s, v3.4s
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x4eb01e00,                                 //  mov           v0.16b, v16.16b
+    0x4eb11e21,                                 //  mov           v1.16b, v17.16b
+    0x4eb21e42,                                 //  mov           v2.16b, v18.16b
+    0x4eb31e63,                                 //  mov           v3.16b, v19.16b
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_clamp_0[] = {
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x6f00e410,                                 //  movi          v16.2d, #0x0
+    0x4e30f400,                                 //  fmax          v0.4s, v0.4s, v16.4s
+    0x4e30f421,                                 //  fmax          v1.4s, v1.4s, v16.4s
+    0x4e30f442,                                 //  fmax          v2.4s, v2.4s, v16.4s
+    0x4e30f463,                                 //  fmax          v3.4s, v3.4s, v16.4s
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_clamp_1[] = {
+    0x4d40c850,                                 //  ld1r          {v16.4s}, [x2]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x4eb0f400,                                 //  fmin          v0.4s, v0.4s, v16.4s
+    0x4eb0f421,                                 //  fmin          v1.4s, v1.4s, v16.4s
+    0x4eb0f442,                                 //  fmin          v2.4s, v2.4s, v16.4s
+    0x4eb0f463,                                 //  fmin          v3.4s, v3.4s, v16.4s
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_clamp_a[] = {
+    0x4d40c850,                                 //  ld1r          {v16.4s}, [x2]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x4eb0f463,                                 //  fmin          v3.4s, v3.4s, v16.4s
+    0x4ea3f400,                                 //  fmin          v0.4s, v0.4s, v3.4s
+    0x4ea3f421,                                 //  fmin          v1.4s, v1.4s, v3.4s
+    0x4ea3f442,                                 //  fmin          v2.4s, v2.4s, v3.4s
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_swap[] = {
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x4ea31c70,                                 //  mov           v16.16b, v3.16b
+    0x4ea21c51,                                 //  mov           v17.16b, v2.16b
+    0x4ea11c32,                                 //  mov           v18.16b, v1.16b
+    0x4ea01c13,                                 //  mov           v19.16b, v0.16b
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x4ea41c80,                                 //  mov           v0.16b, v4.16b
+    0x4ea51ca1,                                 //  mov           v1.16b, v5.16b
+    0x4ea61cc2,                                 //  mov           v2.16b, v6.16b
+    0x4ea71ce3,                                 //  mov           v3.16b, v7.16b
+    0x4eb31e64,                                 //  mov           v4.16b, v19.16b
+    0x4eb21e45,                                 //  mov           v5.16b, v18.16b
+    0x4eb11e26,                                 //  mov           v6.16b, v17.16b
+    0x4eb01e07,                                 //  mov           v7.16b, v16.16b
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_move_src_dst[] = {
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x4ea01c04,                                 //  mov           v4.16b, v0.16b
+    0x4ea11c25,                                 //  mov           v5.16b, v1.16b
+    0x4ea21c46,                                 //  mov           v6.16b, v2.16b
+    0x4ea31c67,                                 //  mov           v7.16b, v3.16b
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_move_dst_src[] = {
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x4ea41c80,                                 //  mov           v0.16b, v4.16b
+    0x4ea51ca1,                                 //  mov           v1.16b, v5.16b
+    0x4ea61cc2,                                 //  mov           v2.16b, v6.16b
+    0x4ea71ce3,                                 //  mov           v3.16b, v7.16b
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_premul[] = {
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x6e23dc00,                                 //  fmul          v0.4s, v0.4s, v3.4s
+    0x6e23dc21,                                 //  fmul          v1.4s, v1.4s, v3.4s
+    0x6e23dc42,                                 //  fmul          v2.4s, v2.4s, v3.4s
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_unpremul[] = {
+    0x4d40c850,                                 //  ld1r          {v16.4s}, [x2]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x4ea0d871,                                 //  fcmeq         v17.4s, v3.4s, #0.0
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0x6e23fe10,                                 //  fdiv          v16.4s, v16.4s, v3.4s
+    0x4e711e10,                                 //  bic           v16.16b, v16.16b, v17.16b
+    0x6e20de00,                                 //  fmul          v0.4s, v16.4s, v0.4s
+    0x6e21de01,                                 //  fmul          v1.4s, v16.4s, v1.4s
+    0x6e22de02,                                 //  fmul          v2.4s, v16.4s, v2.4s
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_from_srgb[] = {
+    0x9100e048,                                 //  add           x8, x2, #0x38
+    0x4d40c910,                                 //  ld1r          {v16.4s}, [x8]
+    0x9100d048,                                 //  add           x8, x2, #0x34
+    0x4d40c911,                                 //  ld1r          {v17.4s}, [x8]
+    0x2d47cc52,                                 //  ldp           s18, s19, [x2,#60]
+    0x6e22dc54,                                 //  fmul          v20.4s, v2.4s, v2.4s
+    0x4eb01e15,                                 //  mov           v21.16b, v16.16b
+    0x4eb01e17,                                 //  mov           v23.16b, v16.16b
+    0x4f921050,                                 //  fmla          v16.4s, v2.4s, v18.s[0]
+    0x4eb11e36,                                 //  mov           v22.16b, v17.16b
+    0x4eb11e38,                                 //  mov           v24.16b, v17.16b
+    0x4e34ce11,                                 //  fmla          v17.4s, v16.4s, v20.4s
+    0x6e20dc10,                                 //  fmul          v16.4s, v0.4s, v0.4s
+    0x91011048,                                 //  add           x8, x2, #0x44
+    0x4f921015,                                 //  fmla          v21.4s, v0.4s, v18.s[0]
+    0x4e30ceb6,                                 //  fmla          v22.4s, v21.4s, v16.4s
+    0x4d40c910,                                 //  ld1r          {v16.4s}, [x8]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x6e21dc34,                                 //  fmul          v20.4s, v1.4s, v1.4s
+    0x4f921037,                                 //  fmla          v23.4s, v1.4s, v18.s[0]
+    0x4f939015,                                 //  fmul          v21.4s, v0.4s, v19.s[0]
+    0x4f939032,                                 //  fmul          v18.4s, v1.4s, v19.s[0]
+    0x4f939053,                                 //  fmul          v19.4s, v2.4s, v19.s[0]
+    0x6ea0e600,                                 //  fcmgt         v0.4s, v16.4s, v0.4s
+    0x6ea1e601,                                 //  fcmgt         v1.4s, v16.4s, v1.4s
+    0x6ea2e602,                                 //  fcmgt         v2.4s, v16.4s, v2.4s
+    0x4e34cef8,                                 //  fmla          v24.4s, v23.4s, v20.4s
+    0x6e711e62,                                 //  bsl           v2.16b, v19.16b, v17.16b
+    0x6e761ea0,                                 //  bsl           v0.16b, v21.16b, v22.16b
+    0x6e781e41,                                 //  bsl           v1.16b, v18.16b, v24.16b
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_to_srgb[] = {
+    0x6ea1d811,                                 //  frsqrte       v17.4s, v0.4s
+    0x6ea1d835,                                 //  frsqrte       v21.4s, v1.4s
+    0x6e31de37,                                 //  fmul          v23.4s, v17.4s, v17.4s
+    0x6ea1d856,                                 //  frsqrte       v22.4s, v2.4s
+    0x6e35deb9,                                 //  fmul          v25.4s, v21.4s, v21.4s
+    0x4eb7fc17,                                 //  frsqrts       v23.4s, v0.4s, v23.4s
+    0x91015048,                                 //  add           x8, x2, #0x54
+    0x6e36deda,                                 //  fmul          v26.4s, v22.4s, v22.4s
+    0x4eb9fc39,                                 //  frsqrts       v25.4s, v1.4s, v25.4s
+    0x6e37de31,                                 //  fmul          v17.4s, v17.4s, v23.4s
+    0x2d494052,                                 //  ldp           s18, s16, [x2,#72]
+    0x4d40c914,                                 //  ld1r          {v20.4s}, [x8]
+    0x4ebafc5a,                                 //  frsqrts       v26.4s, v2.4s, v26.4s
+    0x6e39deb5,                                 //  fmul          v21.4s, v21.4s, v25.4s
+    0x4ea1da37,                                 //  frecpe        v23.4s, v17.4s
+    0xbd405053,                                 //  ldr           s19, [x2,#80]
+    0x91016048,                                 //  add           x8, x2, #0x58
+    0x6e3aded6,                                 //  fmul          v22.4s, v22.4s, v26.4s
+    0x4ea1dabb,                                 //  frecpe        v27.4s, v21.4s
+    0x4e37fe3d,                                 //  frecps        v29.4s, v17.4s, v23.4s
+    0x4d40c918,                                 //  ld1r          {v24.4s}, [x8]
+    0x4ea1dadc,                                 //  frecpe        v28.4s, v22.4s
+    0x6e3ddef7,                                 //  fmul          v23.4s, v23.4s, v29.4s
+    0x4e3bfebd,                                 //  frecps        v29.4s, v21.4s, v27.4s
+    0x6e3ddf7b,                                 //  fmul          v27.4s, v27.4s, v29.4s
+    0x4e3cfedd,                                 //  frecps        v29.4s, v22.4s, v28.4s
+    0x6e3ddf9c,                                 //  fmul          v28.4s, v28.4s, v29.4s
+    0x4eb41e9d,                                 //  mov           v29.16b, v20.16b
+    0x6ea1da39,                                 //  frsqrte       v25.4s, v17.4s
+    0x4f9312fd,                                 //  fmla          v29.4s, v23.4s, v19.s[0]
+    0x4eb41e97,                                 //  mov           v23.16b, v20.16b
+    0x4f92901a,                                 //  fmul          v26.4s, v0.4s, v18.s[0]
+    0x4f931377,                                 //  fmla          v23.4s, v27.4s, v19.s[0]
+    0x4f931394,                                 //  fmla          v20.4s, v28.4s, v19.s[0]
+    0x4f929033,                                 //  fmul          v19.4s, v1.4s, v18.s[0]
+    0x4f929052,                                 //  fmul          v18.4s, v2.4s, v18.s[0]
+    0x6ea0e700,                                 //  fcmgt         v0.4s, v24.4s, v0.4s
+    0x6ea1e701,                                 //  fcmgt         v1.4s, v24.4s, v1.4s
+    0x6ea2e702,                                 //  fcmgt         v2.4s, v24.4s, v2.4s
+    0x6e39df38,                                 //  fmul          v24.4s, v25.4s, v25.4s
+    0x6ea1dabb,                                 //  frsqrte       v27.4s, v21.4s
+    0x4eb8fe31,                                 //  frsqrts       v17.4s, v17.4s, v24.4s
+    0x6ea1dadc,                                 //  frsqrte       v28.4s, v22.4s
+    0x6e3bdf78,                                 //  fmul          v24.4s, v27.4s, v27.4s
+    0x6e31df31,                                 //  fmul          v17.4s, v25.4s, v17.4s
+    0x4eb8feb5,                                 //  frsqrts       v21.4s, v21.4s, v24.4s
+    0x6e3cdf98,                                 //  fmul          v24.4s, v28.4s, v28.4s
+    0x4f90123d,                                 //  fmla          v29.4s, v17.4s, v16.s[0]
+    0x4d40c851,                                 //  ld1r          {v17.4s}, [x2]
+    0x4eb8fed6,                                 //  frsqrts       v22.4s, v22.4s, v24.4s
+    0x6e35df75,                                 //  fmul          v21.4s, v27.4s, v21.4s
+    0x6e36df96,                                 //  fmul          v22.4s, v28.4s, v22.4s
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x4f9012b7,                                 //  fmla          v23.4s, v21.4s, v16.s[0]
+    0x4f9012d4,                                 //  fmla          v20.4s, v22.4s, v16.s[0]
+    0x4ebdf630,                                 //  fmin          v16.4s, v17.4s, v29.4s
+    0x4eb7f635,                                 //  fmin          v21.4s, v17.4s, v23.4s
+    0x4eb4f631,                                 //  fmin          v17.4s, v17.4s, v20.4s
+    0x6e701f40,                                 //  bsl           v0.16b, v26.16b, v16.16b
+    0x6e751e61,                                 //  bsl           v1.16b, v19.16b, v21.16b
+    0x6e711e42,                                 //  bsl           v2.16b, v18.16b, v17.16b
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_scale_u8[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0xbd400c51,                                 //  ldr           s17, [x2,#12]
+    0xf9400108,                                 //  ldr           x8, [x8]
+    0x8b000108,                                 //  add           x8, x8, x0
+    0x39400109,                                 //  ldrb          w9, [x8]
+    0x3940050a,                                 //  ldrb          w10, [x8,#1]
+    0x3940090b,                                 //  ldrb          w11, [x8,#2]
+    0x39400d08,                                 //  ldrb          w8, [x8,#3]
+    0x4e021d30,                                 //  mov           v16.h[0], w9
+    0x4e061d50,                                 //  mov           v16.h[1], w10
+    0x4e0a1d70,                                 //  mov           v16.h[2], w11
+    0x4e0e1d10,                                 //  mov           v16.h[3], w8
+    0x2f07b7f0,                                 //  bic           v16.4h, #0xff, lsl #8
+    0x2f10a610,                                 //  uxtl          v16.4s, v16.4h
+    0x6e21da10,                                 //  ucvtf         v16.4s, v16.4s
+    0x4f919210,                                 //  fmul          v16.4s, v16.4s, v17.s[0]
+    0x6e20de00,                                 //  fmul          v0.4s, v16.4s, v0.4s
+    0x6e21de01,                                 //  fmul          v1.4s, v16.4s, v1.4s
+    0x6e22de02,                                 //  fmul          v2.4s, v16.4s, v2.4s
+    0x6e23de03,                                 //  fmul          v3.4s, v16.4s, v3.4s
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_load_tables[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0x9100404b,                                 //  add           x11, x2, #0x10
+    0x4d40c960,                                 //  ld1r          {v0.4s}, [x11]
+    0xd37ef409,                                 //  lsl           x9, x0, #2
+    0xa9402d0a,                                 //  ldp           x10, x11, [x8]
+    0x3ce96942,                                 //  ldr           q2, [x10,x9]
+    0xa9412109,                                 //  ldp           x9, x8, [x8,#16]
+    0x4e221c01,                                 //  and           v1.16b, v0.16b, v2.16b
+    0x0e143c2c,                                 //  mov           w12, v1.s[2]
+    0xbc6c5971,                                 //  ldr           s17, [x11,w12,uxtw #2]
+    0x1e26002c,                                 //  fmov          w12, s1
+    0x6f380443,                                 //  ushr          v3.4s, v2.4s, #8
+    0x6f300450,                                 //  ushr          v16.4s, v2.4s, #16
+    0x8b2c496c,                                 //  add           x12, x11, w12, uxtw #2
+    0x0e0c3c2a,                                 //  mov           w10, v1.s[1]
+    0x0e1c3c2d,                                 //  mov           w13, v1.s[3]
+    0x4e231c01,                                 //  and           v1.16b, v0.16b, v3.16b
+    0x4e301c03,                                 //  and           v3.16b, v0.16b, v16.16b
+    0x0d408180,                                 //  ld1           {v0.s}[0], [x12]
+    0x0e143c2c,                                 //  mov           w12, v1.s[2]
+    0xbc6c5932,                                 //  ldr           s18, [x9,w12,uxtw #2]
+    0x1e26002c,                                 //  fmov          w12, s1
+    0x8b2a496a,                                 //  add           x10, x11, w10, uxtw #2
+    0xbc6d5970,                                 //  ldr           s16, [x11,w13,uxtw #2]
+    0x0e0c3c2b,                                 //  mov           w11, v1.s[1]
+    0x0e1c3c2d,                                 //  mov           w13, v1.s[3]
+    0x8b2c492c,                                 //  add           x12, x9, w12, uxtw #2
+    0xbc6d5933,                                 //  ldr           s19, [x9,w13,uxtw #2]
+    0x0e0c3c6d,                                 //  mov           w13, v3.s[1]
+    0x8b2b4929,                                 //  add           x9, x9, w11, uxtw #2
+    0x0e143c6b,                                 //  mov           w11, v3.s[2]
+    0x0d408181,                                 //  ld1           {v1.s}[0], [x12]
+    0x0e1c3c6c,                                 //  mov           w12, v3.s[3]
+    0x0d409140,                                 //  ld1           {v0.s}[1], [x10]
+    0x1e26006a,                                 //  fmov          w10, s3
+    0xbd400c43,                                 //  ldr           s3, [x2,#12]
+    0x6f280442,                                 //  ushr          v2.4s, v2.4s, #24
+    0x4e21d842,                                 //  scvtf         v2.4s, v2.4s
+    0x8b2a490a,                                 //  add           x10, x8, w10, uxtw #2
+    0x4f839043,                                 //  fmul          v3.4s, v2.4s, v3.s[0]
+    0x0d408142,                                 //  ld1           {v2.s}[0], [x10]
+    0x8b2d490a,                                 //  add           x10, x8, w13, uxtw #2
+    0x6e140620,                                 //  mov           v0.s[2], v17.s[0]
+    0xbc6b5911,                                 //  ldr           s17, [x8,w11,uxtw #2]
+    0x0d409121,                                 //  ld1           {v1.s}[1], [x9]
+    0x0d409142,                                 //  ld1           {v2.s}[1], [x10]
+    0x6e1c0600,                                 //  mov           v0.s[3], v16.s[0]
+    0xbc6c5910,                                 //  ldr           s16, [x8,w12,uxtw #2]
+    0x6e140641,                                 //  mov           v1.s[2], v18.s[0]
+    0x6e140622,                                 //  mov           v2.s[2], v17.s[0]
+    0x6e1c0661,                                 //  mov           v1.s[3], v19.s[0]
+    0x6e1c0602,                                 //  mov           v2.s[3], v16.s[0]
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_load_8888[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0xd37ef409,                                 //  lsl           x9, x0, #2
+    0xbd400c42,                                 //  ldr           s2, [x2,#12]
+    0xf9400108,                                 //  ldr           x8, [x8]
+    0x3ce96900,                                 //  ldr           q0, [x8,x9]
+    0x91004048,                                 //  add           x8, x2, #0x10
+    0x4d40c901,                                 //  ld1r          {v1.4s}, [x8]
+    0x6f380410,                                 //  ushr          v16.4s, v0.4s, #8
+    0x6f300411,                                 //  ushr          v17.4s, v0.4s, #16
+    0x4e201c23,                                 //  and           v3.16b, v1.16b, v0.16b
+    0x6f280400,                                 //  ushr          v0.4s, v0.4s, #24
+    0x4e301c30,                                 //  and           v16.16b, v1.16b, v16.16b
+    0x4e311c21,                                 //  and           v1.16b, v1.16b, v17.16b
+    0x4e21d863,                                 //  scvtf         v3.4s, v3.4s
+    0x4e21d811,                                 //  scvtf         v17.4s, v0.4s
+    0x4e21da10,                                 //  scvtf         v16.4s, v16.4s
+    0x4e21d832,                                 //  scvtf         v18.4s, v1.4s
+    0x4f829060,                                 //  fmul          v0.4s, v3.4s, v2.s[0]
+    0x4f829223,                                 //  fmul          v3.4s, v17.4s, v2.s[0]
+    0x4f829201,                                 //  fmul          v1.4s, v16.4s, v2.s[0]
+    0x4f829242,                                 //  fmul          v2.4s, v18.4s, v2.s[0]
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_store_8888[] = {
+    0xbd400850,                                 //  ldr           s16, [x2,#8]
+    0xf9400028,                                 //  ldr           x8, [x1]
+    0xd37ef409,                                 //  lsl           x9, x0, #2
+    0x4f909032,                                 //  fmul          v18.4s, v1.4s, v16.s[0]
+    0x4f909011,                                 //  fmul          v17.4s, v0.4s, v16.s[0]
+    0x6e21aa52,                                 //  fcvtnu        v18.4s, v18.4s
+    0x6e21aa31,                                 //  fcvtnu        v17.4s, v17.4s
+    0x4f285652,                                 //  shl           v18.4s, v18.4s, #8
+    0x4eb11e51,                                 //  orr           v17.16b, v18.16b, v17.16b
+    0x4f909052,                                 //  fmul          v18.4s, v2.4s, v16.s[0]
+    0xf9400108,                                 //  ldr           x8, [x8]
+    0x4f909070,                                 //  fmul          v16.4s, v3.4s, v16.s[0]
+    0x6e21aa52,                                 //  fcvtnu        v18.4s, v18.4s
+    0x6e21aa10,                                 //  fcvtnu        v16.4s, v16.4s
+    0x4f305652,                                 //  shl           v18.4s, v18.4s, #16
+    0x4eb21e31,                                 //  orr           v17.16b, v17.16b, v18.16b
+    0x4f385610,                                 //  shl           v16.4s, v16.4s, #24
+    0x4eb01e30,                                 //  orr           v16.16b, v17.16b, v16.16b
+    0x3ca96910,                                 //  str           q16, [x8,x9]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_load_f16[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0xf9400108,                                 //  ldr           x8, [x8]
+    0x8b000d08,                                 //  add           x8, x8, x0, lsl #3
+    0x0c400510,                                 //  ld4           {v16.4h-v19.4h}, [x8]
+    0x0e217a00,                                 //  fcvtl         v0.4s, v16.4h
+    0x0e217a21,                                 //  fcvtl         v1.4s, v17.4h
+    0x0e217a42,                                 //  fcvtl         v2.4s, v18.4h
+    0x0e217a63,                                 //  fcvtl         v3.4s, v19.4h
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_store_f16[] = {
+    0xf9400028,                                 //  ldr           x8, [x1]
+    0x0e216810,                                 //  fcvtn         v16.4h, v0.4s
+    0x0e216831,                                 //  fcvtn         v17.4h, v1.4s
+    0x0e216852,                                 //  fcvtn         v18.4h, v2.4s
+    0xf9400108,                                 //  ldr           x8, [x8]
+    0x0e216873,                                 //  fcvtn         v19.4h, v3.4s
+    0x8b000d08,                                 //  add           x8, x8, x0, lsl #3
+    0x0c000510,                                 //  st4           {v16.4h-v19.4h}, [x8]
+    0xf9400423,                                 //  ldr           x3, [x1,#8]
+    0x91004021,                                 //  add           x1, x1, #0x10
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_clamp_x[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0x6f07e7f0,                                 //  movi          v16.2d, #0xffffffffffffffff
+    0x6f00e411,                                 //  movi          v17.2d, #0x0
+    0x4d40c912,                                 //  ld1r          {v18.4s}, [x8]
+    0x4eb08650,                                 //  add           v16.4s, v18.4s, v16.4s
+    0x4eb0f400,                                 //  fmin          v0.4s, v0.4s, v16.4s
+    0x4e20f620,                                 //  fmax          v0.4s, v17.4s, v0.4s
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_clamp_y[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0x6f07e7f0,                                 //  movi          v16.2d, #0xffffffffffffffff
+    0x6f00e411,                                 //  movi          v17.2d, #0x0
+    0x4d40c912,                                 //  ld1r          {v18.4s}, [x8]
+    0x4eb08650,                                 //  add           v16.4s, v18.4s, v16.4s
+    0x4eb0f421,                                 //  fmin          v1.4s, v1.4s, v16.4s
+    0x4e21f621,                                 //  fmax          v1.4s, v17.4s, v1.4s
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_matrix_2x3[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0xaa0803e9,                                 //  mov           x9, x8
+    0x9100410a,                                 //  add           x10, x8, #0x10
+    0x4ddfc932,                                 //  ld1r          {v18.4s}, [x9], #4
+    0x4d40c950,                                 //  ld1r          {v16.4s}, [x10]
+    0x2d415113,                                 //  ldp           s19, s20, [x8,#8]
+    0x9100510a,                                 //  add           x10, x8, #0x14
+    0x4d40c951,                                 //  ld1r          {v17.4s}, [x10]
+    0x4f931030,                                 //  fmla          v16.4s, v1.4s, v19.s[0]
+    0xbd400133,                                 //  ldr           s19, [x9]
+    0x4f941031,                                 //  fmla          v17.4s, v1.4s, v20.s[0]
+    0x4e20ce50,                                 //  fmla          v16.4s, v18.4s, v0.4s
+    0x4f931011,                                 //  fmla          v17.4s, v0.4s, v19.s[0]
+    0x4eb01e00,                                 //  mov           v0.16b, v16.16b
+    0x4eb11e21,                                 //  mov           v1.16b, v17.16b
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_matrix_3x4[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0xaa0803e9,                                 //  mov           x9, x8
+    0x9100910a,                                 //  add           x10, x8, #0x24
+    0x4ddfc933,                                 //  ld1r          {v19.4s}, [x9], #4
+    0x4d40c950,                                 //  ld1r          {v16.4s}, [x10]
+    0x9100a10a,                                 //  add           x10, x8, #0x28
+    0x4d40c951,                                 //  ld1r          {v17.4s}, [x10]
+    0x9100b10a,                                 //  add           x10, x8, #0x2c
+    0x2d435514,                                 //  ldp           s20, s21, [x8,#24]
+    0xbd402116,                                 //  ldr           s22, [x8,#32]
+    0x4d40c952,                                 //  ld1r          {v18.4s}, [x10]
+    0x4f941050,                                 //  fmla          v16.4s, v2.4s, v20.s[0]
+    0x2d415d14,                                 //  ldp           s20, s23, [x8,#8]
+    0x4f951051,                                 //  fmla          v17.4s, v2.4s, v21.s[0]
+    0x4f961052,                                 //  fmla          v18.4s, v2.4s, v22.s[0]
+    0x2d425502,                                 //  ldp           s2, s21, [x8,#16]
+    0x4f971030,                                 //  fmla          v16.4s, v1.4s, v23.s[0]
+    0x4e20ce70,                                 //  fmla          v16.4s, v19.4s, v0.4s
+    0x4f821031,                                 //  fmla          v17.4s, v1.4s, v2.s[0]
+    0xbd400122,                                 //  ldr           s2, [x9]
+    0x4f951032,                                 //  fmla          v18.4s, v1.4s, v21.s[0]
+    0x4f941012,                                 //  fmla          v18.4s, v0.4s, v20.s[0]
+    0x4f821011,                                 //  fmla          v17.4s, v0.4s, v2.s[0]
+    0x4eb01e00,                                 //  mov           v0.16b, v16.16b
+    0x4eb11e21,                                 //  mov           v1.16b, v17.16b
+    0x4eb21e42,                                 //  mov           v2.16b, v18.16b
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int aarch64_sk_linear_gradient_2stops[] = {
+    0xa8c10c28,                                 //  ldp           x8, x3, [x1],#16
+    0xad404503,                                 //  ldp           q3, q17, [x8]
+    0x4e040470,                                 //  dup           v16.4s, v3.s[0]
+    0x4e0c0461,                                 //  dup           v1.4s, v3.s[1]
+    0x4e140462,                                 //  dup           v2.4s, v3.s[2]
+    0x4e1c0463,                                 //  dup           v3.4s, v3.s[3]
+    0x4f911010,                                 //  fmla          v16.4s, v0.4s, v17.s[0]
+    0x4fb11001,                                 //  fmla          v1.4s, v0.4s, v17.s[1]
+    0x4f911802,                                 //  fmla          v2.4s, v0.4s, v17.s[2]
+    0x4fb11803,                                 //  fmla          v3.4s, v0.4s, v17.s[3]
+    0x4eb01e00,                                 //  mov           v0.16b, v16.16b
+    0xd61f0060,                                 //  br            x3
+};
+static const unsigned int armv7_sk_just_return[] = {
+    0xe12fff1e,                                 //  bx            lr
+};
+static const unsigned int armv7_sk_seed_shader[] = {
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xee800b90,                                 //  vdup.32       d16, r0
+    0xf3fb0620,                                 //  vcvt.f32.s32  d16, d16
+    0xedd23b05,                                 //  vldr          d19, [r2, #20]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xf2803010,                                 //  vmov.i32      d3, #0
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xe2823004,                                 //  add           r3, r2, #4
+    0xf3fb1621,                                 //  vcvt.f32.s32  d17, d17
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf4e32c9f,                                 //  vld1.32       {d18[]}, [r3 :32]
+    0xf2804010,                                 //  vmov.i32      d4, #0
+    0xf2400da2,                                 //  vadd.f32      d16, d16, d18
+    0xf2805010,                                 //  vmov.i32      d5, #0
+    0xf4a22c9f,                                 //  vld1.32       {d2[]}, [r2 :32]
+    0xf2011da2,                                 //  vadd.f32      d1, d17, d18
+    0xf2806010,                                 //  vmov.i32      d6, #0
+    0xf2030da0,                                 //  vadd.f32      d0, d19, d16
+    0xf2807010,                                 //  vmov.i32      d7, #0
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_constant_color[] = {
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf4630a0f,                                 //  vld1.8        {d16-d17}, [r3]
+    0xf3b40c20,                                 //  vdup.32       d0, d16[0]
+    0xf3bc1c20,                                 //  vdup.32       d1, d16[1]
+    0xf3b42c21,                                 //  vdup.32       d2, d17[0]
+    0xf3bc3c21,                                 //  vdup.32       d3, d17[1]
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_clear[] = {
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf2800010,                                 //  vmov.i32      d0, #0
+    0xf2801010,                                 //  vmov.i32      d1, #0
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf2802010,                                 //  vmov.i32      d2, #0
+    0xf2803010,                                 //  vmov.i32      d3, #0
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_plus_[] = {
+    0xf2000d04,                                 //  vadd.f32      d0, d0, d4
+    0xe2813008,                                 //  add           r3, r1, #8
+    0xf2011d05,                                 //  vadd.f32      d1, d1, d5
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xf2022d06,                                 //  vadd.f32      d2, d2, d6
+    0xe1a01003,                                 //  mov           r1, r3
+    0xf2033d07,                                 //  vadd.f32      d3, d3, d7
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_srcover[] = {
+    0xf4e20c9f,                                 //  vld1.32       {d16[]}, [r2 :32]
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf2600d83,                                 //  vsub.f32      d16, d16, d3
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf2040c30,                                 //  vfma.f32      d0, d4, d16
+    0xf2051c30,                                 //  vfma.f32      d1, d5, d16
+    0xf2062c30,                                 //  vfma.f32      d2, d6, d16
+    0xf2073c30,                                 //  vfma.f32      d3, d7, d16
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_dstover[] = {
+    0xf4e20c9f,                                 //  vld1.32       {d16[]}, [r2 :32]
+    0xf2651115,                                 //  vorr          d17, d5, d5
+    0xf2604d87,                                 //  vsub.f32      d20, d16, d7
+    0xf2640114,                                 //  vorr          d16, d4, d4
+    0xf2662116,                                 //  vorr          d18, d6, d6
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf2673117,                                 //  vorr          d19, d7, d7
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf2400c34,                                 //  vfma.f32      d16, d0, d20
+    0xf2411c34,                                 //  vfma.f32      d17, d1, d20
+    0xf2422c34,                                 //  vfma.f32      d18, d2, d20
+    0xf2433c34,                                 //  vfma.f32      d19, d3, d20
+    0xf22001b0,                                 //  vorr          d0, d16, d16
+    0xf22111b1,                                 //  vorr          d1, d17, d17
+    0xf22221b2,                                 //  vorr          d2, d18, d18
+    0xf22331b3,                                 //  vorr          d3, d19, d19
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_clamp_0[] = {
+    0xf2c00010,                                 //  vmov.i32      d16, #0
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf2000f20,                                 //  vmax.f32      d0, d0, d16
+    0xf2011f20,                                 //  vmax.f32      d1, d1, d16
+    0xf2022f20,                                 //  vmax.f32      d2, d2, d16
+    0xf2033f20,                                 //  vmax.f32      d3, d3, d16
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_clamp_1[] = {
+    0xf4e20c9f,                                 //  vld1.32       {d16[]}, [r2 :32]
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf2200f20,                                 //  vmin.f32      d0, d0, d16
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf2211f20,                                 //  vmin.f32      d1, d1, d16
+    0xf2222f20,                                 //  vmin.f32      d2, d2, d16
+    0xf2233f20,                                 //  vmin.f32      d3, d3, d16
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_clamp_a[] = {
+    0xf4e20c9f,                                 //  vld1.32       {d16[]}, [r2 :32]
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf2233f20,                                 //  vmin.f32      d3, d3, d16
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf2200f03,                                 //  vmin.f32      d0, d0, d3
+    0xf2211f03,                                 //  vmin.f32      d1, d1, d3
+    0xf2222f03,                                 //  vmin.f32      d2, d2, d3
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_swap[] = {
+    0xeef00b43,                                 //  vmov.f64      d16, d3
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xeef01b42,                                 //  vmov.f64      d17, d2
+    0xeef02b41,                                 //  vmov.f64      d18, d1
+    0xeef03b40,                                 //  vmov.f64      d19, d0
+    0xeeb00b44,                                 //  vmov.f64      d0, d4
+    0xeeb01b45,                                 //  vmov.f64      d1, d5
+    0xeeb02b46,                                 //  vmov.f64      d2, d6
+    0xeeb03b47,                                 //  vmov.f64      d3, d7
+    0xeeb04b63,                                 //  vmov.f64      d4, d19
+    0xeeb05b62,                                 //  vmov.f64      d5, d18
+    0xeeb06b61,                                 //  vmov.f64      d6, d17
+    0xeeb07b60,                                 //  vmov.f64      d7, d16
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_move_src_dst[] = {
+    0xeeb04b40,                                 //  vmov.f64      d4, d0
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xeeb05b41,                                 //  vmov.f64      d5, d1
+    0xeeb06b42,                                 //  vmov.f64      d6, d2
+    0xeeb07b43,                                 //  vmov.f64      d7, d3
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_move_dst_src[] = {
+    0xeeb00b44,                                 //  vmov.f64      d0, d4
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xeeb01b45,                                 //  vmov.f64      d1, d5
+    0xeeb02b46,                                 //  vmov.f64      d2, d6
+    0xeeb03b47,                                 //  vmov.f64      d3, d7
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_premul[] = {
+    0xf3000d13,                                 //  vmul.f32      d0, d0, d3
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf3011d13,                                 //  vmul.f32      d1, d1, d3
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf3022d13,                                 //  vmul.f32      d2, d2, d3
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_unpremul[] = {
+    0xed2d8b04,                                 //  vpush         {d8-d9}
+    0xed928a00,                                 //  vldr          s16, [r2]
+    0xf2c00010,                                 //  vmov.i32      d16, #0
+    0xf3f91503,                                 //  vceq.f32      d17, d3, #0
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xeec89a23,                                 //  vdiv.f32      s19, s16, s7
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xee889a03,                                 //  vdiv.f32      s18, s16, s6
+    0xf3501199,                                 //  vbsl          d17, d16, d9
+    0xf3010d90,                                 //  vmul.f32      d0, d17, d0
+    0xf3011d91,                                 //  vmul.f32      d1, d17, d1
+    0xf3012d92,                                 //  vmul.f32      d2, d17, d2
+    0xecbd8b04,                                 //  vpop          {d8-d9}
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_from_srgb[] = {
+    0xed2d8b02,                                 //  vpush         {d8}
+    0xe282303c,                                 //  add           r3, r2, #60
+    0xed928a10,                                 //  vldr          s16, [r2, #64]
+    0xf3402d10,                                 //  vmul.f32      d18, d0, d0
+    0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
+    0xe2823038,                                 //  add           r3, r2, #56
+    0xf3413d11,                                 //  vmul.f32      d19, d1, d1
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xe2823044,                                 //  add           r3, r2, #68
+    0xf26141b1,                                 //  vorr          d20, d17, d17
+    0xf26171b1,                                 //  vorr          d23, d17, d17
+    0xf4e38c9f,                                 //  vld1.32       {d24[]}, [r3 :32]
+    0xf2404c30,                                 //  vfma.f32      d20, d0, d16
+    0xe2823034,                                 //  add           r3, r2, #52
+    0xf2417c30,                                 //  vfma.f32      d23, d1, d16
+    0xf2421c30,                                 //  vfma.f32      d17, d2, d16
+    0xf3425d12,                                 //  vmul.f32      d21, d2, d2
+    0xf2e16948,                                 //  vmul.f32      d22, d1, d8[0]
+    0xf2e00948,                                 //  vmul.f32      d16, d0, d8[0]
+    0xf2e29948,                                 //  vmul.f32      d25, d2, d8[0]
+    0xf3282e82,                                 //  vcgt.f32      d2, d24, d2
+    0xf3281e81,                                 //  vcgt.f32      d1, d24, d1
+    0xf3280e80,                                 //  vcgt.f32      d0, d24, d0
+    0xf4e38c9f,                                 //  vld1.32       {d24[]}, [r3 :32]
+    0xf268a1b8,                                 //  vorr          d26, d24, d24
+    0xf242acb4,                                 //  vfma.f32      d26, d18, d20
+    0xf26821b8,                                 //  vorr          d18, d24, d24
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf2432cb7,                                 //  vfma.f32      d18, d19, d23
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf2458cb1,                                 //  vfma.f32      d24, d21, d17
+    0xf31001ba,                                 //  vbsl          d0, d16, d26
+    0xf31611b2,                                 //  vbsl          d1, d22, d18
+    0xf31921b8,                                 //  vbsl          d2, d25, d24
+    0xecbd8b02,                                 //  vpop          {d8}
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_to_srgb[] = {
+    0xed2d8b02,                                 //  vpush         {d8}
+    0xf3fb0580,                                 //  vrsqrte.f32   d16, d0
+    0xe2823050,                                 //  add           r3, r2, #80
+    0xf3fb1581,                                 //  vrsqrte.f32   d17, d1
+    0xed928a12,                                 //  vldr          s16, [r2, #72]
+    0xf3fb2582,                                 //  vrsqrte.f32   d18, d2
+    0xf3403db0,                                 //  vmul.f32      d19, d16, d16
+    0xf3414db1,                                 //  vmul.f32      d20, d17, d17
+    0xf3425db2,                                 //  vmul.f32      d21, d18, d18
+    0xf2603f33,                                 //  vrsqrts.f32   d19, d0, d19
+    0xf2614f34,                                 //  vrsqrts.f32   d20, d1, d20
+    0xf2625f35,                                 //  vrsqrts.f32   d21, d2, d21
+    0xf3400db3,                                 //  vmul.f32      d16, d16, d19
+    0xf3411db4,                                 //  vmul.f32      d17, d17, d20
+    0xf3422db5,                                 //  vmul.f32      d18, d18, d21
+    0xf3fb3520,                                 //  vrecpe.f32    d19, d16
+    0xf3fb4521,                                 //  vrecpe.f32    d20, d17
+    0xf3fb6522,                                 //  vrecpe.f32    d22, d18
+    0xf3fb55a2,                                 //  vrsqrte.f32   d21, d18
+    0xf3fb75a0,                                 //  vrsqrte.f32   d23, d16
+    0xf3fb85a1,                                 //  vrsqrte.f32   d24, d17
+    0xf2409fb3,                                 //  vrecps.f32    d25, d16, d19
+    0xf241afb4,                                 //  vrecps.f32    d26, d17, d20
+    0xf242bfb6,                                 //  vrecps.f32    d27, d18, d22
+    0xf345cdb5,                                 //  vmul.f32      d28, d21, d21
+    0xf347ddb7,                                 //  vmul.f32      d29, d23, d23
+    0xf348edb8,                                 //  vmul.f32      d30, d24, d24
+    0xf2622fbc,                                 //  vrsqrts.f32   d18, d18, d28
+    0xf2600fbd,                                 //  vrsqrts.f32   d16, d16, d29
+    0xf2611fbe,                                 //  vrsqrts.f32   d17, d17, d30
+    0xf3433db9,                                 //  vmul.f32      d19, d19, d25
+    0xf4e39c9f,                                 //  vld1.32       {d25[]}, [r3 :32]
+    0xe2823054,                                 //  add           r3, r2, #84
+    0xf3444dba,                                 //  vmul.f32      d20, d20, d26
+    0xf3466dbb,                                 //  vmul.f32      d22, d22, d27
+    0xf4e3ac9f,                                 //  vld1.32       {d26[]}, [r3 :32]
+    0xe282304c,                                 //  add           r3, r2, #76
+    0xf26ab1ba,                                 //  vorr          d27, d26, d26
+    0xf249bcb3,                                 //  vfma.f32      d27, d25, d19
+    0xf26a31ba,                                 //  vorr          d19, d26, d26
+    0xf2493cb4,                                 //  vfma.f32      d19, d25, d20
+    0xf4e34c9f,                                 //  vld1.32       {d20[]}, [r3 :32]
+    0xf249acb6,                                 //  vfma.f32      d26, d25, d22
+    0xe2823058,                                 //  add           r3, r2, #88
+    0xf3452db2,                                 //  vmul.f32      d18, d21, d18
+    0xf3470db0,                                 //  vmul.f32      d16, d23, d16
+    0xf3481db1,                                 //  vmul.f32      d17, d24, d17
+    0xf2e05948,                                 //  vmul.f32      d21, d0, d8[0]
+    0xf244bcb0,                                 //  vfma.f32      d27, d20, d16
+    0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
+    0xf2443cb1,                                 //  vfma.f32      d19, d20, d17
+    0xf244acb2,                                 //  vfma.f32      d26, d20, d18
+    0xf4e24c9f,                                 //  vld1.32       {d20[]}, [r2 :32]
+    0xf2e11948,                                 //  vmul.f32      d17, d1, d8[0]
+    0xf2e22948,                                 //  vmul.f32      d18, d2, d8[0]
+    0xf3201e81,                                 //  vcgt.f32      d1, d16, d1
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xf3200e80,                                 //  vcgt.f32      d0, d16, d0
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf3202e82,                                 //  vcgt.f32      d2, d16, d2
+    0xf2640fab,                                 //  vmin.f32      d16, d20, d27
+    0xf2643fa3,                                 //  vmin.f32      d19, d20, d19
+    0xf2644faa,                                 //  vmin.f32      d20, d20, d26
+    0xf31501b0,                                 //  vbsl          d0, d21, d16
+    0xf31111b3,                                 //  vbsl          d1, d17, d19
+    0xf31221b4,                                 //  vbsl          d2, d18, d20
+    0xecbd8b02,                                 //  vpop          {d8}
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_scale_u8[] = {
+    0xed2d8b02,                                 //  vpush         {d8}
+    0xe24dd008,                                 //  sub           sp, sp, #8
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xe5933000,                                 //  ldr           r3, [r3]
+    0xe0833000,                                 //  add           r3, r3, r0
+    0xe1d330b0,                                 //  ldrh          r3, [r3]
+    0xe1cd30b4,                                 //  strh          r3, [sp, #4]
+    0xe28d3004,                                 //  add           r3, sp, #4
+    0xed928a03,                                 //  vldr          s16, [r2, #12]
+    0xf4e3041f,                                 //  vld1.16       {d16[0]}, [r3 :16]
+    0xf3c80a30,                                 //  vmovl.u8      q8, d16
+    0xf3d00a30,                                 //  vmovl.u16     q8, d16
+    0xf3fb06a0,                                 //  vcvt.f32.u32  d16, d16
+    0xf2e009c8,                                 //  vmul.f32      d16, d16, d8[0]
+    0xf3000d90,                                 //  vmul.f32      d0, d16, d0
+    0xf3001d91,                                 //  vmul.f32      d1, d16, d1
+    0xf3002d92,                                 //  vmul.f32      d2, d16, d2
+    0xf3003d93,                                 //  vmul.f32      d3, d16, d3
+    0xe28dd008,                                 //  add           sp, sp, #8
+    0xecbd8b02,                                 //  vpop          {d8}
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_load_tables[] = {
+    0xe92d48f0,                                 //  push          {r4, r5, r6, r7, fp, lr}
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xe2826010,                                 //  add           r6, r2, #16
+    0xed922a03,                                 //  vldr          s4, [r2, #12]
+    0xf4e60c9f,                                 //  vld1.32       {d16[]}, [r6 :32]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xe593e000,                                 //  ldr           lr, [r3]
+    0xe5934004,                                 //  ldr           r4, [r3, #4]
+    0xe08e6100,                                 //  add           r6, lr, r0, lsl #2
+    0xe5935008,                                 //  ldr           r5, [r3, #8]
+    0xe593700c,                                 //  ldr           r7, [r3, #12]
+    0xedd61b00,                                 //  vldr          d17, [r6]
+    0xf24021b1,                                 //  vand          d18, d16, d17
+    0xf3f03031,                                 //  vshr.u32      d19, d17, #16
+    0xee326b90,                                 //  vmov.32       r6, d18[1]
+    0xe0846106,                                 //  add           r6, r4, r6, lsl #2
+    0xedd60a00,                                 //  vldr          s1, [r6]
+    0xee126b90,                                 //  vmov.32       r6, d18[0]
+    0xf3f82031,                                 //  vshr.u32      d18, d17, #8
+    0xf24021b2,                                 //  vand          d18, d16, d18
+    0xf24001b3,                                 //  vand          d16, d16, d19
+    0xee103b90,                                 //  vmov.32       r3, d16[0]
+    0xe0846106,                                 //  add           r6, r4, r6, lsl #2
+    0xee304b90,                                 //  vmov.32       r4, d16[1]
+    0xf3e80031,                                 //  vshr.u32      d16, d17, #24
+    0xed960a00,                                 //  vldr          s0, [r6]
+    0xee326b90,                                 //  vmov.32       r6, d18[1]
+    0xf3fb0620,                                 //  vcvt.f32.s32  d16, d16
+    0xe0873103,                                 //  add           r3, r7, r3, lsl #2
+    0xf2a039c2,                                 //  vmul.f32      d3, d16, d2[0]
+    0xe0874104,                                 //  add           r4, r7, r4, lsl #2
+    0xedd42a00,                                 //  vldr          s5, [r4]
+    0xe0856106,                                 //  add           r6, r5, r6, lsl #2
+    0xed932a00,                                 //  vldr          s4, [r3]
+    0xedd61a00,                                 //  vldr          s3, [r6]
+    0xee126b90,                                 //  vmov.32       r6, d18[0]
+    0xe0856106,                                 //  add           r6, r5, r6, lsl #2
+    0xed961a00,                                 //  vldr          s2, [r6]
+    0xe8bd48f0,                                 //  pop           {r4, r5, r6, r7, fp, lr}
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_load_8888[] = {
+    0xe92d4800,                                 //  push          {fp, lr}
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xed922a03,                                 //  vldr          s4, [r2, #12]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xe593e000,                                 //  ldr           lr, [r3]
+    0xe2823010,                                 //  add           r3, r2, #16
+    0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
+    0xe08e3100,                                 //  add           r3, lr, r0, lsl #2
+    0xedd31b00,                                 //  vldr          d17, [r3]
+    0xf24021b1,                                 //  vand          d18, d16, d17
+    0xf3f83031,                                 //  vshr.u32      d19, d17, #8
+    0xf3e84031,                                 //  vshr.u32      d20, d17, #24
+    0xf3f01031,                                 //  vshr.u32      d17, d17, #16
+    0xf24031b3,                                 //  vand          d19, d16, d19
+    0xf24001b1,                                 //  vand          d16, d16, d17
+    0xf3fb2622,                                 //  vcvt.f32.s32  d18, d18
+    0xf3fb4624,                                 //  vcvt.f32.s32  d20, d20
+    0xf3fb1623,                                 //  vcvt.f32.s32  d17, d19
+    0xf3fb0620,                                 //  vcvt.f32.s32  d16, d16
+    0xf2a209c2,                                 //  vmul.f32      d0, d18, d2[0]
+    0xf2a439c2,                                 //  vmul.f32      d3, d20, d2[0]
+    0xf2a119c2,                                 //  vmul.f32      d1, d17, d2[0]
+    0xf2a029c2,                                 //  vmul.f32      d2, d16, d2[0]
+    0xe8bd4800,                                 //  pop           {fp, lr}
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_store_8888[] = {
+    0xe2823008,                                 //  add           r3, r2, #8
+    0xf2c3261f,                                 //  vmov.i32      d18, #1056964608
+    0xf2c3361f,                                 //  vmov.i32      d19, #1056964608
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xf2c3061f,                                 //  vmov.i32      d16, #1056964608
+    0xf2412c31,                                 //  vfma.f32      d18, d1, d17
+    0xf2423c31,                                 //  vfma.f32      d19, d2, d17
+    0xf2c3461f,                                 //  vmov.i32      d20, #1056964608
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xf2400c31,                                 //  vfma.f32      d16, d0, d17
+    0xf2434c31,                                 //  vfma.f32      d20, d3, d17
+    0xe5933000,                                 //  ldr           r3, [r3]
+    0xe0833100,                                 //  add           r3, r3, r0, lsl #2
+    0xf3fb17a2,                                 //  vcvt.u32.f32  d17, d18
+    0xf3fb27a3,                                 //  vcvt.u32.f32  d18, d19
+    0xf3fb07a0,                                 //  vcvt.u32.f32  d16, d16
+    0xf3fb37a4,                                 //  vcvt.u32.f32  d19, d20
+    0xf2e81531,                                 //  vshl.s32      d17, d17, #8
+    0xf2f02532,                                 //  vshl.s32      d18, d18, #16
+    0xf26101b0,                                 //  vorr          d16, d17, d16
+    0xf2f81533,                                 //  vshl.s32      d17, d19, #24
+    0xf26001b2,                                 //  vorr          d16, d16, d18
+    0xf26001b1,                                 //  vorr          d16, d16, d17
+    0xedc30b00,                                 //  vstr          d16, [r3]
+    0xe5913004,                                 //  ldr           r3, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xe12fff13,                                 //  bx            r3
+};
+static const unsigned int armv7_sk_load_f16[] = {
+    0xed2d8b04,                                 //  vpush         {d8-d9}
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xe5933000,                                 //  ldr           r3, [r3]
+    0xe0833180,                                 //  add           r3, r3, r0, lsl #3
+    0xf463084f,                                 //  vld2.16       {d16-d17}, [r3]
+    0xf3b62720,                                 //  vcvt.f32.f16  q1, d16
+    0xf3b68721,                                 //  vcvt.f32.f16  q4, d17
+    0xf2220112,                                 //  vorr          d0, d2, d2
+    0xeef00a43,                                 //  vmov.f32      s1, s6
+    0xf2281118,                                 //  vorr          d1, d8, d8
+    0xeeb03a62,                                 //  vmov.f32      s6, s5
+    0xeef01a49,                                 //  vmov.f32      s3, s18
+    0xeeb09a68,                                 //  vmov.f32      s18, s17
+    0xeeb02b43,                                 //  vmov.f64      d2, d3
+    0xeeb03b49,                                 //  vmov.f64      d3, d9
+    0xecbd8b04,                                 //  vpop          {d8-d9}
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_store_f16[] = {
+    0xeef00b41,                                 //  vmov.f64      d16, d1
+    0xf2631113,                                 //  vorr          d17, d3, d3
+    0xeef03b42,                                 //  vmov.f64      d19, d2
+    0xf2602110,                                 //  vorr          d18, d0, d0
+    0xf3fa00a1,                                 //  vtrn.32       d16, d17
+    0xf3f61620,                                 //  vcvt.f16.f32  d17, q8
+    0xf3fa20a3,                                 //  vtrn.32       d18, d19
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xf3f60622,                                 //  vcvt.f16.f32  d16, q9
+    0xe5933000,                                 //  ldr           r3, [r3]
+    0xe0833180,                                 //  add           r3, r3, r0, lsl #3
+    0xf443084f,                                 //  vst2.16       {d16-d17}, [r3]
+    0xe2813008,                                 //  add           r3, r1, #8
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe1a01003,                                 //  mov           r1, r3
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_clamp_x[] = {
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xf3c70e1f,                                 //  vmov.i8       d16, #255
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xf26108a0,                                 //  vadd.i32      d16, d17, d16
+    0xf2c01010,                                 //  vmov.i32      d17, #0
+    0xf2600f20,                                 //  vmin.f32      d16, d0, d16
+    0xf2010fa0,                                 //  vmax.f32      d0, d17, d16
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_clamp_y[] = {
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xf3c70e1f,                                 //  vmov.i8       d16, #255
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xf26108a0,                                 //  vadd.i32      d16, d17, d16
+    0xf2c01010,                                 //  vmov.i32      d17, #0
+    0xf2610f20,                                 //  vmin.f32      d16, d1, d16
+    0xf2011fa0,                                 //  vmax.f32      d1, d17, d16
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_matrix_2x3[] = {
+    0xe92d4800,                                 //  push          {fp, lr}
+    0xe591e000,                                 //  ldr           lr, [r1]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xe28e300c,                                 //  add           r3, lr, #12
+    0xf4e32c9f,                                 //  vld1.32       {d18[]}, [r3 :32]
+    0xe28e3008,                                 //  add           r3, lr, #8
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xe28e3010,                                 //  add           r3, lr, #16
+    0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
+    0xe28e3014,                                 //  add           r3, lr, #20
+    0xf2410c31,                                 //  vfma.f32      d16, d1, d17
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xe28e3004,                                 //  add           r3, lr, #4
+    0xf2411c32,                                 //  vfma.f32      d17, d1, d18
+    0xf4ee2c9f,                                 //  vld1.32       {d18[]}, [lr :32]
+    0xf4e33c9f,                                 //  vld1.32       {d19[]}, [r3 :32]
+    0xf2400c32,                                 //  vfma.f32      d16, d0, d18
+    0xf2401c33,                                 //  vfma.f32      d17, d0, d19
+    0xf22001b0,                                 //  vorr          d0, d16, d16
+    0xf22111b1,                                 //  vorr          d1, d17, d17
+    0xe8bd4800,                                 //  pop           {fp, lr}
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_matrix_3x4[] = {
+    0xe92d4800,                                 //  push          {fp, lr}
+    0xe591e000,                                 //  ldr           lr, [r1]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xe28e3020,                                 //  add           r3, lr, #32
+    0xf4e33c9f,                                 //  vld1.32       {d19[]}, [r3 :32]
+    0xe28e302c,                                 //  add           r3, lr, #44
+    0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
+    0xe28e301c,                                 //  add           r3, lr, #28
+    0xf2420c33,                                 //  vfma.f32      d16, d2, d19
+    0xf4e34c9f,                                 //  vld1.32       {d20[]}, [r3 :32]
+    0xe28e3018,                                 //  add           r3, lr, #24
+    0xf4e32c9f,                                 //  vld1.32       {d18[]}, [r3 :32]
+    0xe28e3024,                                 //  add           r3, lr, #36
+    0xf4e31c9f,                                 //  vld1.32       {d17[]}, [r3 :32]
+    0xe28e3028,                                 //  add           r3, lr, #40
+    0xf2421c32,                                 //  vfma.f32      d17, d2, d18
+    0xf4e32c9f,                                 //  vld1.32       {d18[]}, [r3 :32]
+    0xe28e3010,                                 //  add           r3, lr, #16
+    0xf2422c34,                                 //  vfma.f32      d18, d2, d20
+    0xf4e33c9f,                                 //  vld1.32       {d19[]}, [r3 :32]
+    0xe28e300c,                                 //  add           r3, lr, #12
+    0xf4e34c9f,                                 //  vld1.32       {d20[]}, [r3 :32]
+    0xe28e3014,                                 //  add           r3, lr, #20
+    0xf2411c34,                                 //  vfma.f32      d17, d1, d20
+    0xf4e34c9f,                                 //  vld1.32       {d20[]}, [r3 :32]
+    0xf2410c34,                                 //  vfma.f32      d16, d1, d20
+    0xe28e3004,                                 //  add           r3, lr, #4
+    0xf2412c33,                                 //  vfma.f32      d18, d1, d19
+    0xf4ee3c9f,                                 //  vld1.32       {d19[]}, [lr :32]
+    0xf4e34c9f,                                 //  vld1.32       {d20[]}, [r3 :32]
+    0xe28e3008,                                 //  add           r3, lr, #8
+    0xf2401c33,                                 //  vfma.f32      d17, d0, d19
+    0xf4e33c9f,                                 //  vld1.32       {d19[]}, [r3 :32]
+    0xf2400c33,                                 //  vfma.f32      d16, d0, d19
+    0xf2402c34,                                 //  vfma.f32      d18, d0, d20
+    0xf22101b1,                                 //  vorr          d0, d17, d17
+    0xf22021b0,                                 //  vorr          d2, d16, d16
+    0xf22211b2,                                 //  vorr          d1, d18, d18
+    0xe8bd4800,                                 //  pop           {fp, lr}
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned int armv7_sk_linear_gradient_2stops[] = {
+    0xe5913000,                                 //  ldr           r3, [r1]
+    0xe591c004,                                 //  ldr           ip, [r1, #4]
+    0xe2811008,                                 //  add           r1, r1, #8
+    0xf4632a0d,                                 //  vld1.8        {d18-d19}, [r3]!
+    0xf4634a0f,                                 //  vld1.8        {d20-d21}, [r3]
+    0xf3f40c22,                                 //  vdup.32       d16, d18[0]
+    0xf3f41c24,                                 //  vdup.32       d17, d20[0]
+    0xf2400c31,                                 //  vfma.f32      d16, d0, d17
+    0xf3fc6c24,                                 //  vdup.32       d22, d20[1]
+    0xf3bc1c22,                                 //  vdup.32       d1, d18[1]
+    0xf3b42c23,                                 //  vdup.32       d2, d19[0]
+    0xf2001c36,                                 //  vfma.f32      d1, d0, d22
+    0xf3f41c25,                                 //  vdup.32       d17, d21[0]
+    0xf3fc4c25,                                 //  vdup.32       d20, d21[1]
+    0xf2002c31,                                 //  vfma.f32      d2, d0, d17
+    0xf3bc3c23,                                 //  vdup.32       d3, d19[1]
+    0xf2003c34,                                 //  vfma.f32      d3, d0, d20
+    0xf22001b0,                                 //  vorr          d0, d16, d16
+    0xe12fff1c,                                 //  bx            ip
+};
+static const unsigned char hsw_sk_just_return[] = {
+    0xc5,0xf8,0x77,                             //  vzeroupper
+    0xc5,0xf8,0x77,                             //  vzeroupper
+    0xc3,                                       //  retq
+};
+static const unsigned char hsw_sk_seed_shader[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xf9,0x6e,0xc7,                        //  vmovd         %edi,%xmm0
+    0xc4,0xe2,0x7d,0x18,0xc0,                   //  vbroadcastss  %xmm0,%ymm0
+    0xc5,0xfc,0x5b,0xc0,                        //  vcvtdq2ps     %ymm0,%ymm0
+    0xc4,0xe2,0x7d,0x18,0x4a,0x04,              //  vbroadcastss  0x4(%rdx),%ymm1
+    0xc5,0xfc,0x58,0xc1,                        //  vaddps        %ymm1,%ymm0,%ymm0
+    0xc5,0xfc,0x58,0x42,0x14,                   //  vaddps        0x14(%rdx),%ymm0,%ymm0
+    0xc4,0xe2,0x7d,0x18,0x10,                   //  vbroadcastss  (%rax),%ymm2
+    0xc5,0xfc,0x5b,0xd2,                        //  vcvtdq2ps     %ymm2,%ymm2
+    0xc5,0xec,0x58,0xc9,                        //  vaddps        %ymm1,%ymm2,%ymm1
+    0xc4,0xe2,0x7d,0x18,0x12,                   //  vbroadcastss  (%rdx),%ymm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xe4,0x57,0xdb,                        //  vxorps        %ymm3,%ymm3,%ymm3
+    0xc5,0xdc,0x57,0xe4,                        //  vxorps        %ymm4,%ymm4,%ymm4
+    0xc5,0xd4,0x57,0xed,                        //  vxorps        %ymm5,%ymm5,%ymm5
+    0xc5,0xcc,0x57,0xf6,                        //  vxorps        %ymm6,%ymm6,%ymm6
+    0xc5,0xc4,0x57,0xff,                        //  vxorps        %ymm7,%ymm7,%ymm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_constant_color[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0xe2,0x7d,0x18,0x00,                   //  vbroadcastss  (%rax),%ymm0
+    0xc4,0xe2,0x7d,0x18,0x48,0x04,              //  vbroadcastss  0x4(%rax),%ymm1
+    0xc4,0xe2,0x7d,0x18,0x50,0x08,              //  vbroadcastss  0x8(%rax),%ymm2
+    0xc4,0xe2,0x7d,0x18,0x58,0x0c,              //  vbroadcastss  0xc(%rax),%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_clear[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x57,0xc0,                        //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0xf4,0x57,0xc9,                        //  vxorps        %ymm1,%ymm1,%ymm1
+    0xc5,0xec,0x57,0xd2,                        //  vxorps        %ymm2,%ymm2,%ymm2
+    0xc5,0xe4,0x57,0xdb,                        //  vxorps        %ymm3,%ymm3,%ymm3
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_plus_[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x58,0xc4,                        //  vaddps        %ymm4,%ymm0,%ymm0
+    0xc5,0xf4,0x58,0xcd,                        //  vaddps        %ymm5,%ymm1,%ymm1
+    0xc5,0xec,0x58,0xd6,                        //  vaddps        %ymm6,%ymm2,%ymm2
+    0xc5,0xe4,0x58,0xdf,                        //  vaddps        %ymm7,%ymm3,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_srcover[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x18,0x02,                   //  vbroadcastss  (%rdx),%ymm8
+    0xc5,0x3c,0x5c,0xc3,                        //  vsubps        %ymm3,%ymm8,%ymm8
+    0xc4,0xc2,0x5d,0xb8,0xc0,                   //  vfmadd231ps   %ymm8,%ymm4,%ymm0
+    0xc4,0xc2,0x55,0xb8,0xc8,                   //  vfmadd231ps   %ymm8,%ymm5,%ymm1
+    0xc4,0xc2,0x4d,0xb8,0xd0,                   //  vfmadd231ps   %ymm8,%ymm6,%ymm2
+    0xc4,0xc2,0x45,0xb8,0xd8,                   //  vfmadd231ps   %ymm8,%ymm7,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_dstover[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x18,0x02,                   //  vbroadcastss  (%rdx),%ymm8
+    0xc5,0x3c,0x5c,0xc7,                        //  vsubps        %ymm7,%ymm8,%ymm8
+    0xc4,0xe2,0x3d,0xa8,0xc4,                   //  vfmadd213ps   %ymm4,%ymm8,%ymm0
+    0xc4,0xe2,0x3d,0xa8,0xcd,                   //  vfmadd213ps   %ymm5,%ymm8,%ymm1
+    0xc4,0xe2,0x3d,0xa8,0xd6,                   //  vfmadd213ps   %ymm6,%ymm8,%ymm2
+    0xc4,0xe2,0x3d,0xa8,0xdf,                   //  vfmadd213ps   %ymm7,%ymm8,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_clamp_0[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x41,0x3c,0x57,0xc0,                   //  vxorps        %ymm8,%ymm8,%ymm8
+    0xc4,0xc1,0x7c,0x5f,0xc0,                   //  vmaxps        %ymm8,%ymm0,%ymm0
+    0xc4,0xc1,0x74,0x5f,0xc8,                   //  vmaxps        %ymm8,%ymm1,%ymm1
+    0xc4,0xc1,0x6c,0x5f,0xd0,                   //  vmaxps        %ymm8,%ymm2,%ymm2
+    0xc4,0xc1,0x64,0x5f,0xd8,                   //  vmaxps        %ymm8,%ymm3,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_clamp_1[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x18,0x02,                   //  vbroadcastss  (%rdx),%ymm8
+    0xc4,0xc1,0x7c,0x5d,0xc0,                   //  vminps        %ymm8,%ymm0,%ymm0
+    0xc4,0xc1,0x74,0x5d,0xc8,                   //  vminps        %ymm8,%ymm1,%ymm1
+    0xc4,0xc1,0x6c,0x5d,0xd0,                   //  vminps        %ymm8,%ymm2,%ymm2
+    0xc4,0xc1,0x64,0x5d,0xd8,                   //  vminps        %ymm8,%ymm3,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_clamp_a[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x18,0x02,                   //  vbroadcastss  (%rdx),%ymm8
+    0xc4,0xc1,0x64,0x5d,0xd8,                   //  vminps        %ymm8,%ymm3,%ymm3
+    0xc5,0xfc,0x5d,0xc3,                        //  vminps        %ymm3,%ymm0,%ymm0
+    0xc5,0xf4,0x5d,0xcb,                        //  vminps        %ymm3,%ymm1,%ymm1
+    0xc5,0xec,0x5d,0xd3,                        //  vminps        %ymm3,%ymm2,%ymm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_swap[] = {
+    0xc5,0x7c,0x28,0xc3,                        //  vmovaps       %ymm3,%ymm8
+    0xc5,0x7c,0x28,0xca,                        //  vmovaps       %ymm2,%ymm9
+    0xc5,0x7c,0x28,0xd1,                        //  vmovaps       %ymm1,%ymm10
+    0xc5,0x7c,0x28,0xd8,                        //  vmovaps       %ymm0,%ymm11
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x28,0xc4,                        //  vmovaps       %ymm4,%ymm0
+    0xc5,0xfc,0x28,0xcd,                        //  vmovaps       %ymm5,%ymm1
+    0xc5,0xfc,0x28,0xd6,                        //  vmovaps       %ymm6,%ymm2
+    0xc5,0xfc,0x28,0xdf,                        //  vmovaps       %ymm7,%ymm3
+    0xc5,0x7c,0x29,0xdc,                        //  vmovaps       %ymm11,%ymm4
+    0xc5,0x7c,0x29,0xd5,                        //  vmovaps       %ymm10,%ymm5
+    0xc5,0x7c,0x29,0xce,                        //  vmovaps       %ymm9,%ymm6
+    0xc5,0x7c,0x29,0xc7,                        //  vmovaps       %ymm8,%ymm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_move_src_dst[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x28,0xe0,                        //  vmovaps       %ymm0,%ymm4
+    0xc5,0xfc,0x28,0xe9,                        //  vmovaps       %ymm1,%ymm5
+    0xc5,0xfc,0x28,0xf2,                        //  vmovaps       %ymm2,%ymm6
+    0xc5,0xfc,0x28,0xfb,                        //  vmovaps       %ymm3,%ymm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_move_dst_src[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x28,0xc4,                        //  vmovaps       %ymm4,%ymm0
+    0xc5,0xfc,0x28,0xcd,                        //  vmovaps       %ymm5,%ymm1
+    0xc5,0xfc,0x28,0xd6,                        //  vmovaps       %ymm6,%ymm2
+    0xc5,0xfc,0x28,0xdf,                        //  vmovaps       %ymm7,%ymm3
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_premul[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0xfc,0x59,0xc3,                        //  vmulps        %ymm3,%ymm0,%ymm0
+    0xc5,0xf4,0x59,0xcb,                        //  vmulps        %ymm3,%ymm1,%ymm1
+    0xc5,0xec,0x59,0xd3,                        //  vmulps        %ymm3,%ymm2,%ymm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_unpremul[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x41,0x3c,0x57,0xc0,                   //  vxorps        %ymm8,%ymm8,%ymm8
+    0xc4,0x41,0x64,0xc2,0xc8,0x00,              //  vcmpeqps      %ymm8,%ymm3,%ymm9
+    0xc4,0x62,0x7d,0x18,0x12,                   //  vbroadcastss  (%rdx),%ymm10
+    0xc5,0x2c,0x5e,0xd3,                        //  vdivps        %ymm3,%ymm10,%ymm10
+    0xc4,0x43,0x2d,0x4a,0xc0,0x90,              //  vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
+    0xc5,0xbc,0x59,0xc0,                        //  vmulps        %ymm0,%ymm8,%ymm0
+    0xc5,0xbc,0x59,0xc9,                        //  vmulps        %ymm1,%ymm8,%ymm1
+    0xc5,0xbc,0x59,0xd2,                        //  vmulps        %ymm2,%ymm8,%ymm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_from_srgb[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x18,0x42,0x40,              //  vbroadcastss  0x40(%rdx),%ymm8
+    0xc5,0x3c,0x59,0xc8,                        //  vmulps        %ymm0,%ymm8,%ymm9
+    0xc5,0x7c,0x59,0xd0,                        //  vmulps        %ymm0,%ymm0,%ymm10
+    0xc4,0x62,0x7d,0x18,0x5a,0x3c,              //  vbroadcastss  0x3c(%rdx),%ymm11
+    0xc4,0x62,0x7d,0x18,0x62,0x38,              //  vbroadcastss  0x38(%rdx),%ymm12
+    0xc4,0x41,0x7c,0x28,0xeb,                   //  vmovaps       %ymm11,%ymm13
+    0xc4,0x42,0x7d,0xa8,0xec,                   //  vfmadd213ps   %ymm12,%ymm0,%ymm13
+    0xc4,0x62,0x7d,0x18,0x72,0x34,              //  vbroadcastss  0x34(%rdx),%ymm14
+    0xc4,0x42,0x2d,0xa8,0xee,                   //  vfmadd213ps   %ymm14,%ymm10,%ymm13
+    0xc4,0x62,0x7d,0x18,0x52,0x44,              //  vbroadcastss  0x44(%rdx),%ymm10
+    0xc4,0xc1,0x7c,0xc2,0xc2,0x01,              //  vcmpltps      %ymm10,%ymm0,%ymm0
+    0xc4,0xc3,0x15,0x4a,0xc1,0x00,              //  vblendvps     %ymm0,%ymm9,%ymm13,%ymm0
+    0xc5,0x3c,0x59,0xc9,                        //  vmulps        %ymm1,%ymm8,%ymm9
+    0xc5,0x74,0x59,0xe9,                        //  vmulps        %ymm1,%ymm1,%ymm13
+    0xc4,0x41,0x7c,0x28,0xfb,                   //  vmovaps       %ymm11,%ymm15
+    0xc4,0x42,0x75,0xa8,0xfc,                   //  vfmadd213ps   %ymm12,%ymm1,%ymm15
+    0xc4,0x42,0x15,0xa8,0xfe,                   //  vfmadd213ps   %ymm14,%ymm13,%ymm15
+    0xc4,0xc1,0x74,0xc2,0xca,0x01,              //  vcmpltps      %ymm10,%ymm1,%ymm1
+    0xc4,0xc3,0x05,0x4a,0xc9,0x10,              //  vblendvps     %ymm1,%ymm9,%ymm15,%ymm1
+    0xc5,0x3c,0x59,0xc2,                        //  vmulps        %ymm2,%ymm8,%ymm8
+    0xc5,0x6c,0x59,0xca,                        //  vmulps        %ymm2,%ymm2,%ymm9
+    0xc4,0x42,0x6d,0xa8,0xdc,                   //  vfmadd213ps   %ymm12,%ymm2,%ymm11
+    0xc4,0x42,0x35,0xa8,0xde,                   //  vfmadd213ps   %ymm14,%ymm9,%ymm11
+    0xc4,0xc1,0x6c,0xc2,0xd2,0x01,              //  vcmpltps      %ymm10,%ymm2,%ymm2
+    0xc4,0xc3,0x25,0x4a,0xd0,0x20,              //  vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_to_srgb[] = {
+    0xc5,0x7c,0x52,0xc0,                        //  vrsqrtps      %ymm0,%ymm8
+    0xc4,0x41,0x7c,0x53,0xc8,                   //  vrcpps        %ymm8,%ymm9
+    0xc4,0x41,0x7c,0x52,0xd0,                   //  vrsqrtps      %ymm8,%ymm10
+    0xc4,0x62,0x7d,0x18,0x42,0x48,              //  vbroadcastss  0x48(%rdx),%ymm8
+    0xc5,0x3c,0x59,0xd8,                        //  vmulps        %ymm0,%ymm8,%ymm11
+    0xc4,0x62,0x7d,0x18,0x22,                   //  vbroadcastss  (%rdx),%ymm12
+    0xc4,0x62,0x7d,0x18,0x6a,0x4c,              //  vbroadcastss  0x4c(%rdx),%ymm13
+    0xc4,0x62,0x7d,0x18,0x72,0x50,              //  vbroadcastss  0x50(%rdx),%ymm14
+    0xc4,0x62,0x7d,0x18,0x7a,0x54,              //  vbroadcastss  0x54(%rdx),%ymm15
+    0xc4,0x42,0x0d,0xa8,0xcf,                   //  vfmadd213ps   %ymm15,%ymm14,%ymm9
+    0xc4,0x42,0x15,0xb8,0xca,                   //  vfmadd231ps   %ymm10,%ymm13,%ymm9
+    0xc4,0x41,0x1c,0x5d,0xc9,                   //  vminps        %ymm9,%ymm12,%ymm9
+    0xc4,0x62,0x7d,0x18,0x52,0x58,              //  vbroadcastss  0x58(%rdx),%ymm10
+    0xc4,0xc1,0x7c,0xc2,0xc2,0x01,              //  vcmpltps      %ymm10,%ymm0,%ymm0
+    0xc4,0xc3,0x35,0x4a,0xc3,0x00,              //  vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
+    0xc5,0x7c,0x52,0xc9,                        //  vrsqrtps      %ymm1,%ymm9
+    0xc4,0x41,0x7c,0x53,0xd9,                   //  vrcpps        %ymm9,%ymm11
+    0xc4,0x41,0x7c,0x52,0xc9,                   //  vrsqrtps      %ymm9,%ymm9
+    0xc4,0x42,0x0d,0xa8,0xdf,                   //  vfmadd213ps   %ymm15,%ymm14,%ymm11
+    0xc4,0x42,0x15,0xb8,0xd9,                   //  vfmadd231ps   %ymm9,%ymm13,%ymm11
+    0xc5,0x3c,0x59,0xc9,                        //  vmulps        %ymm1,%ymm8,%ymm9
+    0xc4,0x41,0x1c,0x5d,0xdb,                   //  vminps        %ymm11,%ymm12,%ymm11
+    0xc4,0xc1,0x74,0xc2,0xca,0x01,              //  vcmpltps      %ymm10,%ymm1,%ymm1
+    0xc4,0xc3,0x25,0x4a,0xc9,0x10,              //  vblendvps     %ymm1,%ymm9,%ymm11,%ymm1
+    0xc5,0x7c,0x52,0xca,                        //  vrsqrtps      %ymm2,%ymm9
+    0xc4,0x41,0x7c,0x53,0xd9,                   //  vrcpps        %ymm9,%ymm11
+    0xc4,0x42,0x0d,0xa8,0xdf,                   //  vfmadd213ps   %ymm15,%ymm14,%ymm11
+    0xc4,0x41,0x7c,0x52,0xc9,                   //  vrsqrtps      %ymm9,%ymm9
+    0xc4,0x42,0x15,0xb8,0xd9,                   //  vfmadd231ps   %ymm9,%ymm13,%ymm11
+    0xc4,0x41,0x1c,0x5d,0xcb,                   //  vminps        %ymm11,%ymm12,%ymm9
+    0xc5,0x3c,0x59,0xc2,                        //  vmulps        %ymm2,%ymm8,%ymm8
+    0xc4,0xc1,0x6c,0xc2,0xd2,0x01,              //  vcmpltps      %ymm10,%ymm2,%ymm2
+    0xc4,0xc3,0x35,0x4a,0xd0,0x20,              //  vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_scale_u8[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xc4,0x62,0x7d,0x31,0x04,0x38,              //  vpmovzxbd     (%rax,%rdi,1),%ymm8
+    0xc4,0x41,0x7c,0x5b,0xc0,                   //  vcvtdq2ps     %ymm8,%ymm8
+    0xc4,0x62,0x7d,0x18,0x4a,0x0c,              //  vbroadcastss  0xc(%rdx),%ymm9
+    0xc4,0x41,0x3c,0x59,0xc1,                   //  vmulps        %ymm9,%ymm8,%ymm8
+    0xc5,0xbc,0x59,0xc0,                        //  vmulps        %ymm0,%ymm8,%ymm0
+    0xc5,0xbc,0x59,0xc9,                        //  vmulps        %ymm1,%ymm8,%ymm1
+    0xc5,0xbc,0x59,0xd2,                        //  vmulps        %ymm2,%ymm8,%ymm2
+    0xc5,0xbc,0x59,0xdb,                        //  vmulps        %ymm3,%ymm8,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_load_tables[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x08,                             //  mov           (%rax),%rcx
+    0x4c,0x8b,0x40,0x08,                        //  mov           0x8(%rax),%r8
+    0xc5,0xfc,0x10,0x1c,0xb9,                   //  vmovups       (%rcx,%rdi,4),%ymm3
+    0xc4,0xe2,0x7d,0x18,0x52,0x10,              //  vbroadcastss  0x10(%rdx),%ymm2
+    0xc5,0xec,0x54,0xcb,                        //  vandps        %ymm3,%ymm2,%ymm1
+    0xc5,0xfc,0x57,0xc0,                        //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0x7c,0xc2,0xc0,0x00,                   //  vcmpeqps      %ymm0,%ymm0,%ymm8
+    0xc4,0x41,0x7c,0x28,0xc8,                   //  vmovaps       %ymm8,%ymm9
+    0xc4,0xc2,0x35,0x92,0x04,0x88,              //  vgatherdps    %ymm9,(%r8,%ymm1,4),%ymm0
+    0x48,0x8b,0x48,0x10,                        //  mov           0x10(%rax),%rcx
+    0xc5,0xf5,0x72,0xd3,0x08,                   //  vpsrld        $0x8,%ymm3,%ymm1
+    0xc5,0x6c,0x54,0xc9,                        //  vandps        %ymm1,%ymm2,%ymm9
+    0xc4,0x41,0x7c,0x28,0xd0,                   //  vmovaps       %ymm8,%ymm10
+    0xc4,0xa2,0x2d,0x92,0x0c,0x89,              //  vgatherdps    %ymm10,(%rcx,%ymm9,4),%ymm1
+    0x48,0x8b,0x40,0x18,                        //  mov           0x18(%rax),%rax
+    0xc5,0xb5,0x72,0xd3,0x10,                   //  vpsrld        $0x10,%ymm3,%ymm9
+    0xc4,0x41,0x6c,0x54,0xc9,                   //  vandps        %ymm9,%ymm2,%ymm9
+    0xc4,0xa2,0x3d,0x92,0x14,0x88,              //  vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
+    0xc5,0xe5,0x72,0xd3,0x18,                   //  vpsrld        $0x18,%ymm3,%ymm3
+    0xc5,0xfc,0x5b,0xdb,                        //  vcvtdq2ps     %ymm3,%ymm3
+    0xc4,0x62,0x7d,0x18,0x42,0x0c,              //  vbroadcastss  0xc(%rdx),%ymm8
+    0xc4,0xc1,0x64,0x59,0xd8,                   //  vmulps        %ymm8,%ymm3,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_load_8888[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xc5,0xfc,0x10,0x1c,0xb8,                   //  vmovups       (%rax,%rdi,4),%ymm3
+    0xc4,0xe2,0x7d,0x18,0x52,0x10,              //  vbroadcastss  0x10(%rdx),%ymm2
+    0xc5,0xec,0x54,0xc3,                        //  vandps        %ymm3,%ymm2,%ymm0
+    0xc5,0xfc,0x5b,0xc0,                        //  vcvtdq2ps     %ymm0,%ymm0
+    0xc4,0x62,0x7d,0x18,0x42,0x0c,              //  vbroadcastss  0xc(%rdx),%ymm8
+    0xc5,0xbc,0x59,0xc0,                        //  vmulps        %ymm0,%ymm8,%ymm0
+    0xc5,0xf5,0x72,0xd3,0x08,                   //  vpsrld        $0x8,%ymm3,%ymm1
+    0xc5,0xec,0x54,0xc9,                        //  vandps        %ymm1,%ymm2,%ymm1
+    0xc5,0xfc,0x5b,0xc9,                        //  vcvtdq2ps     %ymm1,%ymm1
+    0xc5,0xbc,0x59,0xc9,                        //  vmulps        %ymm1,%ymm8,%ymm1
+    0xc5,0xb5,0x72,0xd3,0x10,                   //  vpsrld        $0x10,%ymm3,%ymm9
+    0xc4,0xc1,0x6c,0x54,0xd1,                   //  vandps        %ymm9,%ymm2,%ymm2
+    0xc5,0xfc,0x5b,0xd2,                        //  vcvtdq2ps     %ymm2,%ymm2
+    0xc5,0xbc,0x59,0xd2,                        //  vmulps        %ymm2,%ymm8,%ymm2
+    0xc5,0xe5,0x72,0xd3,0x18,                   //  vpsrld        $0x18,%ymm3,%ymm3
+    0xc5,0xfc,0x5b,0xdb,                        //  vcvtdq2ps     %ymm3,%ymm3
+    0xc4,0xc1,0x64,0x59,0xd8,                   //  vmulps        %ymm8,%ymm3,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_store_8888[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xc4,0x62,0x7d,0x18,0x42,0x08,              //  vbroadcastss  0x8(%rdx),%ymm8
+    0xc5,0x3c,0x59,0xc8,                        //  vmulps        %ymm0,%ymm8,%ymm9
+    0xc4,0x41,0x7d,0x5b,0xc9,                   //  vcvtps2dq     %ymm9,%ymm9
+    0xc5,0x3c,0x59,0xd1,                        //  vmulps        %ymm1,%ymm8,%ymm10
+    0xc4,0x41,0x7d,0x5b,0xd2,                   //  vcvtps2dq     %ymm10,%ymm10
+    0xc4,0xc1,0x2d,0x72,0xf2,0x08,              //  vpslld        $0x8,%ymm10,%ymm10
+    0xc4,0x41,0x2d,0xeb,0xc9,                   //  vpor          %ymm9,%ymm10,%ymm9
+    0xc5,0x3c,0x59,0xd2,                        //  vmulps        %ymm2,%ymm8,%ymm10
+    0xc4,0x41,0x7d,0x5b,0xd2,                   //  vcvtps2dq     %ymm10,%ymm10
+    0xc4,0xc1,0x2d,0x72,0xf2,0x10,              //  vpslld        $0x10,%ymm10,%ymm10
+    0xc5,0x3c,0x59,0xc3,                        //  vmulps        %ymm3,%ymm8,%ymm8
+    0xc4,0x41,0x7d,0x5b,0xc0,                   //  vcvtps2dq     %ymm8,%ymm8
+    0xc4,0xc1,0x3d,0x72,0xf0,0x18,              //  vpslld        $0x18,%ymm8,%ymm8
+    0xc4,0x41,0x2d,0xeb,0xc0,                   //  vpor          %ymm8,%ymm10,%ymm8
+    0xc4,0x41,0x35,0xeb,0xc0,                   //  vpor          %ymm8,%ymm9,%ymm8
+    0xc5,0x7e,0x7f,0x04,0xb8,                   //  vmovdqu       %ymm8,(%rax,%rdi,4)
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_load_f16[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xc5,0xfa,0x6f,0x04,0xf8,                   //  vmovdqu       (%rax,%rdi,8),%xmm0
+    0xc5,0xfa,0x6f,0x4c,0xf8,0x10,              //  vmovdqu       0x10(%rax,%rdi,8),%xmm1
+    0xc5,0xfa,0x6f,0x54,0xf8,0x20,              //  vmovdqu       0x20(%rax,%rdi,8),%xmm2
+    0xc5,0xfa,0x6f,0x5c,0xf8,0x30,              //  vmovdqu       0x30(%rax,%rdi,8),%xmm3
+    0xc5,0x79,0x61,0xc1,                        //  vpunpcklwd    %xmm1,%xmm0,%xmm8
+    0xc5,0xf9,0x69,0xc1,                        //  vpunpckhwd    %xmm1,%xmm0,%xmm0
+    0xc5,0xe9,0x61,0xcb,                        //  vpunpcklwd    %xmm3,%xmm2,%xmm1
+    0xc5,0xe9,0x69,0xd3,                        //  vpunpckhwd    %xmm3,%xmm2,%xmm2
+    0xc5,0x39,0x61,0xc8,                        //  vpunpcklwd    %xmm0,%xmm8,%xmm9
+    0xc5,0x39,0x69,0xc0,                        //  vpunpckhwd    %xmm0,%xmm8,%xmm8
+    0xc5,0xf1,0x61,0xda,                        //  vpunpcklwd    %xmm2,%xmm1,%xmm3
+    0xc5,0x71,0x69,0xd2,                        //  vpunpckhwd    %xmm2,%xmm1,%xmm10
+    0xc5,0xb1,0x6c,0xc3,                        //  vpunpcklqdq   %xmm3,%xmm9,%xmm0
+    0xc4,0xe2,0x7d,0x13,0xc0,                   //  vcvtph2ps     %xmm0,%ymm0
+    0xc5,0xb1,0x6d,0xcb,                        //  vpunpckhqdq   %xmm3,%xmm9,%xmm1
+    0xc4,0xe2,0x7d,0x13,0xc9,                   //  vcvtph2ps     %xmm1,%ymm1
+    0xc4,0xc1,0x39,0x6c,0xd2,                   //  vpunpcklqdq   %xmm10,%xmm8,%xmm2
+    0xc4,0xe2,0x7d,0x13,0xd2,                   //  vcvtph2ps     %xmm2,%ymm2
+    0xc4,0xc1,0x39,0x6d,0xda,                   //  vpunpckhqdq   %xmm10,%xmm8,%xmm3
+    0xc4,0xe2,0x7d,0x13,0xdb,                   //  vcvtph2ps     %xmm3,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_store_f16[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xc4,0xc3,0x7d,0x1d,0xc0,0x04,              //  vcvtps2ph     $0x4,%ymm0,%xmm8
+    0xc4,0xc3,0x7d,0x1d,0xc9,0x04,              //  vcvtps2ph     $0x4,%ymm1,%xmm9
+    0xc4,0xc3,0x7d,0x1d,0xd2,0x04,              //  vcvtps2ph     $0x4,%ymm2,%xmm10
+    0xc4,0xc3,0x7d,0x1d,0xdb,0x04,              //  vcvtps2ph     $0x4,%ymm3,%xmm11
+    0xc4,0x41,0x39,0x61,0xe1,                   //  vpunpcklwd    %xmm9,%xmm8,%xmm12
+    0xc4,0x41,0x39,0x69,0xc1,                   //  vpunpckhwd    %xmm9,%xmm8,%xmm8
+    0xc4,0x41,0x29,0x61,0xcb,                   //  vpunpcklwd    %xmm11,%xmm10,%xmm9
+    0xc4,0x41,0x29,0x69,0xd3,                   //  vpunpckhwd    %xmm11,%xmm10,%xmm10
+    0xc4,0x41,0x19,0x62,0xd9,                   //  vpunpckldq    %xmm9,%xmm12,%xmm11
+    0xc5,0x7a,0x7f,0x1c,0xf8,                   //  vmovdqu       %xmm11,(%rax,%rdi,8)
+    0xc4,0x41,0x19,0x6a,0xc9,                   //  vpunpckhdq    %xmm9,%xmm12,%xmm9
+    0xc5,0x7a,0x7f,0x4c,0xf8,0x10,              //  vmovdqu       %xmm9,0x10(%rax,%rdi,8)
+    0xc4,0x41,0x39,0x62,0xca,                   //  vpunpckldq    %xmm10,%xmm8,%xmm9
+    0xc5,0x7a,0x7f,0x4c,0xf8,0x20,              //  vmovdqu       %xmm9,0x20(%rax,%rdi,8)
+    0xc4,0x41,0x39,0x6a,0xc2,                   //  vpunpckhdq    %xmm10,%xmm8,%xmm8
+    0xc5,0x7a,0x7f,0x44,0xf8,0x30,              //  vmovdqu       %xmm8,0x30(%rax,%rdi,8)
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_clamp_x[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x58,0x00,                   //  vpbroadcastd  (%rax),%ymm8
+    0xc4,0x41,0x35,0x76,0xc9,                   //  vpcmpeqd      %ymm9,%ymm9,%ymm9
+    0xc4,0x41,0x3d,0xfe,0xc1,                   //  vpaddd        %ymm9,%ymm8,%ymm8
+    0xc4,0xc1,0x7c,0x5d,0xc0,                   //  vminps        %ymm8,%ymm0,%ymm0
+    0xc4,0x41,0x3c,0x57,0xc0,                   //  vxorps        %ymm8,%ymm8,%ymm8
+    0xc5,0xbc,0x5f,0xc0,                        //  vmaxps        %ymm0,%ymm8,%ymm0
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_clamp_y[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x58,0x00,                   //  vpbroadcastd  (%rax),%ymm8
+    0xc4,0x41,0x35,0x76,0xc9,                   //  vpcmpeqd      %ymm9,%ymm9,%ymm9
+    0xc4,0x41,0x3d,0xfe,0xc1,                   //  vpaddd        %ymm9,%ymm8,%ymm8
+    0xc4,0xc1,0x74,0x5d,0xc8,                   //  vminps        %ymm8,%ymm1,%ymm1
+    0xc4,0x41,0x3c,0x57,0xc0,                   //  vxorps        %ymm8,%ymm8,%ymm8
+    0xc5,0xbc,0x5f,0xc9,                        //  vmaxps        %ymm1,%ymm8,%ymm1
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_matrix_2x3[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x18,0x08,                   //  vbroadcastss  (%rax),%ymm9
+    0xc4,0x62,0x7d,0x18,0x50,0x08,              //  vbroadcastss  0x8(%rax),%ymm10
+    0xc4,0x62,0x7d,0x18,0x40,0x10,              //  vbroadcastss  0x10(%rax),%ymm8
+    0xc4,0x42,0x75,0xb8,0xc2,                   //  vfmadd231ps   %ymm10,%ymm1,%ymm8
+    0xc4,0x42,0x7d,0xb8,0xc1,                   //  vfmadd231ps   %ymm9,%ymm0,%ymm8
+    0xc4,0x62,0x7d,0x18,0x50,0x04,              //  vbroadcastss  0x4(%rax),%ymm10
+    0xc4,0x62,0x7d,0x18,0x58,0x0c,              //  vbroadcastss  0xc(%rax),%ymm11
+    0xc4,0x62,0x7d,0x18,0x48,0x14,              //  vbroadcastss  0x14(%rax),%ymm9
+    0xc4,0x42,0x75,0xb8,0xcb,                   //  vfmadd231ps   %ymm11,%ymm1,%ymm9
+    0xc4,0x42,0x7d,0xb8,0xca,                   //  vfmadd231ps   %ymm10,%ymm0,%ymm9
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0x7c,0x29,0xc0,                        //  vmovaps       %ymm8,%ymm0
+    0xc5,0x7c,0x29,0xc9,                        //  vmovaps       %ymm9,%ymm1
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_matrix_3x4[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0x62,0x7d,0x18,0x08,                   //  vbroadcastss  (%rax),%ymm9
+    0xc4,0x62,0x7d,0x18,0x50,0x0c,              //  vbroadcastss  0xc(%rax),%ymm10
+    0xc4,0x62,0x7d,0x18,0x58,0x18,              //  vbroadcastss  0x18(%rax),%ymm11
+    0xc4,0x62,0x7d,0x18,0x40,0x24,              //  vbroadcastss  0x24(%rax),%ymm8
+    0xc4,0x42,0x6d,0xb8,0xc3,                   //  vfmadd231ps   %ymm11,%ymm2,%ymm8
+    0xc4,0x42,0x75,0xb8,0xc2,                   //  vfmadd231ps   %ymm10,%ymm1,%ymm8
+    0xc4,0x42,0x7d,0xb8,0xc1,                   //  vfmadd231ps   %ymm9,%ymm0,%ymm8
+    0xc4,0x62,0x7d,0x18,0x50,0x04,              //  vbroadcastss  0x4(%rax),%ymm10
+    0xc4,0x62,0x7d,0x18,0x58,0x10,              //  vbroadcastss  0x10(%rax),%ymm11
+    0xc4,0x62,0x7d,0x18,0x60,0x1c,              //  vbroadcastss  0x1c(%rax),%ymm12
+    0xc4,0x62,0x7d,0x18,0x48,0x28,              //  vbroadcastss  0x28(%rax),%ymm9
+    0xc4,0x42,0x6d,0xb8,0xcc,                   //  vfmadd231ps   %ymm12,%ymm2,%ymm9
+    0xc4,0x42,0x75,0xb8,0xcb,                   //  vfmadd231ps   %ymm11,%ymm1,%ymm9
+    0xc4,0x42,0x7d,0xb8,0xca,                   //  vfmadd231ps   %ymm10,%ymm0,%ymm9
+    0xc4,0x62,0x7d,0x18,0x58,0x08,              //  vbroadcastss  0x8(%rax),%ymm11
+    0xc4,0x62,0x7d,0x18,0x60,0x14,              //  vbroadcastss  0x14(%rax),%ymm12
+    0xc4,0x62,0x7d,0x18,0x68,0x20,              //  vbroadcastss  0x20(%rax),%ymm13
+    0xc4,0x62,0x7d,0x18,0x50,0x2c,              //  vbroadcastss  0x2c(%rax),%ymm10
+    0xc4,0x42,0x6d,0xb8,0xd5,                   //  vfmadd231ps   %ymm13,%ymm2,%ymm10
+    0xc4,0x42,0x75,0xb8,0xd4,                   //  vfmadd231ps   %ymm12,%ymm1,%ymm10
+    0xc4,0x42,0x7d,0xb8,0xd3,                   //  vfmadd231ps   %ymm11,%ymm0,%ymm10
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0x7c,0x29,0xc0,                        //  vmovaps       %ymm8,%ymm0
+    0xc5,0x7c,0x29,0xc9,                        //  vmovaps       %ymm9,%ymm1
+    0xc5,0x7c,0x29,0xd2,                        //  vmovaps       %ymm10,%ymm2
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char hsw_sk_linear_gradient_2stops[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc4,0xe2,0x7d,0x18,0x48,0x10,              //  vbroadcastss  0x10(%rax),%ymm1
+    0xc4,0x62,0x7d,0x18,0x00,                   //  vbroadcastss  (%rax),%ymm8
+    0xc4,0x62,0x7d,0xb8,0xc1,                   //  vfmadd231ps   %ymm1,%ymm0,%ymm8
+    0xc4,0xe2,0x7d,0x18,0x50,0x14,              //  vbroadcastss  0x14(%rax),%ymm2
+    0xc4,0xe2,0x7d,0x18,0x48,0x04,              //  vbroadcastss  0x4(%rax),%ymm1
+    0xc4,0xe2,0x7d,0xb8,0xca,                   //  vfmadd231ps   %ymm2,%ymm0,%ymm1
+    0xc4,0xe2,0x7d,0x18,0x58,0x18,              //  vbroadcastss  0x18(%rax),%ymm3
+    0xc4,0xe2,0x7d,0x18,0x50,0x08,              //  vbroadcastss  0x8(%rax),%ymm2
+    0xc4,0xe2,0x7d,0xb8,0xd3,                   //  vfmadd231ps   %ymm3,%ymm0,%ymm2
+    0xc4,0x62,0x7d,0x18,0x48,0x1c,              //  vbroadcastss  0x1c(%rax),%ymm9
+    0xc4,0xe2,0x7d,0x18,0x58,0x0c,              //  vbroadcastss  0xc(%rax),%ymm3
+    0xc4,0xc2,0x7d,0xb8,0xd9,                   //  vfmadd231ps   %ymm9,%ymm0,%ymm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xc5,0x7c,0x29,0xc0,                        //  vmovaps       %ymm8,%ymm0
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_just_return[] = {
+    0xc3,                                       //  retq
+};
+static const unsigned char sse41_sk_seed_shader[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x66,0x0f,0x6e,0xc7,                        //  movd          %edi,%xmm0
+    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
+    0x0f,0x5b,0xc8,                             //  cvtdq2ps      %xmm0,%xmm1
+    0xf3,0x0f,0x10,0x12,                        //  movss         (%rdx),%xmm2
+    0xf3,0x0f,0x10,0x5a,0x04,                   //  movss         0x4(%rdx),%xmm3
+    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
+    0x0f,0x58,0xcb,                             //  addps         %xmm3,%xmm1
+    0x0f,0x10,0x42,0x14,                        //  movups        0x14(%rdx),%xmm0
+    0x0f,0x58,0xc1,                             //  addps         %xmm1,%xmm0
+    0x66,0x0f,0x6e,0x08,                        //  movd          (%rax),%xmm1
+    0x66,0x0f,0x70,0xc9,0x00,                   //  pshufd        $0x0,%xmm1,%xmm1
+    0x0f,0x5b,0xc9,                             //  cvtdq2ps      %xmm1,%xmm1
+    0x0f,0x58,0xcb,                             //  addps         %xmm3,%xmm1
+    0x0f,0xc6,0xd2,0x00,                        //  shufps        $0x0,%xmm2,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xdb,                             //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                             //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                             //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                             //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                             //  xorps         %xmm7,%xmm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_constant_color[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x10,0x18,                             //  movups        (%rax),%xmm3
+    0x0f,0x28,0xc3,                             //  movaps        %xmm3,%xmm0
+    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
+    0x0f,0x28,0xcb,                             //  movaps        %xmm3,%xmm1
+    0x0f,0xc6,0xc9,0x55,                        //  shufps        $0x55,%xmm1,%xmm1
+    0x0f,0x28,0xd3,                             //  movaps        %xmm3,%xmm2
+    0x0f,0xc6,0xd2,0xaa,                        //  shufps        $0xaa,%xmm2,%xmm2
+    0x0f,0xc6,0xdb,0xff,                        //  shufps        $0xff,%xmm3,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_clear[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                             //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                             //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                             //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                             //  xorps         %xmm3,%xmm3
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_plus_[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x58,0xc4,                             //  addps         %xmm4,%xmm0
+    0x0f,0x58,0xcd,                             //  addps         %xmm5,%xmm1
+    0x0f,0x58,0xd6,                             //  addps         %xmm6,%xmm2
+    0x0f,0x58,0xdf,                             //  addps         %xmm7,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_srcover[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x44,0x0f,0x5c,0xc3,                        //  subps         %xmm3,%xmm8
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xcc,                        //  mulps         %xmm4,%xmm9
+    0x41,0x0f,0x58,0xc1,                        //  addps         %xmm9,%xmm0
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xcd,                        //  mulps         %xmm5,%xmm9
+    0x41,0x0f,0x58,0xc9,                        //  addps         %xmm9,%xmm1
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xce,                        //  mulps         %xmm6,%xmm9
+    0x41,0x0f,0x58,0xd1,                        //  addps         %xmm9,%xmm2
+    0x44,0x0f,0x59,0xc7,                        //  mulps         %xmm7,%xmm8
+    0x41,0x0f,0x58,0xd8,                        //  addps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_dstover[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x44,0x0f,0x5c,0xc7,                        //  subps         %xmm7,%xmm8
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x0f,0x58,0xc4,                             //  addps         %xmm4,%xmm0
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x0f,0x58,0xcd,                             //  addps         %xmm5,%xmm1
+    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
+    0x0f,0x58,0xd6,                             //  addps         %xmm6,%xmm2
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x0f,0x58,0xdf,                             //  addps         %xmm7,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_clamp_0[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
+    0x41,0x0f,0x5f,0xc0,                        //  maxps         %xmm8,%xmm0
+    0x41,0x0f,0x5f,0xc8,                        //  maxps         %xmm8,%xmm1
+    0x41,0x0f,0x5f,0xd0,                        //  maxps         %xmm8,%xmm2
+    0x41,0x0f,0x5f,0xd8,                        //  maxps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_clamp_1[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x5d,0xc0,                        //  minps         %xmm8,%xmm0
+    0x41,0x0f,0x5d,0xc8,                        //  minps         %xmm8,%xmm1
+    0x41,0x0f,0x5d,0xd0,                        //  minps         %xmm8,%xmm2
+    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_clamp_a[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
+    0x0f,0x5d,0xc3,                             //  minps         %xmm3,%xmm0
+    0x0f,0x5d,0xcb,                             //  minps         %xmm3,%xmm1
+    0x0f,0x5d,0xd3,                             //  minps         %xmm3,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_swap[] = {
+    0x44,0x0f,0x28,0xc3,                        //  movaps        %xmm3,%xmm8
+    0x44,0x0f,0x28,0xca,                        //  movaps        %xmm2,%xmm9
+    0x44,0x0f,0x28,0xd1,                        //  movaps        %xmm1,%xmm10
+    0x44,0x0f,0x28,0xd8,                        //  movaps        %xmm0,%xmm11
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
+    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
+    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
+    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
+    0x41,0x0f,0x28,0xe3,                        //  movaps        %xmm11,%xmm4
+    0x41,0x0f,0x28,0xea,                        //  movaps        %xmm10,%xmm5
+    0x41,0x0f,0x28,0xf1,                        //  movaps        %xmm9,%xmm6
+    0x41,0x0f,0x28,0xf8,                        //  movaps        %xmm8,%xmm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_move_src_dst[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x28,0xe0,                             //  movaps        %xmm0,%xmm4
+    0x0f,0x28,0xe9,                             //  movaps        %xmm1,%xmm5
+    0x0f,0x28,0xf2,                             //  movaps        %xmm2,%xmm6
+    0x0f,0x28,0xfb,                             //  movaps        %xmm3,%xmm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_move_dst_src[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
+    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
+    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
+    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_premul[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x59,0xc3,                             //  mulps         %xmm3,%xmm0
+    0x0f,0x59,0xcb,                             //  mulps         %xmm3,%xmm1
+    0x0f,0x59,0xd3,                             //  mulps         %xmm3,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_unpremul[] = {
+    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x45,0x0f,0x57,0xc9,                        //  xorps         %xmm9,%xmm9
+    0xf3,0x44,0x0f,0x10,0x12,                   //  movss         (%rdx),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0x44,0x0f,0x5e,0xd3,                        //  divps         %xmm3,%xmm10
+    0x0f,0x28,0xc3,                             //  movaps        %xmm3,%xmm0
+    0x41,0x0f,0xc2,0xc1,0x00,                   //  cmpeqps       %xmm9,%xmm0
+    0x66,0x45,0x0f,0x38,0x14,0xd1,              //  blendvps      %xmm0,%xmm9,%xmm10
+    0x45,0x0f,0x59,0xc2,                        //  mulps         %xmm10,%xmm8
+    0x41,0x0f,0x59,0xca,                        //  mulps         %xmm10,%xmm1
+    0x41,0x0f,0x59,0xd2,                        //  mulps         %xmm10,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc0,                        //  movaps        %xmm8,%xmm0
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_from_srgb[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x5a,0x40,              //  movss         0x40(%rdx),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0x45,0x0f,0x28,0xd3,                        //  movaps        %xmm11,%xmm10
+    0x44,0x0f,0x59,0xd0,                        //  mulps         %xmm0,%xmm10
+    0x44,0x0f,0x28,0xf0,                        //  movaps        %xmm0,%xmm14
+    0x45,0x0f,0x59,0xf6,                        //  mulps         %xmm14,%xmm14
+    0xf3,0x44,0x0f,0x10,0x42,0x3c,              //  movss         0x3c(%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0xf3,0x44,0x0f,0x10,0x62,0x34,              //  movss         0x34(%rdx),%xmm12
+    0xf3,0x44,0x0f,0x10,0x6a,0x38,              //  movss         0x38(%rdx),%xmm13
+    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
+    0x45,0x0f,0x58,0xcd,                        //  addps         %xmm13,%xmm9
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0x45,0x0f,0x59,0xce,                        //  mulps         %xmm14,%xmm9
+    0x45,0x0f,0x58,0xcc,                        //  addps         %xmm12,%xmm9
+    0xf3,0x44,0x0f,0x10,0x72,0x44,              //  movss         0x44(%rdx),%xmm14
+    0x45,0x0f,0xc6,0xf6,0x00,                   //  shufps        $0x0,%xmm14,%xmm14
+    0x41,0x0f,0xc2,0xc6,0x01,                   //  cmpltps       %xmm14,%xmm0
+    0x66,0x45,0x0f,0x38,0x14,0xca,              //  blendvps      %xmm0,%xmm10,%xmm9
+    0x45,0x0f,0x28,0xfb,                        //  movaps        %xmm11,%xmm15
+    0x44,0x0f,0x59,0xf9,                        //  mulps         %xmm1,%xmm15
+    0x0f,0x28,0xc1,                             //  movaps        %xmm1,%xmm0
+    0x0f,0x59,0xc0,                             //  mulps         %xmm0,%xmm0
+    0x45,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm10
+    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
+    0x45,0x0f,0x58,0xd5,                        //  addps         %xmm13,%xmm10
+    0x44,0x0f,0x59,0xd0,                        //  mulps         %xmm0,%xmm10
+    0x45,0x0f,0x58,0xd4,                        //  addps         %xmm12,%xmm10
+    0x41,0x0f,0xc2,0xce,0x01,                   //  cmpltps       %xmm14,%xmm1
+    0x0f,0x28,0xc1,                             //  movaps        %xmm1,%xmm0
+    0x66,0x45,0x0f,0x38,0x14,0xd7,              //  blendvps      %xmm0,%xmm15,%xmm10
+    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
+    0x0f,0x28,0xc2,                             //  movaps        %xmm2,%xmm0
+    0x0f,0x59,0xc0,                             //  mulps         %xmm0,%xmm0
+    0x44,0x0f,0x59,0xc2,                        //  mulps         %xmm2,%xmm8
+    0x45,0x0f,0x58,0xc5,                        //  addps         %xmm13,%xmm8
+    0x44,0x0f,0x59,0xc0,                        //  mulps         %xmm0,%xmm8
+    0x45,0x0f,0x58,0xc4,                        //  addps         %xmm12,%xmm8
+    0x41,0x0f,0xc2,0xd6,0x01,                   //  cmpltps       %xmm14,%xmm2
+    0x0f,0x28,0xc2,                             //  movaps        %xmm2,%xmm0
+    0x66,0x45,0x0f,0x38,0x14,0xc3,              //  blendvps      %xmm0,%xmm11,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc1,                        //  movaps        %xmm9,%xmm0
+    0x41,0x0f,0x28,0xca,                        //  movaps        %xmm10,%xmm1
+    0x41,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm2
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_to_srgb[] = {
+    0x48,0x83,0xec,0x18,                        //  sub           $0x18,%rsp
+    0x0f,0x29,0x3c,0x24,                        //  movaps        %xmm7,(%rsp)
+    0x0f,0x28,0xfe,                             //  movaps        %xmm6,%xmm7
+    0x0f,0x28,0xf5,                             //  movaps        %xmm5,%xmm6
+    0x0f,0x28,0xec,                             //  movaps        %xmm4,%xmm5
+    0x0f,0x28,0xe3,                             //  movaps        %xmm3,%xmm4
+    0x44,0x0f,0x28,0xc2,                        //  movaps        %xmm2,%xmm8
+    0x0f,0x28,0xd9,                             //  movaps        %xmm1,%xmm3
+    0x0f,0x52,0xd0,                             //  rsqrtps       %xmm0,%xmm2
+    0x44,0x0f,0x53,0xca,                        //  rcpps         %xmm2,%xmm9
+    0x44,0x0f,0x52,0xd2,                        //  rsqrtps       %xmm2,%xmm10
+    0xf3,0x0f,0x10,0x12,                        //  movss         (%rdx),%xmm2
+    0xf3,0x44,0x0f,0x10,0x5a,0x48,              //  movss         0x48(%rdx),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0x41,0x0f,0x28,0xcb,                        //  movaps        %xmm11,%xmm1
+    0x0f,0x59,0xc8,                             //  mulps         %xmm0,%xmm1
+    0x0f,0xc6,0xd2,0x00,                        //  shufps        $0x0,%xmm2,%xmm2
+    0xf3,0x44,0x0f,0x10,0x62,0x4c,              //  movss         0x4c(%rdx),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0xf3,0x44,0x0f,0x10,0x6a,0x50,              //  movss         0x50(%rdx),%xmm13
+    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
+    0xf3,0x44,0x0f,0x10,0x72,0x54,              //  movss         0x54(%rdx),%xmm14
+    0x45,0x0f,0xc6,0xf6,0x00,                   //  shufps        $0x0,%xmm14,%xmm14
+    0x45,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm9
+    0x45,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm9
+    0x45,0x0f,0x59,0xd4,                        //  mulps         %xmm12,%xmm10
+    0x45,0x0f,0x58,0xd1,                        //  addps         %xmm9,%xmm10
+    0x44,0x0f,0x28,0xca,                        //  movaps        %xmm2,%xmm9
+    0x45,0x0f,0x5d,0xca,                        //  minps         %xmm10,%xmm9
+    0xf3,0x44,0x0f,0x10,0x7a,0x58,              //  movss         0x58(%rdx),%xmm15
+    0x45,0x0f,0xc6,0xff,0x00,                   //  shufps        $0x0,%xmm15,%xmm15
+    0x41,0x0f,0xc2,0xc7,0x01,                   //  cmpltps       %xmm15,%xmm0
+    0x66,0x44,0x0f,0x38,0x14,0xc9,              //  blendvps      %xmm0,%xmm1,%xmm9
+    0x0f,0x52,0xc3,                             //  rsqrtps       %xmm3,%xmm0
+    0x0f,0x53,0xc8,                             //  rcpps         %xmm0,%xmm1
+    0x0f,0x52,0xc0,                             //  rsqrtps       %xmm0,%xmm0
+    0x41,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm1
+    0x41,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm1
+    0x41,0x0f,0x59,0xc4,                        //  mulps         %xmm12,%xmm0
+    0x0f,0x58,0xc1,                             //  addps         %xmm1,%xmm0
+    0x44,0x0f,0x28,0xd2,                        //  movaps        %xmm2,%xmm10
+    0x44,0x0f,0x5d,0xd0,                        //  minps         %xmm0,%xmm10
+    0x41,0x0f,0x28,0xcb,                        //  movaps        %xmm11,%xmm1
+    0x0f,0x59,0xcb,                             //  mulps         %xmm3,%xmm1
+    0x41,0x0f,0xc2,0xdf,0x01,                   //  cmpltps       %xmm15,%xmm3
+    0x0f,0x28,0xc3,                             //  movaps        %xmm3,%xmm0
+    0x66,0x44,0x0f,0x38,0x14,0xd1,              //  blendvps      %xmm0,%xmm1,%xmm10
+    0x41,0x0f,0x52,0xc0,                        //  rsqrtps       %xmm8,%xmm0
+    0x0f,0x53,0xc8,                             //  rcpps         %xmm0,%xmm1
+    0x41,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm1
+    0x41,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm1
+    0x0f,0x52,0xc0,                             //  rsqrtps       %xmm0,%xmm0
+    0x41,0x0f,0x59,0xc4,                        //  mulps         %xmm12,%xmm0
+    0x0f,0x58,0xc1,                             //  addps         %xmm1,%xmm0
+    0x0f,0x5d,0xd0,                             //  minps         %xmm0,%xmm2
+    0x45,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm11
+    0x45,0x0f,0xc2,0xc7,0x01,                   //  cmpltps       %xmm15,%xmm8
+    0x41,0x0f,0x28,0xc0,                        //  movaps        %xmm8,%xmm0
+    0x66,0x41,0x0f,0x38,0x14,0xd3,              //  blendvps      %xmm0,%xmm11,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc1,                        //  movaps        %xmm9,%xmm0
+    0x41,0x0f,0x28,0xca,                        //  movaps        %xmm10,%xmm1
+    0x0f,0x28,0xdc,                             //  movaps        %xmm4,%xmm3
+    0x0f,0x28,0xe5,                             //  movaps        %xmm5,%xmm4
+    0x0f,0x28,0xee,                             //  movaps        %xmm6,%xmm5
+    0x0f,0x28,0xf7,                             //  movaps        %xmm7,%xmm6
+    0x0f,0x28,0x3c,0x24,                        //  movaps        (%rsp),%xmm7
+    0x48,0x83,0xc4,0x18,                        //  add           $0x18,%rsp
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_scale_u8[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0x66,0x44,0x0f,0x38,0x31,0x04,0x38,         //  pmovzxbd      (%rax,%rdi,1),%xmm8
+    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
+    0xf3,0x44,0x0f,0x10,0x4a,0x0c,              //  movss         0xc(%rdx),%xmm9
+    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
+    0x45,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm9
+    0x41,0x0f,0x59,0xc1,                        //  mulps         %xmm9,%xmm0
+    0x41,0x0f,0x59,0xc9,                        //  mulps         %xmm9,%xmm1
+    0x41,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm2
+    0x41,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_load_tables[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x08,                             //  mov           (%rax),%rcx
+    0x4c,0x8b,0x40,0x08,                        //  mov           0x8(%rax),%r8
+    0xf3,0x44,0x0f,0x6f,0x04,0xb9,              //  movdqu        (%rcx,%rdi,4),%xmm8
+    0x66,0x0f,0x6e,0x42,0x10,                   //  movd          0x10(%rdx),%xmm0
+    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
+    0x66,0x41,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm1
+    0x66,0x0f,0x72,0xd1,0x08,                   //  psrld         $0x8,%xmm1
+    0x66,0x0f,0xdb,0xc8,                        //  pand          %xmm0,%xmm1
+    0x66,0x41,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm2
+    0x66,0x0f,0x72,0xd2,0x10,                   //  psrld         $0x10,%xmm2
+    0x66,0x0f,0xdb,0xd0,                        //  pand          %xmm0,%xmm2
+    0x66,0x41,0x0f,0xdb,0xc0,                   //  pand          %xmm8,%xmm0
+    0x66,0x48,0x0f,0x3a,0x16,0xc1,0x01,         //  pextrq        $0x1,%xmm0,%rcx
+    0x41,0x89,0xc9,                             //  mov           %ecx,%r9d
+    0x48,0xc1,0xe9,0x20,                        //  shr           $0x20,%rcx
+    0x66,0x49,0x0f,0x7e,0xc2,                   //  movq          %xmm0,%r10
+    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
+    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
+    0xf3,0x43,0x0f,0x10,0x04,0x98,              //  movss         (%r8,%r11,4),%xmm0
+    0x66,0x43,0x0f,0x3a,0x21,0x04,0x90,0x10,    //  insertps      $0x10,(%r8,%r10,4),%xmm0
+    0x66,0x43,0x0f,0x3a,0x21,0x04,0x88,0x20,    //  insertps      $0x20,(%r8,%r9,4),%xmm0
+    0x66,0x41,0x0f,0x3a,0x21,0x04,0x88,0x30,    //  insertps      $0x30,(%r8,%rcx,4),%xmm0
+    0x48,0x8b,0x48,0x10,                        //  mov           0x10(%rax),%rcx
+    0x66,0x49,0x0f,0x3a,0x16,0xc8,0x01,         //  pextrq        $0x1,%xmm1,%r8
+    0x45,0x89,0xc1,                             //  mov           %r8d,%r9d
+    0x49,0xc1,0xe8,0x20,                        //  shr           $0x20,%r8
+    0x66,0x49,0x0f,0x7e,0xca,                   //  movq          %xmm1,%r10
+    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
+    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
+    0xf3,0x42,0x0f,0x10,0x0c,0x99,              //  movss         (%rcx,%r11,4),%xmm1
+    0x66,0x42,0x0f,0x3a,0x21,0x0c,0x91,0x10,    //  insertps      $0x10,(%rcx,%r10,4),%xmm1
+    0xf3,0x42,0x0f,0x10,0x1c,0x89,              //  movss         (%rcx,%r9,4),%xmm3
+    0x66,0x0f,0x3a,0x21,0xcb,0x20,              //  insertps      $0x20,%xmm3,%xmm1
+    0xf3,0x42,0x0f,0x10,0x1c,0x81,              //  movss         (%rcx,%r8,4),%xmm3
+    0x66,0x0f,0x3a,0x21,0xcb,0x30,              //  insertps      $0x30,%xmm3,%xmm1
+    0x48,0x8b,0x40,0x18,                        //  mov           0x18(%rax),%rax
+    0x66,0x48,0x0f,0x3a,0x16,0xd1,0x01,         //  pextrq        $0x1,%xmm2,%rcx
+    0x41,0x89,0xc8,                             //  mov           %ecx,%r8d
+    0x48,0xc1,0xe9,0x20,                        //  shr           $0x20,%rcx
+    0x66,0x49,0x0f,0x7e,0xd1,                   //  movq          %xmm2,%r9
+    0x45,0x89,0xca,                             //  mov           %r9d,%r10d
+    0x49,0xc1,0xe9,0x20,                        //  shr           $0x20,%r9
+    0xf3,0x42,0x0f,0x10,0x14,0x90,              //  movss         (%rax,%r10,4),%xmm2
+    0x66,0x42,0x0f,0x3a,0x21,0x14,0x88,0x10,    //  insertps      $0x10,(%rax,%r9,4),%xmm2
+    0xf3,0x42,0x0f,0x10,0x1c,0x80,              //  movss         (%rax,%r8,4),%xmm3
+    0x66,0x0f,0x3a,0x21,0xd3,0x20,              //  insertps      $0x20,%xmm3,%xmm2
+    0xf3,0x0f,0x10,0x1c,0x88,                   //  movss         (%rax,%rcx,4),%xmm3
+    0x66,0x0f,0x3a,0x21,0xd3,0x30,              //  insertps      $0x30,%xmm3,%xmm2
+    0x66,0x41,0x0f,0x72,0xd0,0x18,              //  psrld         $0x18,%xmm8
+    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
+    0xf3,0x0f,0x10,0x5a,0x0c,                   //  movss         0xc(%rdx),%xmm3
+    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_load_8888[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xf3,0x0f,0x6f,0x1c,0xb8,                   //  movdqu        (%rax,%rdi,4),%xmm3
+    0x66,0x0f,0x6e,0x42,0x10,                   //  movd          0x10(%rdx),%xmm0
+    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
+    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
+    0x66,0x0f,0x72,0xd1,0x08,                   //  psrld         $0x8,%xmm1
+    0x66,0x0f,0xdb,0xc8,                        //  pand          %xmm0,%xmm1
+    0x66,0x0f,0x6f,0xd3,                        //  movdqa        %xmm3,%xmm2
+    0x66,0x0f,0x72,0xd2,0x10,                   //  psrld         $0x10,%xmm2
+    0x66,0x0f,0xdb,0xd0,                        //  pand          %xmm0,%xmm2
+    0x66,0x0f,0xdb,0xc3,                        //  pand          %xmm3,%xmm0
+    0x0f,0x5b,0xc0,                             //  cvtdq2ps      %xmm0,%xmm0
+    0xf3,0x44,0x0f,0x10,0x42,0x0c,              //  movss         0xc(%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x0f,0x5b,0xc9,                             //  cvtdq2ps      %xmm1,%xmm1
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x0f,0x5b,0xd2,                             //  cvtdq2ps      %xmm2,%xmm2
+    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
+    0x66,0x0f,0x72,0xd3,0x18,                   //  psrld         $0x18,%xmm3
+    0x0f,0x5b,0xdb,                             //  cvtdq2ps      %xmm3,%xmm3
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_store_8888[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xf3,0x44,0x0f,0x10,0x42,0x08,              //  movss         0x8(%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
+    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
+    0x45,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm10
+    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
+    0x66,0x45,0x0f,0x5b,0xd2,                   //  cvtps2dq      %xmm10,%xmm10
+    0x66,0x41,0x0f,0x72,0xf2,0x08,              //  pslld         $0x8,%xmm10
+    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
+    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
+    0x66,0x41,0x0f,0x72,0xf1,0x10,              //  pslld         $0x10,%xmm9
+    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
+    0x66,0x45,0x0f,0x5b,0xc0,                   //  cvtps2dq      %xmm8,%xmm8
+    0x66,0x41,0x0f,0x72,0xf0,0x18,              //  pslld         $0x18,%xmm8
+    0x66,0x45,0x0f,0xeb,0xc1,                   //  por           %xmm9,%xmm8
+    0x66,0x45,0x0f,0xeb,0xc2,                   //  por           %xmm10,%xmm8
+    0xf3,0x44,0x0f,0x7f,0x04,0xb8,              //  movdqu        %xmm8,(%rax,%rdi,4)
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_load_f16[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xf3,0x0f,0x6f,0x04,0xf8,                   //  movdqu        (%rax,%rdi,8),%xmm0
+    0xf3,0x0f,0x6f,0x4c,0xf8,0x10,              //  movdqu        0x10(%rax,%rdi,8),%xmm1
+    0x66,0x0f,0x6f,0xd8,                        //  movdqa        %xmm0,%xmm3
+    0x66,0x0f,0x61,0xd9,                        //  punpcklwd     %xmm1,%xmm3
+    0x66,0x0f,0x69,0xc1,                        //  punpckhwd     %xmm1,%xmm0
+    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
+    0x66,0x0f,0x61,0xc8,                        //  punpcklwd     %xmm0,%xmm1
+    0x66,0x0f,0x69,0xd8,                        //  punpckhwd     %xmm0,%xmm3
+    0x66,0x0f,0x38,0x33,0xc1,                   //  pmovzxwd      %xmm1,%xmm0
+    0x66,0x0f,0x72,0xf0,0x0d,                   //  pslld         $0xd,%xmm0
+    0x66,0x0f,0x6e,0x52,0x5c,                   //  movd          0x5c(%rdx),%xmm2
+    0x66,0x44,0x0f,0x70,0xc2,0x00,              //  pshufd        $0x0,%xmm2,%xmm8
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x66,0x45,0x0f,0xef,0xc9,                   //  pxor          %xmm9,%xmm9
+    0x66,0x41,0x0f,0x69,0xc9,                   //  punpckhwd     %xmm9,%xmm1
+    0x66,0x0f,0x72,0xf1,0x0d,                   //  pslld         $0xd,%xmm1
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x66,0x0f,0x38,0x33,0xd3,                   //  pmovzxwd      %xmm3,%xmm2
+    0x66,0x0f,0x72,0xf2,0x0d,                   //  pslld         $0xd,%xmm2
+    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
+    0x66,0x41,0x0f,0x69,0xd9,                   //  punpckhwd     %xmm9,%xmm3
+    0x66,0x0f,0x72,0xf3,0x0d,                   //  pslld         $0xd,%xmm3
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_store_f16[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0x66,0x44,0x0f,0x6e,0x42,0x60,              //  movd          0x60(%rdx),%xmm8
+    0x66,0x45,0x0f,0x70,0xc0,0x00,              //  pshufd        $0x0,%xmm8,%xmm8
+    0x66,0x45,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
+    0x66,0x41,0x0f,0x72,0xd1,0x0d,              //  psrld         $0xd,%xmm9
+    0x66,0x45,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm10
+    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
+    0x66,0x41,0x0f,0x72,0xd2,0x0d,              //  psrld         $0xd,%xmm10
+    0x66,0x45,0x0f,0x6f,0xd8,                   //  movdqa        %xmm8,%xmm11
+    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
+    0x66,0x41,0x0f,0x72,0xd3,0x0d,              //  psrld         $0xd,%xmm11
+    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
+    0x66,0x41,0x0f,0x72,0xd0,0x0d,              //  psrld         $0xd,%xmm8
+    0x66,0x41,0x0f,0x73,0xfa,0x02,              //  pslldq        $0x2,%xmm10
+    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
+    0x66,0x41,0x0f,0x73,0xf8,0x02,              //  pslldq        $0x2,%xmm8
+    0x66,0x45,0x0f,0xeb,0xc3,                   //  por           %xmm11,%xmm8
+    0x66,0x45,0x0f,0x6f,0xca,                   //  movdqa        %xmm10,%xmm9
+    0x66,0x45,0x0f,0x62,0xc8,                   //  punpckldq     %xmm8,%xmm9
+    0xf3,0x44,0x0f,0x7f,0x0c,0xf8,              //  movdqu        %xmm9,(%rax,%rdi,8)
+    0x66,0x45,0x0f,0x6a,0xd0,                   //  punpckhdq     %xmm8,%xmm10
+    0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10,         //  movdqu        %xmm10,0x10(%rax,%rdi,8)
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_clamp_x[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x00,                   //  movss         (%rax),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x66,0x45,0x0f,0x76,0xc9,                   //  pcmpeqd       %xmm9,%xmm9
+    0x66,0x45,0x0f,0xfe,0xc8,                   //  paddd         %xmm8,%xmm9
+    0x41,0x0f,0x5d,0xc1,                        //  minps         %xmm9,%xmm0
+    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
+    0x44,0x0f,0x5f,0xc0,                        //  maxps         %xmm0,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc0,                        //  movaps        %xmm8,%xmm0
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_clamp_y[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x00,                   //  movss         (%rax),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x66,0x45,0x0f,0x76,0xc9,                   //  pcmpeqd       %xmm9,%xmm9
+    0x66,0x45,0x0f,0xfe,0xc8,                   //  paddd         %xmm8,%xmm9
+    0x41,0x0f,0x5d,0xc9,                        //  minps         %xmm9,%xmm1
+    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
+    0x44,0x0f,0x5f,0xc1,                        //  maxps         %xmm1,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm1
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_matrix_2x3[] = {
+    0x44,0x0f,0x28,0xc9,                        //  movaps        %xmm1,%xmm9
+    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x0f,0x10,0x00,                        //  movss         (%rax),%xmm0
+    0xf3,0x0f,0x10,0x48,0x04,                   //  movss         0x4(%rax),%xmm1
+    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
+    0xf3,0x44,0x0f,0x10,0x50,0x08,              //  movss         0x8(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x10,              //  movss         0x10(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x41,0x0f,0x58,0xc2,                        //  addps         %xmm10,%xmm0
+    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
+    0xf3,0x44,0x0f,0x10,0x50,0x0c,              //  movss         0xc(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x14,              //  movss         0x14(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x41,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm1
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_matrix_3x4[] = {
+    0x44,0x0f,0x28,0xc9,                        //  movaps        %xmm1,%xmm9
+    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x0f,0x10,0x00,                        //  movss         (%rax),%xmm0
+    0xf3,0x0f,0x10,0x48,0x04,                   //  movss         0x4(%rax),%xmm1
+    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
+    0xf3,0x44,0x0f,0x10,0x50,0x0c,              //  movss         0xc(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x18,              //  movss         0x18(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0xf3,0x44,0x0f,0x10,0x60,0x24,              //  movss         0x24(%rax),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
+    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x41,0x0f,0x58,0xc2,                        //  addps         %xmm10,%xmm0
+    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
+    0xf3,0x44,0x0f,0x10,0x50,0x10,              //  movss         0x10(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x1c,              //  movss         0x1c(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0xf3,0x44,0x0f,0x10,0x60,0x28,              //  movss         0x28(%rax),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
+    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x41,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm1
+    0xf3,0x44,0x0f,0x10,0x50,0x08,              //  movss         0x8(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x14,              //  movss         0x14(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0xf3,0x44,0x0f,0x10,0x60,0x20,              //  movss         0x20(%rax),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0xf3,0x44,0x0f,0x10,0x68,0x2c,              //  movss         0x2c(%rax),%xmm13
+    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
+    0x44,0x0f,0x59,0xe2,                        //  mulps         %xmm2,%xmm12
+    0x45,0x0f,0x58,0xe5,                        //  addps         %xmm13,%xmm12
+    0x45,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm11
+    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
+    0x45,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xd2,                        //  movaps        %xmm10,%xmm2
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse41_sk_linear_gradient_2stops[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x44,0x0f,0x10,0x08,                        //  movups        (%rax),%xmm9
+    0x0f,0x10,0x58,0x10,                        //  movups        0x10(%rax),%xmm3
+    0x44,0x0f,0x28,0xc3,                        //  movaps        %xmm3,%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x28,0xc9,                        //  movaps        %xmm9,%xmm1
+    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
+    0x44,0x0f,0x59,0xc0,                        //  mulps         %xmm0,%xmm8
+    0x44,0x0f,0x58,0xc1,                        //  addps         %xmm1,%xmm8
+    0x0f,0x28,0xcb,                             //  movaps        %xmm3,%xmm1
+    0x0f,0xc6,0xc9,0x55,                        //  shufps        $0x55,%xmm1,%xmm1
+    0x41,0x0f,0x28,0xd1,                        //  movaps        %xmm9,%xmm2
+    0x0f,0xc6,0xd2,0x55,                        //  shufps        $0x55,%xmm2,%xmm2
+    0x0f,0x59,0xc8,                             //  mulps         %xmm0,%xmm1
+    0x0f,0x58,0xca,                             //  addps         %xmm2,%xmm1
+    0x0f,0x28,0xd3,                             //  movaps        %xmm3,%xmm2
+    0x0f,0xc6,0xd2,0xaa,                        //  shufps        $0xaa,%xmm2,%xmm2
+    0x45,0x0f,0x28,0xd1,                        //  movaps        %xmm9,%xmm10
+    0x45,0x0f,0xc6,0xd2,0xaa,                   //  shufps        $0xaa,%xmm10,%xmm10
+    0x0f,0x59,0xd0,                             //  mulps         %xmm0,%xmm2
+    0x41,0x0f,0x58,0xd2,                        //  addps         %xmm10,%xmm2
+    0x0f,0xc6,0xdb,0xff,                        //  shufps        $0xff,%xmm3,%xmm3
+    0x45,0x0f,0xc6,0xc9,0xff,                   //  shufps        $0xff,%xmm9,%xmm9
+    0x0f,0x59,0xd8,                             //  mulps         %xmm0,%xmm3
+    0x41,0x0f,0x58,0xd9,                        //  addps         %xmm9,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc0,                        //  movaps        %xmm8,%xmm0
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_just_return[] = {
+    0xc3,                                       //  retq
+};
+static const unsigned char sse2_sk_seed_shader[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x66,0x0f,0x6e,0xc7,                        //  movd          %edi,%xmm0
+    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
+    0x0f,0x5b,0xc8,                             //  cvtdq2ps      %xmm0,%xmm1
+    0xf3,0x0f,0x10,0x12,                        //  movss         (%rdx),%xmm2
+    0xf3,0x0f,0x10,0x5a,0x04,                   //  movss         0x4(%rdx),%xmm3
+    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
+    0x0f,0x58,0xcb,                             //  addps         %xmm3,%xmm1
+    0x0f,0x10,0x42,0x14,                        //  movups        0x14(%rdx),%xmm0
+    0x0f,0x58,0xc1,                             //  addps         %xmm1,%xmm0
+    0x66,0x0f,0x6e,0x08,                        //  movd          (%rax),%xmm1
+    0x66,0x0f,0x70,0xc9,0x00,                   //  pshufd        $0x0,%xmm1,%xmm1
+    0x0f,0x5b,0xc9,                             //  cvtdq2ps      %xmm1,%xmm1
+    0x0f,0x58,0xcb,                             //  addps         %xmm3,%xmm1
+    0x0f,0xc6,0xd2,0x00,                        //  shufps        $0x0,%xmm2,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xdb,                             //  xorps         %xmm3,%xmm3
+    0x0f,0x57,0xe4,                             //  xorps         %xmm4,%xmm4
+    0x0f,0x57,0xed,                             //  xorps         %xmm5,%xmm5
+    0x0f,0x57,0xf6,                             //  xorps         %xmm6,%xmm6
+    0x0f,0x57,0xff,                             //  xorps         %xmm7,%xmm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_constant_color[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x10,0x18,                             //  movups        (%rax),%xmm3
+    0x0f,0x28,0xc3,                             //  movaps        %xmm3,%xmm0
+    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
+    0x0f,0x28,0xcb,                             //  movaps        %xmm3,%xmm1
+    0x0f,0xc6,0xc9,0x55,                        //  shufps        $0x55,%xmm1,%xmm1
+    0x0f,0x28,0xd3,                             //  movaps        %xmm3,%xmm2
+    0x0f,0xc6,0xd2,0xaa,                        //  shufps        $0xaa,%xmm2,%xmm2
+    0x0f,0xc6,0xdb,0xff,                        //  shufps        $0xff,%xmm3,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_clear[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x57,0xc0,                             //  xorps         %xmm0,%xmm0
+    0x0f,0x57,0xc9,                             //  xorps         %xmm1,%xmm1
+    0x0f,0x57,0xd2,                             //  xorps         %xmm2,%xmm2
+    0x0f,0x57,0xdb,                             //  xorps         %xmm3,%xmm3
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_plus_[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x58,0xc4,                             //  addps         %xmm4,%xmm0
+    0x0f,0x58,0xcd,                             //  addps         %xmm5,%xmm1
+    0x0f,0x58,0xd6,                             //  addps         %xmm6,%xmm2
+    0x0f,0x58,0xdf,                             //  addps         %xmm7,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_srcover[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x44,0x0f,0x5c,0xc3,                        //  subps         %xmm3,%xmm8
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xcc,                        //  mulps         %xmm4,%xmm9
+    0x41,0x0f,0x58,0xc1,                        //  addps         %xmm9,%xmm0
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xcd,                        //  mulps         %xmm5,%xmm9
+    0x41,0x0f,0x58,0xc9,                        //  addps         %xmm9,%xmm1
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xce,                        //  mulps         %xmm6,%xmm9
+    0x41,0x0f,0x58,0xd1,                        //  addps         %xmm9,%xmm2
+    0x44,0x0f,0x59,0xc7,                        //  mulps         %xmm7,%xmm8
+    0x41,0x0f,0x58,0xd8,                        //  addps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_dstover[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x44,0x0f,0x5c,0xc7,                        //  subps         %xmm7,%xmm8
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x0f,0x58,0xc4,                             //  addps         %xmm4,%xmm0
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x0f,0x58,0xcd,                             //  addps         %xmm5,%xmm1
+    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
+    0x0f,0x58,0xd6,                             //  addps         %xmm6,%xmm2
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x0f,0x58,0xdf,                             //  addps         %xmm7,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_clamp_0[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
+    0x41,0x0f,0x5f,0xc0,                        //  maxps         %xmm8,%xmm0
+    0x41,0x0f,0x5f,0xc8,                        //  maxps         %xmm8,%xmm1
+    0x41,0x0f,0x5f,0xd0,                        //  maxps         %xmm8,%xmm2
+    0x41,0x0f,0x5f,0xd8,                        //  maxps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_clamp_1[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x5d,0xc0,                        //  minps         %xmm8,%xmm0
+    0x41,0x0f,0x5d,0xc8,                        //  minps         %xmm8,%xmm1
+    0x41,0x0f,0x5d,0xd0,                        //  minps         %xmm8,%xmm2
+    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_clamp_a[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x02,                   //  movss         (%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x5d,0xd8,                        //  minps         %xmm8,%xmm3
+    0x0f,0x5d,0xc3,                             //  minps         %xmm3,%xmm0
+    0x0f,0x5d,0xcb,                             //  minps         %xmm3,%xmm1
+    0x0f,0x5d,0xd3,                             //  minps         %xmm3,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_swap[] = {
+    0x44,0x0f,0x28,0xc3,                        //  movaps        %xmm3,%xmm8
+    0x44,0x0f,0x28,0xca,                        //  movaps        %xmm2,%xmm9
+    0x44,0x0f,0x28,0xd1,                        //  movaps        %xmm1,%xmm10
+    0x44,0x0f,0x28,0xd8,                        //  movaps        %xmm0,%xmm11
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
+    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
+    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
+    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
+    0x41,0x0f,0x28,0xe3,                        //  movaps        %xmm11,%xmm4
+    0x41,0x0f,0x28,0xea,                        //  movaps        %xmm10,%xmm5
+    0x41,0x0f,0x28,0xf1,                        //  movaps        %xmm9,%xmm6
+    0x41,0x0f,0x28,0xf8,                        //  movaps        %xmm8,%xmm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_move_src_dst[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x28,0xe0,                             //  movaps        %xmm0,%xmm4
+    0x0f,0x28,0xe9,                             //  movaps        %xmm1,%xmm5
+    0x0f,0x28,0xf2,                             //  movaps        %xmm2,%xmm6
+    0x0f,0x28,0xfb,                             //  movaps        %xmm3,%xmm7
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_move_dst_src[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x28,0xc4,                             //  movaps        %xmm4,%xmm0
+    0x0f,0x28,0xcd,                             //  movaps        %xmm5,%xmm1
+    0x0f,0x28,0xd6,                             //  movaps        %xmm6,%xmm2
+    0x0f,0x28,0xdf,                             //  movaps        %xmm7,%xmm3
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_premul[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x59,0xc3,                             //  mulps         %xmm3,%xmm0
+    0x0f,0x59,0xcb,                             //  mulps         %xmm3,%xmm1
+    0x0f,0x59,0xd3,                             //  mulps         %xmm3,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_unpremul[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
+    0x44,0x0f,0xc2,0xc3,0x00,                   //  cmpeqps       %xmm3,%xmm8
+    0xf3,0x44,0x0f,0x10,0x0a,                   //  movss         (%rdx),%xmm9
+    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
+    0x44,0x0f,0x5e,0xcb,                        //  divps         %xmm3,%xmm9
+    0x45,0x0f,0x55,0xc1,                        //  andnps        %xmm9,%xmm8
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_from_srgb[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x42,0x40,              //  movss         0x40(%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x45,0x0f,0x28,0xe8,                        //  movaps        %xmm8,%xmm13
+    0x44,0x0f,0x59,0xe8,                        //  mulps         %xmm0,%xmm13
+    0x44,0x0f,0x28,0xe0,                        //  movaps        %xmm0,%xmm12
+    0x45,0x0f,0x59,0xe4,                        //  mulps         %xmm12,%xmm12
+    0xf3,0x44,0x0f,0x10,0x4a,0x3c,              //  movss         0x3c(%rdx),%xmm9
+    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
+    0xf3,0x44,0x0f,0x10,0x52,0x34,              //  movss         0x34(%rdx),%xmm10
+    0xf3,0x44,0x0f,0x10,0x5a,0x38,              //  movss         0x38(%rdx),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0x45,0x0f,0x28,0xf1,                        //  movaps        %xmm9,%xmm14
+    0x44,0x0f,0x59,0xf0,                        //  mulps         %xmm0,%xmm14
+    0x45,0x0f,0x58,0xf3,                        //  addps         %xmm11,%xmm14
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0x45,0x0f,0x59,0xf4,                        //  mulps         %xmm12,%xmm14
+    0x45,0x0f,0x58,0xf2,                        //  addps         %xmm10,%xmm14
+    0xf3,0x44,0x0f,0x10,0x62,0x44,              //  movss         0x44(%rdx),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0x41,0x0f,0xc2,0xc4,0x01,                   //  cmpltps       %xmm12,%xmm0
+    0x44,0x0f,0x54,0xe8,                        //  andps         %xmm0,%xmm13
+    0x41,0x0f,0x55,0xc6,                        //  andnps        %xmm14,%xmm0
+    0x41,0x0f,0x56,0xc5,                        //  orps          %xmm13,%xmm0
+    0x45,0x0f,0x28,0xe8,                        //  movaps        %xmm8,%xmm13
+    0x44,0x0f,0x59,0xe9,                        //  mulps         %xmm1,%xmm13
+    0x44,0x0f,0x28,0xf1,                        //  movaps        %xmm1,%xmm14
+    0x45,0x0f,0x59,0xf6,                        //  mulps         %xmm14,%xmm14
+    0x45,0x0f,0x28,0xf9,                        //  movaps        %xmm9,%xmm15
+    0x44,0x0f,0x59,0xf9,                        //  mulps         %xmm1,%xmm15
+    0x45,0x0f,0x58,0xfb,                        //  addps         %xmm11,%xmm15
+    0x45,0x0f,0x59,0xfe,                        //  mulps         %xmm14,%xmm15
+    0x45,0x0f,0x58,0xfa,                        //  addps         %xmm10,%xmm15
+    0x41,0x0f,0xc2,0xcc,0x01,                   //  cmpltps       %xmm12,%xmm1
+    0x44,0x0f,0x54,0xe9,                        //  andps         %xmm1,%xmm13
+    0x41,0x0f,0x55,0xcf,                        //  andnps        %xmm15,%xmm1
+    0x41,0x0f,0x56,0xcd,                        //  orps          %xmm13,%xmm1
+    0x44,0x0f,0x59,0xc2,                        //  mulps         %xmm2,%xmm8
+    0x44,0x0f,0x28,0xea,                        //  movaps        %xmm2,%xmm13
+    0x45,0x0f,0x59,0xed,                        //  mulps         %xmm13,%xmm13
+    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
+    0x45,0x0f,0x58,0xcb,                        //  addps         %xmm11,%xmm9
+    0x45,0x0f,0x59,0xcd,                        //  mulps         %xmm13,%xmm9
+    0x45,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm9
+    0x41,0x0f,0xc2,0xd4,0x01,                   //  cmpltps       %xmm12,%xmm2
+    0x44,0x0f,0x54,0xc2,                        //  andps         %xmm2,%xmm8
+    0x41,0x0f,0x55,0xd1,                        //  andnps        %xmm9,%xmm2
+    0x41,0x0f,0x56,0xd0,                        //  orps          %xmm8,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_to_srgb[] = {
+    0x48,0x83,0xec,0x28,                        //  sub           $0x28,%rsp
+    0x0f,0x29,0x7c,0x24,0x10,                   //  movaps        %xmm7,0x10(%rsp)
+    0x0f,0x29,0x34,0x24,                        //  movaps        %xmm6,(%rsp)
+    0x0f,0x28,0xf5,                             //  movaps        %xmm5,%xmm6
+    0x0f,0x28,0xec,                             //  movaps        %xmm4,%xmm5
+    0x0f,0x28,0xe3,                             //  movaps        %xmm3,%xmm4
+    0x44,0x0f,0x52,0xc0,                        //  rsqrtps       %xmm0,%xmm8
+    0x45,0x0f,0x53,0xe8,                        //  rcpps         %xmm8,%xmm13
+    0x45,0x0f,0x52,0xf8,                        //  rsqrtps       %xmm8,%xmm15
+    0xf3,0x0f,0x10,0x1a,                        //  movss         (%rdx),%xmm3
+    0xf3,0x44,0x0f,0x10,0x42,0x48,              //  movss         0x48(%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x45,0x0f,0x28,0xf0,                        //  movaps        %xmm8,%xmm14
+    0x44,0x0f,0x59,0xf0,                        //  mulps         %xmm0,%xmm14
+    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
+    0xf3,0x44,0x0f,0x10,0x52,0x4c,              //  movss         0x4c(%rdx),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x5a,0x50,              //  movss         0x50(%rdx),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0xf3,0x44,0x0f,0x10,0x62,0x54,              //  movss         0x54(%rdx),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0x45,0x0f,0x59,0xeb,                        //  mulps         %xmm11,%xmm13
+    0x45,0x0f,0x58,0xec,                        //  addps         %xmm12,%xmm13
+    0x45,0x0f,0x59,0xfa,                        //  mulps         %xmm10,%xmm15
+    0x45,0x0f,0x58,0xfd,                        //  addps         %xmm13,%xmm15
+    0x44,0x0f,0x28,0xcb,                        //  movaps        %xmm3,%xmm9
+    0x45,0x0f,0x5d,0xcf,                        //  minps         %xmm15,%xmm9
+    0xf3,0x44,0x0f,0x10,0x6a,0x58,              //  movss         0x58(%rdx),%xmm13
+    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
+    0x41,0x0f,0xc2,0xc5,0x01,                   //  cmpltps       %xmm13,%xmm0
+    0x44,0x0f,0x54,0xf0,                        //  andps         %xmm0,%xmm14
+    0x41,0x0f,0x55,0xc1,                        //  andnps        %xmm9,%xmm0
+    0x41,0x0f,0x56,0xc6,                        //  orps          %xmm14,%xmm0
+    0x44,0x0f,0x52,0xc9,                        //  rsqrtps       %xmm1,%xmm9
+    0x45,0x0f,0x53,0xf1,                        //  rcpps         %xmm9,%xmm14
+    0x45,0x0f,0x52,0xc9,                        //  rsqrtps       %xmm9,%xmm9
+    0x45,0x0f,0x59,0xf3,                        //  mulps         %xmm11,%xmm14
+    0x45,0x0f,0x58,0xf4,                        //  addps         %xmm12,%xmm14
+    0x45,0x0f,0x59,0xca,                        //  mulps         %xmm10,%xmm9
+    0x45,0x0f,0x58,0xce,                        //  addps         %xmm14,%xmm9
+    0x44,0x0f,0x28,0xf3,                        //  movaps        %xmm3,%xmm14
+    0x45,0x0f,0x5d,0xf1,                        //  minps         %xmm9,%xmm14
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xc9,                        //  mulps         %xmm1,%xmm9
+    0x41,0x0f,0xc2,0xcd,0x01,                   //  cmpltps       %xmm13,%xmm1
+    0x44,0x0f,0x54,0xc9,                        //  andps         %xmm1,%xmm9
+    0x41,0x0f,0x55,0xce,                        //  andnps        %xmm14,%xmm1
+    0x41,0x0f,0x56,0xc9,                        //  orps          %xmm9,%xmm1
+    0x44,0x0f,0x52,0xca,                        //  rsqrtps       %xmm2,%xmm9
+    0x45,0x0f,0x53,0xf1,                        //  rcpps         %xmm9,%xmm14
+    0x45,0x0f,0x59,0xf3,                        //  mulps         %xmm11,%xmm14
+    0x45,0x0f,0x58,0xf4,                        //  addps         %xmm12,%xmm14
+    0x41,0x0f,0x52,0xf9,                        //  rsqrtps       %xmm9,%xmm7
+    0x41,0x0f,0x59,0xfa,                        //  mulps         %xmm10,%xmm7
+    0x41,0x0f,0x58,0xfe,                        //  addps         %xmm14,%xmm7
+    0x0f,0x5d,0xdf,                             //  minps         %xmm7,%xmm3
+    0x44,0x0f,0x59,0xc2,                        //  mulps         %xmm2,%xmm8
+    0x41,0x0f,0xc2,0xd5,0x01,                   //  cmpltps       %xmm13,%xmm2
+    0x44,0x0f,0x54,0xc2,                        //  andps         %xmm2,%xmm8
+    0x0f,0x55,0xd3,                             //  andnps        %xmm3,%xmm2
+    0x41,0x0f,0x56,0xd0,                        //  orps          %xmm8,%xmm2
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x0f,0x28,0xdc,                             //  movaps        %xmm4,%xmm3
+    0x0f,0x28,0xe5,                             //  movaps        %xmm5,%xmm4
+    0x0f,0x28,0xee,                             //  movaps        %xmm6,%xmm5
+    0x0f,0x28,0x34,0x24,                        //  movaps        (%rsp),%xmm6
+    0x0f,0x28,0x7c,0x24,0x10,                   //  movaps        0x10(%rsp),%xmm7
+    0x48,0x83,0xc4,0x28,                        //  add           $0x28,%rsp
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_scale_u8[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0x66,0x44,0x0f,0x6e,0x04,0x38,              //  movd          (%rax,%rdi,1),%xmm8
+    0x66,0x45,0x0f,0xef,0xc9,                   //  pxor          %xmm9,%xmm9
+    0x66,0x45,0x0f,0x60,0xc1,                   //  punpcklbw     %xmm9,%xmm8
+    0x66,0x45,0x0f,0x61,0xc1,                   //  punpcklwd     %xmm9,%xmm8
+    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
+    0xf3,0x44,0x0f,0x10,0x4a,0x0c,              //  movss         0xc(%rdx),%xmm9
+    0x45,0x0f,0xc6,0xc9,0x00,                   //  shufps        $0x0,%xmm9,%xmm9
+    0x45,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm9
+    0x41,0x0f,0x59,0xc1,                        //  mulps         %xmm9,%xmm0
+    0x41,0x0f,0x59,0xc9,                        //  mulps         %xmm9,%xmm1
+    0x41,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm2
+    0x41,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_load_tables[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x08,                             //  mov           (%rax),%rcx
+    0x4c,0x8b,0x40,0x08,                        //  mov           0x8(%rax),%r8
+    0xf3,0x44,0x0f,0x6f,0x04,0xb9,              //  movdqu        (%rcx,%rdi,4),%xmm8
+    0x66,0x0f,0x6e,0x42,0x10,                   //  movd          0x10(%rdx),%xmm0
+    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
+    0x66,0x45,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm9
+    0x66,0x41,0x0f,0x72,0xd1,0x08,              //  psrld         $0x8,%xmm9
+    0x66,0x44,0x0f,0xdb,0xc8,                   //  pand          %xmm0,%xmm9
+    0x66,0x45,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm10
+    0x66,0x41,0x0f,0x72,0xd2,0x10,              //  psrld         $0x10,%xmm10
+    0x66,0x44,0x0f,0xdb,0xd0,                   //  pand          %xmm0,%xmm10
+    0x66,0x41,0x0f,0xdb,0xc0,                   //  pand          %xmm8,%xmm0
+    0x66,0x0f,0x70,0xd8,0x4e,                   //  pshufd        $0x4e,%xmm0,%xmm3
+    0x66,0x48,0x0f,0x7e,0xd9,                   //  movq          %xmm3,%rcx
+    0x41,0x89,0xc9,                             //  mov           %ecx,%r9d
+    0x48,0xc1,0xe9,0x20,                        //  shr           $0x20,%rcx
+    0x66,0x49,0x0f,0x7e,0xc2,                   //  movq          %xmm0,%r10
+    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
+    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
+    0xf3,0x43,0x0f,0x10,0x1c,0x90,              //  movss         (%r8,%r10,4),%xmm3
+    0xf3,0x41,0x0f,0x10,0x04,0x88,              //  movss         (%r8,%rcx,4),%xmm0
+    0x0f,0x14,0xd8,                             //  unpcklps      %xmm0,%xmm3
+    0xf3,0x43,0x0f,0x10,0x04,0x98,              //  movss         (%r8,%r11,4),%xmm0
+    0xf3,0x43,0x0f,0x10,0x0c,0x88,              //  movss         (%r8,%r9,4),%xmm1
+    0x0f,0x14,0xc1,                             //  unpcklps      %xmm1,%xmm0
+    0x0f,0x14,0xc3,                             //  unpcklps      %xmm3,%xmm0
+    0x48,0x8b,0x48,0x10,                        //  mov           0x10(%rax),%rcx
+    0x66,0x41,0x0f,0x70,0xc9,0x4e,              //  pshufd        $0x4e,%xmm9,%xmm1
+    0x66,0x49,0x0f,0x7e,0xc8,                   //  movq          %xmm1,%r8
+    0x45,0x89,0xc1,                             //  mov           %r8d,%r9d
+    0x49,0xc1,0xe8,0x20,                        //  shr           $0x20,%r8
+    0x66,0x4d,0x0f,0x7e,0xca,                   //  movq          %xmm9,%r10
+    0x45,0x89,0xd3,                             //  mov           %r10d,%r11d
+    0x49,0xc1,0xea,0x20,                        //  shr           $0x20,%r10
+    0xf3,0x42,0x0f,0x10,0x1c,0x91,              //  movss         (%rcx,%r10,4),%xmm3
+    0xf3,0x42,0x0f,0x10,0x0c,0x81,              //  movss         (%rcx,%r8,4),%xmm1
+    0x0f,0x14,0xd9,                             //  unpcklps      %xmm1,%xmm3
+    0xf3,0x42,0x0f,0x10,0x0c,0x99,              //  movss         (%rcx,%r11,4),%xmm1
+    0xf3,0x42,0x0f,0x10,0x14,0x89,              //  movss         (%rcx,%r9,4),%xmm2
+    0x0f,0x14,0xca,                             //  unpcklps      %xmm2,%xmm1
+    0x0f,0x14,0xcb,                             //  unpcklps      %xmm3,%xmm1
+    0x48,0x8b,0x40,0x18,                        //  mov           0x18(%rax),%rax
+    0x66,0x41,0x0f,0x70,0xd2,0x4e,              //  pshufd        $0x4e,%xmm10,%xmm2
+    0x66,0x48,0x0f,0x7e,0xd1,                   //  movq          %xmm2,%rcx
+    0x41,0x89,0xc8,                             //  mov           %ecx,%r8d
+    0x48,0xc1,0xe9,0x20,                        //  shr           $0x20,%rcx
+    0x66,0x4d,0x0f,0x7e,0xd1,                   //  movq          %xmm10,%r9
+    0x45,0x89,0xca,                             //  mov           %r9d,%r10d
+    0x49,0xc1,0xe9,0x20,                        //  shr           $0x20,%r9
+    0xf3,0x46,0x0f,0x10,0x0c,0x88,              //  movss         (%rax,%r9,4),%xmm9
+    0xf3,0x0f,0x10,0x14,0x88,                   //  movss         (%rax,%rcx,4),%xmm2
+    0x44,0x0f,0x14,0xca,                        //  unpcklps      %xmm2,%xmm9
+    0xf3,0x42,0x0f,0x10,0x14,0x90,              //  movss         (%rax,%r10,4),%xmm2
+    0xf3,0x42,0x0f,0x10,0x1c,0x80,              //  movss         (%rax,%r8,4),%xmm3
+    0x0f,0x14,0xd3,                             //  unpcklps      %xmm3,%xmm2
+    0x41,0x0f,0x14,0xd1,                        //  unpcklps      %xmm9,%xmm2
+    0x66,0x41,0x0f,0x72,0xd0,0x18,              //  psrld         $0x18,%xmm8
+    0x45,0x0f,0x5b,0xc0,                        //  cvtdq2ps      %xmm8,%xmm8
+    0xf3,0x0f,0x10,0x5a,0x0c,                   //  movss         0xc(%rdx),%xmm3
+    0x0f,0xc6,0xdb,0x00,                        //  shufps        $0x0,%xmm3,%xmm3
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_load_8888[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xf3,0x0f,0x6f,0x1c,0xb8,                   //  movdqu        (%rax,%rdi,4),%xmm3
+    0x66,0x0f,0x6e,0x42,0x10,                   //  movd          0x10(%rdx),%xmm0
+    0x66,0x0f,0x70,0xc0,0x00,                   //  pshufd        $0x0,%xmm0,%xmm0
+    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
+    0x66,0x0f,0x72,0xd1,0x08,                   //  psrld         $0x8,%xmm1
+    0x66,0x0f,0xdb,0xc8,                        //  pand          %xmm0,%xmm1
+    0x66,0x0f,0x6f,0xd3,                        //  movdqa        %xmm3,%xmm2
+    0x66,0x0f,0x72,0xd2,0x10,                   //  psrld         $0x10,%xmm2
+    0x66,0x0f,0xdb,0xd0,                        //  pand          %xmm0,%xmm2
+    0x66,0x0f,0xdb,0xc3,                        //  pand          %xmm3,%xmm0
+    0x0f,0x5b,0xc0,                             //  cvtdq2ps      %xmm0,%xmm0
+    0xf3,0x44,0x0f,0x10,0x42,0x0c,              //  movss         0xc(%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x0f,0x5b,0xc9,                             //  cvtdq2ps      %xmm1,%xmm1
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x0f,0x5b,0xd2,                             //  cvtdq2ps      %xmm2,%xmm2
+    0x41,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm2
+    0x66,0x0f,0x72,0xd3,0x18,                   //  psrld         $0x18,%xmm3
+    0x0f,0x5b,0xdb,                             //  cvtdq2ps      %xmm3,%xmm3
+    0x41,0x0f,0x59,0xd8,                        //  mulps         %xmm8,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_store_8888[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xf3,0x44,0x0f,0x10,0x42,0x08,              //  movss         0x8(%rdx),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
+    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
+    0x45,0x0f,0x28,0xd0,                        //  movaps        %xmm8,%xmm10
+    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
+    0x66,0x45,0x0f,0x5b,0xd2,                   //  cvtps2dq      %xmm10,%xmm10
+    0x66,0x41,0x0f,0x72,0xf2,0x08,              //  pslld         $0x8,%xmm10
+    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
+    0x45,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xca,                        //  mulps         %xmm2,%xmm9
+    0x66,0x45,0x0f,0x5b,0xc9,                   //  cvtps2dq      %xmm9,%xmm9
+    0x66,0x41,0x0f,0x72,0xf1,0x10,              //  pslld         $0x10,%xmm9
+    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
+    0x66,0x45,0x0f,0x5b,0xc0,                   //  cvtps2dq      %xmm8,%xmm8
+    0x66,0x41,0x0f,0x72,0xf0,0x18,              //  pslld         $0x18,%xmm8
+    0x66,0x45,0x0f,0xeb,0xc1,                   //  por           %xmm9,%xmm8
+    0x66,0x45,0x0f,0xeb,0xc2,                   //  por           %xmm10,%xmm8
+    0xf3,0x44,0x0f,0x7f,0x04,0xb8,              //  movdqu        %xmm8,(%rax,%rdi,4)
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_load_f16[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0xf3,0x0f,0x6f,0x04,0xf8,                   //  movdqu        (%rax,%rdi,8),%xmm0
+    0xf3,0x0f,0x6f,0x4c,0xf8,0x10,              //  movdqu        0x10(%rax,%rdi,8),%xmm1
+    0x66,0x0f,0x6f,0xd8,                        //  movdqa        %xmm0,%xmm3
+    0x66,0x0f,0x61,0xd9,                        //  punpcklwd     %xmm1,%xmm3
+    0x66,0x0f,0x69,0xc1,                        //  punpckhwd     %xmm1,%xmm0
+    0x66,0x0f,0x6f,0xcb,                        //  movdqa        %xmm3,%xmm1
+    0x66,0x0f,0x61,0xc8,                        //  punpcklwd     %xmm0,%xmm1
+    0x66,0x0f,0x69,0xd8,                        //  punpckhwd     %xmm0,%xmm3
+    0x66,0x45,0x0f,0xef,0xc0,                   //  pxor          %xmm8,%xmm8
+    0x66,0x0f,0x6f,0xc1,                        //  movdqa        %xmm1,%xmm0
+    0x66,0x41,0x0f,0x61,0xc0,                   //  punpcklwd     %xmm8,%xmm0
+    0x66,0x0f,0x72,0xf0,0x0d,                   //  pslld         $0xd,%xmm0
+    0x66,0x0f,0x6e,0x52,0x5c,                   //  movd          0x5c(%rdx),%xmm2
+    0x66,0x44,0x0f,0x70,0xca,0x00,              //  pshufd        $0x0,%xmm2,%xmm9
+    0x41,0x0f,0x59,0xc1,                        //  mulps         %xmm9,%xmm0
+    0x66,0x41,0x0f,0x69,0xc8,                   //  punpckhwd     %xmm8,%xmm1
+    0x66,0x0f,0x72,0xf1,0x0d,                   //  pslld         $0xd,%xmm1
+    0x41,0x0f,0x59,0xc9,                        //  mulps         %xmm9,%xmm1
+    0x66,0x0f,0x6f,0xd3,                        //  movdqa        %xmm3,%xmm2
+    0x66,0x41,0x0f,0x61,0xd0,                   //  punpcklwd     %xmm8,%xmm2
+    0x66,0x0f,0x72,0xf2,0x0d,                   //  pslld         $0xd,%xmm2
+    0x41,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm2
+    0x66,0x41,0x0f,0x69,0xd8,                   //  punpckhwd     %xmm8,%xmm3
+    0x66,0x0f,0x72,0xf3,0x0d,                   //  pslld         $0xd,%xmm3
+    0x41,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_store_f16[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x48,0x8b,0x00,                             //  mov           (%rax),%rax
+    0x66,0x44,0x0f,0x6e,0x42,0x60,              //  movd          0x60(%rdx),%xmm8
+    0x66,0x45,0x0f,0x70,0xc0,0x00,              //  pshufd        $0x0,%xmm8,%xmm8
+    0x66,0x45,0x0f,0x6f,0xc8,                   //  movdqa        %xmm8,%xmm9
+    0x44,0x0f,0x59,0xc8,                        //  mulps         %xmm0,%xmm9
+    0x66,0x41,0x0f,0x72,0xd1,0x0d,              //  psrld         $0xd,%xmm9
+    0x66,0x45,0x0f,0x6f,0xd0,                   //  movdqa        %xmm8,%xmm10
+    0x44,0x0f,0x59,0xd1,                        //  mulps         %xmm1,%xmm10
+    0x66,0x41,0x0f,0x72,0xd2,0x0d,              //  psrld         $0xd,%xmm10
+    0x66,0x45,0x0f,0x6f,0xd8,                   //  movdqa        %xmm8,%xmm11
+    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
+    0x66,0x41,0x0f,0x72,0xd3,0x0d,              //  psrld         $0xd,%xmm11
+    0x44,0x0f,0x59,0xc3,                        //  mulps         %xmm3,%xmm8
+    0x66,0x41,0x0f,0x72,0xd0,0x0d,              //  psrld         $0xd,%xmm8
+    0x66,0x41,0x0f,0x73,0xfa,0x02,              //  pslldq        $0x2,%xmm10
+    0x66,0x45,0x0f,0xeb,0xd1,                   //  por           %xmm9,%xmm10
+    0x66,0x41,0x0f,0x73,0xf8,0x02,              //  pslldq        $0x2,%xmm8
+    0x66,0x45,0x0f,0xeb,0xc3,                   //  por           %xmm11,%xmm8
+    0x66,0x45,0x0f,0x6f,0xca,                   //  movdqa        %xmm10,%xmm9
+    0x66,0x45,0x0f,0x62,0xc8,                   //  punpckldq     %xmm8,%xmm9
+    0xf3,0x44,0x0f,0x7f,0x0c,0xf8,              //  movdqu        %xmm9,(%rax,%rdi,8)
+    0x66,0x45,0x0f,0x6a,0xd0,                   //  punpckhdq     %xmm8,%xmm10
+    0xf3,0x44,0x0f,0x7f,0x54,0xf8,0x10,         //  movdqu        %xmm10,0x10(%rax,%rdi,8)
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_clamp_x[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x00,                   //  movss         (%rax),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x66,0x45,0x0f,0x76,0xc9,                   //  pcmpeqd       %xmm9,%xmm9
+    0x66,0x45,0x0f,0xfe,0xc8,                   //  paddd         %xmm8,%xmm9
+    0x41,0x0f,0x5d,0xc1,                        //  minps         %xmm9,%xmm0
+    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
+    0x44,0x0f,0x5f,0xc0,                        //  maxps         %xmm0,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc0,                        //  movaps        %xmm8,%xmm0
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_clamp_y[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x44,0x0f,0x10,0x00,                   //  movss         (%rax),%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x66,0x45,0x0f,0x76,0xc9,                   //  pcmpeqd       %xmm9,%xmm9
+    0x66,0x45,0x0f,0xfe,0xc8,                   //  paddd         %xmm8,%xmm9
+    0x41,0x0f,0x5d,0xc9,                        //  minps         %xmm9,%xmm1
+    0x45,0x0f,0x57,0xc0,                        //  xorps         %xmm8,%xmm8
+    0x44,0x0f,0x5f,0xc1,                        //  maxps         %xmm1,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc8,                        //  movaps        %xmm8,%xmm1
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_matrix_2x3[] = {
+    0x44,0x0f,0x28,0xc9,                        //  movaps        %xmm1,%xmm9
+    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x0f,0x10,0x00,                        //  movss         (%rax),%xmm0
+    0xf3,0x0f,0x10,0x48,0x04,                   //  movss         0x4(%rax),%xmm1
+    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
+    0xf3,0x44,0x0f,0x10,0x50,0x08,              //  movss         0x8(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x10,              //  movss         0x10(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x41,0x0f,0x58,0xc2,                        //  addps         %xmm10,%xmm0
+    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
+    0xf3,0x44,0x0f,0x10,0x50,0x0c,              //  movss         0xc(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x14,              //  movss         0x14(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x41,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm1
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_matrix_3x4[] = {
+    0x44,0x0f,0x28,0xc9,                        //  movaps        %xmm1,%xmm9
+    0x44,0x0f,0x28,0xc0,                        //  movaps        %xmm0,%xmm8
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0xf3,0x0f,0x10,0x00,                        //  movss         (%rax),%xmm0
+    0xf3,0x0f,0x10,0x48,0x04,                   //  movss         0x4(%rax),%xmm1
+    0x0f,0xc6,0xc0,0x00,                        //  shufps        $0x0,%xmm0,%xmm0
+    0xf3,0x44,0x0f,0x10,0x50,0x0c,              //  movss         0xc(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x18,              //  movss         0x18(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0xf3,0x44,0x0f,0x10,0x60,0x24,              //  movss         0x24(%rax),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
+    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc0,                        //  mulps         %xmm8,%xmm0
+    0x41,0x0f,0x58,0xc2,                        //  addps         %xmm10,%xmm0
+    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
+    0xf3,0x44,0x0f,0x10,0x50,0x10,              //  movss         0x10(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x1c,              //  movss         0x1c(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0xf3,0x44,0x0f,0x10,0x60,0x28,              //  movss         0x28(%rax),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0x44,0x0f,0x59,0xda,                        //  mulps         %xmm2,%xmm11
+    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
+    0x45,0x0f,0x59,0xd1,                        //  mulps         %xmm9,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x41,0x0f,0x59,0xc8,                        //  mulps         %xmm8,%xmm1
+    0x41,0x0f,0x58,0xca,                        //  addps         %xmm10,%xmm1
+    0xf3,0x44,0x0f,0x10,0x50,0x08,              //  movss         0x8(%rax),%xmm10
+    0x45,0x0f,0xc6,0xd2,0x00,                   //  shufps        $0x0,%xmm10,%xmm10
+    0xf3,0x44,0x0f,0x10,0x58,0x14,              //  movss         0x14(%rax),%xmm11
+    0x45,0x0f,0xc6,0xdb,0x00,                   //  shufps        $0x0,%xmm11,%xmm11
+    0xf3,0x44,0x0f,0x10,0x60,0x20,              //  movss         0x20(%rax),%xmm12
+    0x45,0x0f,0xc6,0xe4,0x00,                   //  shufps        $0x0,%xmm12,%xmm12
+    0xf3,0x44,0x0f,0x10,0x68,0x2c,              //  movss         0x2c(%rax),%xmm13
+    0x45,0x0f,0xc6,0xed,0x00,                   //  shufps        $0x0,%xmm13,%xmm13
+    0x44,0x0f,0x59,0xe2,                        //  mulps         %xmm2,%xmm12
+    0x45,0x0f,0x58,0xe5,                        //  addps         %xmm13,%xmm12
+    0x45,0x0f,0x59,0xd9,                        //  mulps         %xmm9,%xmm11
+    0x45,0x0f,0x58,0xdc,                        //  addps         %xmm12,%xmm11
+    0x45,0x0f,0x59,0xd0,                        //  mulps         %xmm8,%xmm10
+    0x45,0x0f,0x58,0xd3,                        //  addps         %xmm11,%xmm10
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xd2,                        //  movaps        %xmm10,%xmm2
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+static const unsigned char sse2_sk_linear_gradient_2stops[] = {
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x44,0x0f,0x10,0x08,                        //  movups        (%rax),%xmm9
+    0x0f,0x10,0x58,0x10,                        //  movups        0x10(%rax),%xmm3
+    0x44,0x0f,0x28,0xc3,                        //  movaps        %xmm3,%xmm8
+    0x45,0x0f,0xc6,0xc0,0x00,                   //  shufps        $0x0,%xmm8,%xmm8
+    0x41,0x0f,0x28,0xc9,                        //  movaps        %xmm9,%xmm1
+    0x0f,0xc6,0xc9,0x00,                        //  shufps        $0x0,%xmm1,%xmm1
+    0x44,0x0f,0x59,0xc0,                        //  mulps         %xmm0,%xmm8
+    0x44,0x0f,0x58,0xc1,                        //  addps         %xmm1,%xmm8
+    0x0f,0x28,0xcb,                             //  movaps        %xmm3,%xmm1
+    0x0f,0xc6,0xc9,0x55,                        //  shufps        $0x55,%xmm1,%xmm1
+    0x41,0x0f,0x28,0xd1,                        //  movaps        %xmm9,%xmm2
+    0x0f,0xc6,0xd2,0x55,                        //  shufps        $0x55,%xmm2,%xmm2
+    0x0f,0x59,0xc8,                             //  mulps         %xmm0,%xmm1
+    0x0f,0x58,0xca,                             //  addps         %xmm2,%xmm1
+    0x0f,0x28,0xd3,                             //  movaps        %xmm3,%xmm2
+    0x0f,0xc6,0xd2,0xaa,                        //  shufps        $0xaa,%xmm2,%xmm2
+    0x45,0x0f,0x28,0xd1,                        //  movaps        %xmm9,%xmm10
+    0x45,0x0f,0xc6,0xd2,0xaa,                   //  shufps        $0xaa,%xmm10,%xmm10
+    0x0f,0x59,0xd0,                             //  mulps         %xmm0,%xmm2
+    0x41,0x0f,0x58,0xd2,                        //  addps         %xmm10,%xmm2
+    0x0f,0xc6,0xdb,0xff,                        //  shufps        $0xff,%xmm3,%xmm3
+    0x45,0x0f,0xc6,0xc9,0xff,                   //  shufps        $0xff,%xmm9,%xmm9
+    0x0f,0x59,0xd8,                             //  mulps         %xmm0,%xmm3
+    0x41,0x0f,0x58,0xd9,                        //  addps         %xmm9,%xmm3
+    0x48,0xad,                                  //  lods          %ds:(%rsi),%rax
+    0x41,0x0f,0x28,0xc0,                        //  movaps        %xmm8,%xmm0
+    0xff,0xe0,                                  //  jmpq          *%rax
+};
+#endif//SkJumper_generated_DEFINED
diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp
new file mode 100644 (file)
index 0000000..6c106c3
--- /dev/null
@@ -0,0 +1,549 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkJumper.h"
+#include <string.h>
+
+// It's tricky to relocate code referencing ordinary constants, so we read them from this struct.
+using K = const SkJumper_constants;
+
+#if !defined(JUMPER)
+    // This path should lead to portable code that can be compiled directly into Skia.
+    // (All other paths are compiled offline by Clang into SkJumper_generated.h.)
+    #include <math.h>
+
+    using F   = float;
+    using I32 =  int32_t;
+    using U32 = uint32_t;
+    using U8  = uint8_t;
+
+    static F   fma(F f, F m, F a)  { return f*m+a; }
+    static F   min(F a, F b)       { return fminf(a,b); }
+    static F   max(F a, F b)       { return fmaxf(a,b); }
+    static F   rcp  (F v)          { return 1.0f / v; }
+    static F   rsqrt(F v)          { return 1.0f / sqrtf(v); }
+    static U32 round(F v, F scale) { return (uint32_t)(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) { return c ? t : e; }
+
+    static F gather(const float* p, U32 ix) { return p[ix]; }
+
+#elif defined(__aarch64__)
+    #include <arm_neon.h>
+
+    // Since we know we're using Clang, we can use its vector extensions.
+    using F   = float    __attribute__((ext_vector_type(4)));
+    using I32 =  int32_t __attribute__((ext_vector_type(4)));
+    using U32 = uint32_t __attribute__((ext_vector_type(4)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(4)));
+
+    // We polyfill a few routines that Clang doesn't build into ext_vector_types.
+    static F   fma(F f, F m, F a)                   { return vfmaq_f32(a,f,m);        }
+    static F   min(F a, F b)                        { return vminq_f32(a,b);          }
+    static F   max(F a, F b)                        { return vmaxq_f32(a,b);          }
+    static F   rcp  (F v) { auto e = vrecpeq_f32 (v); return vrecpsq_f32 (v,e  ) * e; }
+    static F   rsqrt(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
+    static U32 round(F v, F scale)                  { return vcvtnq_u32_f32(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
+
+    static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+
+#elif defined(__ARM_NEON__)
+    #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
+        #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
+    #endif
+    #include <arm_neon.h>
+
+    // We can pass {s0-s15} as arguments under AAPCS-VFP.  We'll slice that as 8 d-registers.
+    using F   = float    __attribute__((ext_vector_type(2)));
+    using I32 =  int32_t __attribute__((ext_vector_type(2)));
+    using U32 = uint32_t __attribute__((ext_vector_type(2)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(2)));
+
+    static F   fma(F f, F m, F a)                  { return vfma_f32(a,f,m);        }
+    static F   min(F a, F b)                       { return vmin_f32(a,b);          }
+    static F   max(F a, F b)                       { return vmax_f32(a,b);          }
+    static F   rcp  (F v) { auto e = vrecpe_f32 (v); return vrecps_f32 (v,e  ) * e; }
+    static F   rsqrt(F v) { auto e = vrsqrte_f32(v); return vrsqrts_f32(v,e*e) * e; }
+    static U32 round(F v, F scale)                 { return vcvt_u32_f32(fma(v,scale,0.5f)); }
+
+    static F if_then_else(I32 c, F t, F e) { return vbsl_f32((U32)c,t,e); }
+
+    static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
+
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
+    #include <immintrin.h>
+
+    // These are __m256 and __m256i, but friendlier and strongly-typed.
+    using F   = float    __attribute__((ext_vector_type(8)));
+    using I32 =  int32_t __attribute__((ext_vector_type(8)));
+    using U32 = uint32_t __attribute__((ext_vector_type(8)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
+
+    static F   fma(F f, F m, F a)  { return _mm256_fmadd_ps(f,m,a);}
+    static F   min(F a, F b)       { return _mm256_min_ps(a,b);    }
+    static F   max(F a, F b)       { return _mm256_max_ps(a,b);    }
+    static F   rcp  (F v)          { return _mm256_rcp_ps  (v);    }
+    static F   rsqrt(F v)          { return _mm256_rsqrt_ps(v);    }
+    static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+
+    static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
+
+#elif defined(__SSE2__)
+    #include <immintrin.h>
+
+    using F   = float    __attribute__((ext_vector_type(4)));
+    using I32 =  int32_t __attribute__((ext_vector_type(4)));
+    using U32 = uint32_t __attribute__((ext_vector_type(4)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(4)));
+
+    static F   fma(F f, F m, F a)  { return f*m+a;           }
+    static F   min(F a, F b)       { return _mm_min_ps(a,b); }
+    static F   max(F a, F b)       { return _mm_max_ps(a,b); }
+    static F   rcp  (F v)          { return _mm_rcp_ps  (v); }
+    static F   rsqrt(F v)          { return _mm_rsqrt_ps(v); }
+    static U32 round(F v, F scale) { return _mm_cvtps_epi32(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) {
+    #if defined(__SSE4_1__)
+        return _mm_blendv_ps(e,t,c);
+    #else
+        return _mm_or_ps(_mm_and_ps(c, t), _mm_andnot_ps(c, e));
+    #endif
+    }
+
+    static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
+#endif
+
+// We need to be a careful with casts.
+// (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
+// These named casts and bit_cast() are always what they seem to be.
+#if defined(JUMPER)
+    static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F);   }
+    static U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
+#else
+    static F   cast  (U32 v) { return (F)v; }
+    static U32 expand(U8  v) { return (U32)v; }
+#endif
+
+template <typename T, typename P>
+static T unaligned_load(const P* p) {
+    T v;
+    memcpy(&v, p, sizeof(v));
+    return v;
+}
+
+template <typename Dst, typename Src>
+static Dst bit_cast(const Src& src) {
+    static_assert(sizeof(Dst) == sizeof(Src), "");
+    return unaligned_load<Dst>(&src);
+}
+
+// Sometimes we want to work with 4 floats directly, regardless of the depth of the F vector.
+#if defined(JUMPER)
+    using F4 = float __attribute__((ext_vector_type(4)));
+#else
+    struct F4 {
+        float vals[4];
+        float operator[](int i) const { return vals[i]; }
+    };
+#endif
+
+// Stages tail call between each other by following program,
+// an interlaced sequence of Stage pointers and context pointers.
+using Stage = void(size_t x, void** program, K* k, F,F,F,F, F,F,F,F);
+
+static void* load_and_inc(void**& program) {
+#if defined(__GNUC__) && defined(__x86_64__)
+    // Passing program as the second Stage argument makes it likely that it's in %rsi,
+    // so this is usually a single instruction *program++.
+    void* rax;
+    asm("lodsq" : "=a"(rax), "+S"(program));  // Write-only %rax, read-write %rsi.
+    return rax;
+    // When a Stage uses its ctx pointer, this optimization typically cuts an instruction:
+    //    mov    (%rsi), %rcx     // ctx  = program[0]
+    //    ...
+    //    mov 0x8(%rsi), %rax     // next = program[1]
+    //    add $0x10, %rsi         // program += 2
+    //    jmpq *%rax              // JUMP!
+    // becomes
+    //    lods   %ds:(%rsi),%rax  // ctx  = *program++;
+    //    ...
+    //    lods   %ds:(%rsi),%rax  // next = *program++;
+    //    jmpq *%rax              // JUMP!
+    //
+    // When a Stage doesn't use its ctx pointer, it's 3 instructions either way,
+    // but using lodsq (a 2-byte instruction) tends to trim a few bytes.
+#else
+    // On ARM *program++ compiles into a single instruction without any handholding.
+    return *program++;
+#endif
+}
+
+#define STAGE(name)                                                           \
+    static void name##_k(size_t& x, void* ctx, K* k,                          \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+    extern "C" void sk_##name(size_t x, void** program, K* k,                 \
+                              F r, F g, F b, F a, F dr, F dg, F db, F da) {   \
+        auto ctx = load_and_inc(program);                                     \
+        name##_k(x,ctx,k, r,g,b,a, dr,dg,db,da);                              \
+        auto next = (Stage*)load_and_inc(program);                            \
+        next(x,program,k, r,g,b,a, dr,dg,db,da);                              \
+    }                                                                         \
+    static void name##_k(size_t& x, void* ctx, K* k,                          \
+                         F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
+
+// A glue Stage to end the tail call chain, finally returning to the caller.
+extern "C" void sk_just_return(size_t, void**, K*, F,F,F,F, F,F,F,F) {
+#if defined(JUMPER) && defined(__AVX2__)
+    _mm256_zeroupper();
+#endif
+}
+
+// We can now define Stages!
+
+// Some things to keep in mind while writing Stages:
+//   - do not branch;                                           (i.e. avoid jmp)
+//   - do not call functions that don't inline;                 (i.e. avoid call, ret)
+//   - do not use constant literals other than 0, ~0 and 0.0f.  (i.e. avoid rip relative addressing)
+//
+// Some things that should work fine:
+//   - 0, ~0, and 0.0f;
+//   - arithmetic;
+//   - functions of F and U32 that we've defined above;
+//   - temporary values;
+//   - lambdas;
+//   - memcpy() with a compile-time constant size argument.
+
+STAGE(seed_shader) {
+    auto y = *(const int*)ctx;
+
+    // It's important for speed to explicitly cast(x) and cast(y),
+    // which has the effect of splatting them to vectors before converting to floats.
+    // On Intel this breaks a data dependency on previous loop iterations' registers.
+
+    r = cast(x) + k->_0_5 + unaligned_load<F>(k->iota);
+    g = cast(y) + k->_0_5;
+    b = k->_1;
+    a = 0;
+    dr = dg = db = da = 0;
+}
+
+STAGE(constant_color) {
+    auto rgba = unaligned_load<F4>(ctx);
+    r = rgba[0];
+    g = rgba[1];
+    b = rgba[2];
+    a = rgba[3];
+}
+
+STAGE(clear) {
+    r = g = b = a = 0;
+}
+
+STAGE(plus_) {
+    r = r + dr;
+    g = g + dg;
+    b = b + db;
+    a = a + da;
+}
+
+STAGE(srcover) {
+    auto A = k->_1 - a;
+    r = fma(dr, A, r);
+    g = fma(dg, A, g);
+    b = fma(db, A, b);
+    a = fma(da, A, a);
+}
+STAGE(dstover) {
+    auto DA = k->_1 - da;
+    r = fma(r, DA, dr);
+    g = fma(g, DA, dg);
+    b = fma(b, DA, db);
+    a = fma(a, DA, da);
+}
+
+STAGE(clamp_0) {
+    r = max(r, 0);
+    g = max(g, 0);
+    b = max(b, 0);
+    a = max(a, 0);
+}
+
+STAGE(clamp_1) {
+    r = min(r, k->_1);
+    g = min(g, k->_1);
+    b = min(b, k->_1);
+    a = min(a, k->_1);
+}
+
+STAGE(clamp_a) {
+    a = min(a, k->_1);
+    r = min(r, a);
+    g = min(g, a);
+    b = min(b, a);
+}
+
+STAGE(swap) {
+    auto swap = [](F& v, F& dv) {
+        auto tmp = v;
+        v = dv;
+        dv = tmp;
+    };
+    swap(r, dr);
+    swap(g, dg);
+    swap(b, db);
+    swap(a, da);
+}
+STAGE(move_src_dst) {
+    dr = r;
+    dg = g;
+    db = b;
+    da = a;
+}
+STAGE(move_dst_src) {
+    r = dr;
+    g = dg;
+    b = db;
+    a = da;
+}
+
+STAGE(premul) {
+    r = r * a;
+    g = g * a;
+    b = b * a;
+}
+STAGE(unpremul) {
+    auto scale = if_then_else(a == 0, 0, k->_1 / a);
+    r = r * scale;
+    g = g * scale;
+    b = b * scale;
+}
+
+STAGE(from_srgb) {
+    auto fn = [&](F s) {
+        auto lo = s * k->_1_1292;
+        auto hi = fma(s*s, fma(s, k->_03000, k->_06975), k->_00025);
+        return if_then_else(s < k->_0055, lo, hi);
+    };
+    r = fn(r);
+    g = fn(g);
+    b = fn(b);
+}
+STAGE(to_srgb) {
+    auto fn = [&](F l) {
+        F sqrt = rcp  (rsqrt(l)),
+          ftrt = rsqrt(rsqrt(l));
+        auto lo = l * k->_1246;
+        auto hi = min(k->_1, fma(k->_0411192, ftrt,
+                             fma(k->_0689206, sqrt,
+                                 k->n_00988)));
+        return if_then_else(l < k->_00043, lo, hi);
+    };
+    r = fn(r);
+    g = fn(g);
+    b = fn(b);
+}
+
+STAGE(scale_u8) {
+    auto ptr = *(const uint8_t**)ctx + x;
+
+    auto scales = unaligned_load<U8>(ptr);
+    auto c = cast(expand(scales)) * k->_1_255;
+
+    r = r * c;
+    g = g * c;
+    b = b * c;
+    a = a * c;
+}
+
+STAGE(load_tables) {
+    struct Ctx {
+        const uint32_t* src;
+        const float *r, *g, *b;
+    };
+    auto c = (const Ctx*)ctx;
+
+    auto px = unaligned_load<U32>(c->src + x);
+    r = gather(c->r, (px      ) & k->_0x000000ff);
+    g = gather(c->g, (px >>  8) & k->_0x000000ff);
+    b = gather(c->b, (px >> 16) & k->_0x000000ff);
+    a = cast(        (px >> 24)) * k->_1_255;
+}
+
+STAGE(load_8888) {
+    auto ptr = *(const uint32_t**)ctx + x;
+
+    auto px = unaligned_load<U32>(ptr);
+    r = cast((px      ) & k->_0x000000ff) * k->_1_255;
+    g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
+    b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
+    a = cast((px >> 24)                 ) * k->_1_255;
+}
+
+STAGE(store_8888) {
+    auto ptr = *(uint32_t**)ctx + x;
+
+    U32 px = round(r, k->_255)
+           | round(g, k->_255) <<  8
+           | round(b, k->_255) << 16
+           | round(a, k->_255) << 24;
+    memcpy(ptr, &px, sizeof(px));
+}
+
+STAGE(load_f16) {
+    auto ptr = *(const uint64_t**)ctx + x;
+
+#if !defined(JUMPER)
+    // TODO:
+    (void)ptr;
+#elif defined(__aarch64__)
+    auto halfs = vld4_f16((const float16_t*)ptr);
+    r = vcvt_f32_f16(halfs.val[0]);
+    g = vcvt_f32_f16(halfs.val[1]);
+    b = vcvt_f32_f16(halfs.val[2]);
+    a = vcvt_f32_f16(halfs.val[3]);
+#elif defined(__ARM_NEON__)
+    auto rb_ga = vld2_f16((const float16_t*)ptr);
+    auto rb = vcvt_f32_f16(rb_ga.val[0]),
+         ga = vcvt_f32_f16(rb_ga.val[1]);
+    r = {rb[0], rb[2]};
+    g = {ga[0], ga[2]};
+    b = {rb[1], rb[3]};
+    a = {ga[1], ga[3]};
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
+    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
+         _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
+         _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+
+    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
+         _13 = _mm_unpackhi_epi16(_01, _23),  // r1 r3 g1 g3 b1 b3 a1 a3
+         _46 = _mm_unpacklo_epi16(_45, _67),
+         _57 = _mm_unpackhi_epi16(_45, _67);
+
+    auto rg0123 = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
+         ba0123 = _mm_unpackhi_epi16(_02, _13),  // b0 b1 b2 b3 a0 a1 a2 a3
+         rg4567 = _mm_unpacklo_epi16(_46, _57),
+         ba4567 = _mm_unpackhi_epi16(_46, _57);
+
+    r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567));
+    g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
+    b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
+    a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#elif defined(__SSE2__)
+    auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+         _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
+
+    auto _02 = _mm_unpacklo_epi16(_01, _23),  // r0 r2 g0 g2 b0 b2 a0 a2
+         _13 = _mm_unpackhi_epi16(_01, _23);  // r1 r3 g1 g3 b1 b3 a1 a3
+
+    auto rg = _mm_unpacklo_epi16(_02, _13),  // r0 r1 r2 r3 g0 g1 g2 g3
+         ba = _mm_unpackhi_epi16(_02, _13);  // b0 b1 b2 b3 a0 a1 a2 a3
+
+    auto half_to_float = [&](U32 h) {
+        return bit_cast<F>(h << 13)               // Line up the mantissa,
+             * bit_cast<F>(U32(k->_0x77800000));  // then fix up the exponent.
+    };
+
+    r = half_to_float(_mm_unpacklo_epi16(rg, _mm_setzero_si128()));
+    g = half_to_float(_mm_unpackhi_epi16(rg, _mm_setzero_si128()));
+    b = half_to_float(_mm_unpacklo_epi16(ba, _mm_setzero_si128()));
+    a = half_to_float(_mm_unpackhi_epi16(ba, _mm_setzero_si128()));
+#endif
+}
+
+STAGE(store_f16) {
+    auto ptr = *(uint64_t**)ctx + x;
+
+#if !defined(JUMPER)
+    // TODO:
+    (void)ptr;
+#elif defined(__aarch64__)
+    float16x4x4_t halfs = {{
+        vcvt_f16_f32(r),
+        vcvt_f16_f32(g),
+        vcvt_f16_f32(b),
+        vcvt_f16_f32(a),
+    }};
+    vst4_f16((float16_t*)ptr, halfs);
+#elif defined(__ARM_NEON__)
+    float16x4x2_t rb_ga = {{
+        vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
+        vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
+    }};
+    vst2_f16((float16_t*)ptr, rb_ga);
+#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
+    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
+         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
+         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
+         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
+
+    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
+         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
+         ba0123 = _mm_unpacklo_epi16(B, A),
+         ba4567 = _mm_unpackhi_epi16(B, A);
+
+    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
+    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
+    _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
+    _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#elif defined(__SSE2__)
+    auto float_to_half = [&](F f) {
+        return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
+            >> 13;                                                  // then line up the mantissa.
+    };
+    U32 R = float_to_half(r),
+        G = float_to_half(g),
+        B = float_to_half(b),
+        A = float_to_half(a);
+    U32 rg = R | _mm_slli_si128(G,2),
+        ba = B | _mm_slli_si128(A,2);
+    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
+    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
+#endif
+}
+
+static F clamp(const F& v, float limit) {
+    F l = bit_cast<F>(bit_cast<U32>(F(limit)) + U32(0xffffffff));  // limit - 1 ulp
+    return max(0, min(v, l));
+}
+STAGE(clamp_x) { r = clamp(r, *(const float*)ctx); }
+STAGE(clamp_y) { g = clamp(g, *(const float*)ctx); }
+
+STAGE(matrix_2x3) {
+    auto m = (const float*)ctx;
+
+    auto R = fma(r,m[0], fma(g,m[2], m[4])),
+         G = fma(r,m[1], fma(g,m[3], m[5]));
+    r = R;
+    g = G;
+}
+STAGE(matrix_3x4) {
+    auto m = (const float*)ctx;
+
+    auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))),
+         G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))),
+         B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11])));
+    r = R;
+    g = G;
+    b = B;
+}
+
+STAGE(linear_gradient_2stops) {
+    struct Ctx { F4 c0, dc; };
+    auto c = unaligned_load<Ctx>(ctx);
+
+    auto t = r;
+    r = fma(t, c.dc[0], c.c0[0]);
+    g = fma(t, c.dc[1], c.c0[1]);
+    b = fma(t, c.dc[2], c.c0[2]);
+    a = fma(t, c.dc[3], c.c0[3]);
+}
diff --git a/src/jumper/build_stages.py b/src/jumper/build_stages.py
new file mode 100755 (executable)
index 0000000..22d8e37
--- /dev/null
@@ -0,0 +1,118 @@
+#!/usr/bin/env python2.7
+#
+# Copyright 2017 Google Inc.
+#
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import re
+import subprocess
+import sys
+
+sys.stdout = open('src/jumper/SkJumper_generated.h', 'w')
+
+ndk = '/Users/mtklein/brew/opt/android-ndk/'
+objdump = 'gobjdump'
+
+#ndk = '/home/mtklein/ndk/'
+#objdump = '/home/mtklein/binutils-2.27/binutils/objdump'
+
+cflags = '-std=c++11 -Os -fomit-frame-pointer -DJUMPER'.split()
+
+sse2 = '-mno-red-zone -msse2 -mno-sse3 -mno-ssse3 -mno-sse4.1'.split()
+subprocess.check_call(['clang++'] + cflags + sse2 +
+                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+                      ['-o', 'sse2.o'])
+
+sse41 = '-mno-red-zone -msse4.1'.split()
+subprocess.check_call(['clang++'] + cflags + sse41 +
+                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+                      ['-o', 'sse41.o'])
+
+hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split()
+subprocess.check_call(['clang++'] + cflags + hsw +
+                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+                      ['-o', 'hsw.o'])
+
+aarch64 = [
+    '--target=aarch64-linux-android',
+    '--sysroot=' + ndk + 'platforms/android-21/arch-arm64',
+]
+subprocess.check_call(['clang++'] + cflags + aarch64 +
+                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+                      ['-o', 'aarch64.o'])
+
+armv7 = [
+    '--target=armv7a-linux-android',
+    '--sysroot=' + ndk + 'platforms/android-18/arch-arm',
+    '-mfpu=neon-vfpv4',
+    '-mfloat-abi=hard',
+]
+subprocess.check_call(['clang++'] + cflags + armv7 +
+                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+                      ['-o', 'armv7.o'])
+
+def parse_object_file(dot_o, array_type, target=None):
+  prefix = dot_o.replace('.o', '_')
+  cmd = [ objdump, '-d', '--insn-width=8', dot_o]
+  if target:
+    cmd += ['--target', target]
+
+  active = False
+  for line in subprocess.check_output(cmd).split('\n'):
+    line = line.strip()
+
+    if line.startswith(dot_o) or line.startswith('Disassembly'):
+      continue
+
+    if not line:
+      if active:
+        print '};'
+        active = False
+      continue
+
+    # E.g. 00000000000003a4 <_load_f16>:
+    m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
+    if m:
+      print 'static const', array_type, prefix + m.group(1) + '[] = {'
+      active = True
+      continue
+
+    columns = line.split('\t')
+    code = columns[1]
+    if len(columns) >= 4:
+      inst = columns[2]
+      args = columns[3]
+    else:
+      inst, args = columns[2], ''
+      if ' ' in columns[2]:
+        inst, args = columns[2].split(' ', 1)
+    code, inst, args = code.strip(), inst.strip(), args.strip()
+
+    # We can't work with code that uses ip-relative addressing.
+    for arg in args:
+      assert 'rip' not in arg  # TODO: detect on aarch64 too
+
+    hexed = ''.join('0x'+x+',' for x in code.split(' '))
+    print '    ' + hexed + ' '*(44-len(hexed)) + \
+          '//  ' + inst  + (' '*(14-len(inst)) + args if args else '')
+
+print '''/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkJumper_generated_DEFINED
+#define SkJumper_generated_DEFINED
+
+// This file is generated semi-automatically with this command:
+//   $ src/jumper/build_stages.py
+'''
+parse_object_file('aarch64.o', 'unsigned int')
+parse_object_file('armv7.o',   'unsigned int', target='elf32-littlearm')
+parse_object_file('hsw.o',     'unsigned char')
+parse_object_file('sse41.o',   'unsigned char')
+parse_object_file('sse2.o',    'unsigned char')
+print '#endif//SkJumper_generated_DEFINED'