Back to code as data arrays, this time in .text.
author: Mike Klein <mtklein@chromium.org>
Tue, 7 Mar 2017 12:59:52 +0000 (07:59 -0500)
committer: Skia Commit-Bot <skia-commit-bot@chromium.org>
Tue, 7 Mar 2017 14:55:32 +0000 (14:55 +0000)
This technique lets us generate a single source file, use the C++
preprocessor, and avoid the pain of working with assemblers.

By using the section attribute or declspec allocate, we can put these
data arrays into the .text section, making them ordinary code.

This is like the previous solution, except it should actually run.

CQ_INCLUDE_TRYBOTS=skia.primary:Test-Win2k8-MSVC-GCE-CPU-AVX2-x86_64-Debug,Test-Mac-Clang-MacMini6.2-CPU-AVX-x86_64-Debug,Test-Ubuntu-Clang-GCE-CPU-AVX2-x86_64-Debug,Test-Ubuntu-GCC-GCE-CPU-AVX2-x86_64-Debug

Change-Id: Ide7675f6cf32eb4831ff02906acbdc3faaeaa684
Reviewed-on: https://skia-review.googlesource.com/9336
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>

BUILD.gn
src/jumper/SkJumper.cpp
src/jumper/SkJumper_generated.S [deleted file]
src/jumper/SkJumper_generated.cpp [new file with mode: 0644]
src/jumper/SkJumper_generated_win.S [deleted file]
src/jumper/build_stages.py

index dcc579d..a4e4a2e 100644 (file)
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -505,13 +505,9 @@ optional("jumper") {
   public_defines = [ "SK_JUMPER" ]
   sources = [
     "src/jumper/SkJumper.cpp",
+    "src/jumper/SkJumper_generated.cpp",
     "src/jumper/SkJumper_stages.cpp",
   ]
-  if (is_win && target_cpu == "x64") {
-    sources += [ "src/jumper/SkJumper_generated_win.S" ]
-  } else if (!is_win) {
-    sources += [ "src/jumper/SkJumper_generated.S" ]
-  }
 }
 
 optional("typeface_freetype") {
index 71c8644..ce55b87 100644 (file)
@@ -95,12 +95,8 @@ static K kConstants = {
 // We'll only ever call start_pipeline(), which then chains into the rest for us.
 using StageFn = void(void);
 
-// Some platforms expect C "name" maps to asm "_name", others to "name".
-#if defined(__APPLE__)
-    #define ASM(name, suffix) sk_##name##_##suffix
-#else
-    #define ASM(name, suffix) _sk_##name##_##suffix
-#endif
+// TODO: maybe don't need this wrapper anymore.
+#define ASM(name, suffix) sk_##name##_##suffix
 
 extern "C" {
 
diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S
deleted file mode 100644 (file)
index 34aeab4..0000000
+++ /dev/null
@@ -1,6618 +0,0 @@
-# Copyright 2017 Google Inc.
-#
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-# This file is generated semi-automatically with this command:
-#   $ src/jumper/build_stages.py
-
-.text
-#if defined(__aarch64__)
-.balign 4
-
-.globl _sk_start_pipeline_aarch64
-_sk_start_pipeline_aarch64:
-  .long  0xa9bd5bf7                          // stp           x23, x22, [sp,#-48]!
-  .long  0xa90153f5                          // stp           x21, x20, [sp,#16]
-  .long  0xa9027bf3                          // stp           x19, x30, [sp,#32]
-  .long  0xaa0103f5                          // mov           x21, x1
-  .long  0xf84086b7                          // ldr           x23, [x21],#8
-  .long  0xaa0003f6                          // mov           x22, x0
-  .long  0xaa0303f3                          // mov           x19, x3
-  .long  0xaa0203f4                          // mov           x20, x2
-  .long  0x910012c8                          // add           x8, x22, #0x4
-  .long  0xeb13011f                          // cmp           x8, x19
-  .long  0x54000069                          // b.ls          34 <sk_start_pipeline_aarch64+0x34>
-  .long  0xaa1603e0                          // mov           x0, x22
-  .long  0x14000012                          // b             78 <sk_start_pipeline_aarch64+0x78>
-  .long  0x6f00e400                          // movi          v0.2d, #0x0
-  .long  0x6f00e401                          // movi          v1.2d, #0x0
-  .long  0x6f00e402                          // movi          v2.2d, #0x0
-  .long  0x6f00e403                          // movi          v3.2d, #0x0
-  .long  0x6f00e404                          // movi          v4.2d, #0x0
-  .long  0x6f00e405                          // movi          v5.2d, #0x0
-  .long  0x6f00e406                          // movi          v6.2d, #0x0
-  .long  0x6f00e407                          // movi          v7.2d, #0x0
-  .long  0xaa1603e0                          // mov           x0, x22
-  .long  0xaa1503e1                          // mov           x1, x21
-  .long  0xaa1403e2                          // mov           x2, x20
-  .long  0xd63f02e0                          // blr           x23
-  .long  0x910022c8                          // add           x8, x22, #0x8
-  .long  0x910012c0                          // add           x0, x22, #0x4
-  .long  0xeb13011f                          // cmp           x8, x19
-  .long  0xaa0003f6                          // mov           x22, x0
-  .long  0x54fffe09                          // b.ls          34 <sk_start_pipeline_aarch64+0x34>
-  .long  0xa9427bf3                          // ldp           x19, x30, [sp,#32]
-  .long  0xa94153f5                          // ldp           x21, x20, [sp,#16]
-  .long  0xa8c35bf7                          // ldp           x23, x22, [sp],#48
-  .long  0xd65f03c0                          // ret
-
-.globl _sk_just_return_aarch64
-_sk_just_return_aarch64:
-  .long  0xd65f03c0                          // ret
-
-.globl _sk_seed_shader_aarch64
-_sk_seed_shader_aarch64:
-  .long  0xaa0203e9                          // mov           x9, x2
-  .long  0xa9400c28                          // ldp           x8, x3, [x1]
-  .long  0x4ddfc922                          // ld1r          {v2.4s}, [x9], #4
-  .long  0x3cc14047                          // ldur          q7, [x2,#20]
-  .long  0x4e040c00                          // dup           v0.4s, w0
-  .long  0x4d40c901                          // ld1r          {v1.4s}, [x8]
-  .long  0x4d40c926                          // ld1r          {v6.4s}, [x9]
-  .long  0x4e21d800                          // scvtf         v0.4s, v0.4s
-  .long  0x91004028                          // add           x8, x1, #0x10
-  .long  0x4e21d821                          // scvtf         v1.4s, v1.4s
-  .long  0x4e26d400                          // fadd          v0.4s, v0.4s, v6.4s
-  .long  0x6f00e403                          // movi          v3.2d, #0x0
-  .long  0x6f00e404                          // movi          v4.2d, #0x0
-  .long  0x6f00e405                          // movi          v5.2d, #0x0
-  .long  0x4e26d421                          // fadd          v1.4s, v1.4s, v6.4s
-  .long  0x6f00e406                          // movi          v6.2d, #0x0
-  .long  0x4e20d4e0                          // fadd          v0.4s, v7.4s, v0.4s
-  .long  0x6f00e407                          // movi          v7.2d, #0x0
-  .long  0xaa0803e1                          // mov           x1, x8
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_constant_color_aarch64
-_sk_constant_color_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0x3dc00103                          // ldr           q3, [x8]
-  .long  0x4e040460                          // dup           v0.4s, v3.s[0]
-  .long  0x4e0c0461                          // dup           v1.4s, v3.s[1]
-  .long  0x4e140462                          // dup           v2.4s, v3.s[2]
-  .long  0x4e1c0463                          // dup           v3.4s, v3.s[3]
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_clear_aarch64
-_sk_clear_aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x6f00e400                          // movi          v0.2d, #0x0
-  .long  0x6f00e401                          // movi          v1.2d, #0x0
-  .long  0x6f00e402                          // movi          v2.2d, #0x0
-  .long  0x6f00e403                          // movi          v3.2d, #0x0
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_plus__aarch64
-_sk_plus__aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4e24d400                          // fadd          v0.4s, v0.4s, v4.4s
-  .long  0x4e25d421                          // fadd          v1.4s, v1.4s, v5.4s
-  .long  0x4e26d442                          // fadd          v2.4s, v2.4s, v6.4s
-  .long  0x4e27d463                          // fadd          v3.4s, v3.4s, v7.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_srcover_aarch64
-_sk_srcover_aarch64:
-  .long  0x4d40c850                          // ld1r          {v16.4s}, [x2]
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4ea3d610                          // fsub          v16.4s, v16.4s, v3.4s
-  .long  0x4e24ce00                          // fmla          v0.4s, v16.4s, v4.4s
-  .long  0x4e25ce01                          // fmla          v1.4s, v16.4s, v5.4s
-  .long  0x4e26ce02                          // fmla          v2.4s, v16.4s, v6.4s
-  .long  0x4e27ce03                          // fmla          v3.4s, v16.4s, v7.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_dstover_aarch64
-_sk_dstover_aarch64:
-  .long  0x4d40c851                          // ld1r          {v17.4s}, [x2]
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4ea41c90                          // mov           v16.16b, v4.16b
-  .long  0x4ea61cd2                          // mov           v18.16b, v6.16b
-  .long  0x4ea7d634                          // fsub          v20.4s, v17.4s, v7.4s
-  .long  0x4ea51cb1                          // mov           v17.16b, v5.16b
-  .long  0x4ea71cf3                          // mov           v19.16b, v7.16b
-  .long  0x4e20ce90                          // fmla          v16.4s, v20.4s, v0.4s
-  .long  0x4e21ce91                          // fmla          v17.4s, v20.4s, v1.4s
-  .long  0x4e22ce92                          // fmla          v18.4s, v20.4s, v2.4s
-  .long  0x4e23ce93                          // fmla          v19.4s, v20.4s, v3.4s
-  .long  0x4eb01e00                          // mov           v0.16b, v16.16b
-  .long  0x4eb11e21                          // mov           v1.16b, v17.16b
-  .long  0x4eb21e42                          // mov           v2.16b, v18.16b
-  .long  0x4eb31e63                          // mov           v3.16b, v19.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_clamp_0_aarch64
-_sk_clamp_0_aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x6f00e410                          // movi          v16.2d, #0x0
-  .long  0x4e30f400                          // fmax          v0.4s, v0.4s, v16.4s
-  .long  0x4e30f421                          // fmax          v1.4s, v1.4s, v16.4s
-  .long  0x4e30f442                          // fmax          v2.4s, v2.4s, v16.4s
-  .long  0x4e30f463                          // fmax          v3.4s, v3.4s, v16.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_clamp_1_aarch64
-_sk_clamp_1_aarch64:
-  .long  0x4d40c850                          // ld1r          {v16.4s}, [x2]
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4eb0f400                          // fmin          v0.4s, v0.4s, v16.4s
-  .long  0x4eb0f421                          // fmin          v1.4s, v1.4s, v16.4s
-  .long  0x4eb0f442                          // fmin          v2.4s, v2.4s, v16.4s
-  .long  0x4eb0f463                          // fmin          v3.4s, v3.4s, v16.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_clamp_a_aarch64
-_sk_clamp_a_aarch64:
-  .long  0x4d40c850                          // ld1r          {v16.4s}, [x2]
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4eb0f463                          // fmin          v3.4s, v3.4s, v16.4s
-  .long  0x4ea3f400                          // fmin          v0.4s, v0.4s, v3.4s
-  .long  0x4ea3f421                          // fmin          v1.4s, v1.4s, v3.4s
-  .long  0x4ea3f442                          // fmin          v2.4s, v2.4s, v3.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_set_rgb_aarch64
-_sk_set_rgb_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xaa0803e9                          // mov           x9, x8
-  .long  0x4ddfc920                          // ld1r          {v0.4s}, [x9], #4
-  .long  0x91002108                          // add           x8, x8, #0x8
-  .long  0x4d40c902                          // ld1r          {v2.4s}, [x8]
-  .long  0x4d40c921                          // ld1r          {v1.4s}, [x9]
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_swap_rb_aarch64
-_sk_swap_rb_aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4ea01c10                          // mov           v16.16b, v0.16b
-  .long  0x4ea21c40                          // mov           v0.16b, v2.16b
-  .long  0x4eb01e02                          // mov           v2.16b, v16.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_swap_aarch64
-_sk_swap_aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4ea31c70                          // mov           v16.16b, v3.16b
-  .long  0x4ea21c51                          // mov           v17.16b, v2.16b
-  .long  0x4ea11c32                          // mov           v18.16b, v1.16b
-  .long  0x4ea01c13                          // mov           v19.16b, v0.16b
-  .long  0x4ea41c80                          // mov           v0.16b, v4.16b
-  .long  0x4ea51ca1                          // mov           v1.16b, v5.16b
-  .long  0x4ea61cc2                          // mov           v2.16b, v6.16b
-  .long  0x4ea71ce3                          // mov           v3.16b, v7.16b
-  .long  0x4eb31e64                          // mov           v4.16b, v19.16b
-  .long  0x4eb21e45                          // mov           v5.16b, v18.16b
-  .long  0x4eb11e26                          // mov           v6.16b, v17.16b
-  .long  0x4eb01e07                          // mov           v7.16b, v16.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_move_src_dst_aarch64
-_sk_move_src_dst_aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4ea01c04                          // mov           v4.16b, v0.16b
-  .long  0x4ea11c25                          // mov           v5.16b, v1.16b
-  .long  0x4ea21c46                          // mov           v6.16b, v2.16b
-  .long  0x4ea31c67                          // mov           v7.16b, v3.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_move_dst_src_aarch64
-_sk_move_dst_src_aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4ea41c80                          // mov           v0.16b, v4.16b
-  .long  0x4ea51ca1                          // mov           v1.16b, v5.16b
-  .long  0x4ea61cc2                          // mov           v2.16b, v6.16b
-  .long  0x4ea71ce3                          // mov           v3.16b, v7.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_premul_aarch64
-_sk_premul_aarch64:
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x6e23dc00                          // fmul          v0.4s, v0.4s, v3.4s
-  .long  0x6e23dc21                          // fmul          v1.4s, v1.4s, v3.4s
-  .long  0x6e23dc42                          // fmul          v2.4s, v2.4s, v3.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_unpremul_aarch64
-_sk_unpremul_aarch64:
-  .long  0x4d40c850                          // ld1r          {v16.4s}, [x2]
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4ea0d871                          // fcmeq         v17.4s, v3.4s, #0.0
-  .long  0x6e23fe10                          // fdiv          v16.4s, v16.4s, v3.4s
-  .long  0x4e711e10                          // bic           v16.16b, v16.16b, v17.16b
-  .long  0x6e20de00                          // fmul          v0.4s, v16.4s, v0.4s
-  .long  0x6e21de01                          // fmul          v1.4s, v16.4s, v1.4s
-  .long  0x6e22de02                          // fmul          v2.4s, v16.4s, v2.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_from_srgb_aarch64
-_sk_from_srgb_aarch64:
-  .long  0x9100e048                          // add           x8, x2, #0x38
-  .long  0x4d40c910                          // ld1r          {v16.4s}, [x8]
-  .long  0x9100d048                          // add           x8, x2, #0x34
-  .long  0x2d47cc52                          // ldp           s18, s19, [x2,#60]
-  .long  0x4d40c911                          // ld1r          {v17.4s}, [x8]
-  .long  0x6e22dc54                          // fmul          v20.4s, v2.4s, v2.4s
-  .long  0x4eb01e15                          // mov           v21.16b, v16.16b
-  .long  0x4eb01e17                          // mov           v23.16b, v16.16b
-  .long  0x4f921050                          // fmla          v16.4s, v2.4s, v18.s[0]
-  .long  0x4eb11e36                          // mov           v22.16b, v17.16b
-  .long  0x4eb11e38                          // mov           v24.16b, v17.16b
-  .long  0x4e34ce11                          // fmla          v17.4s, v16.4s, v20.4s
-  .long  0x6e20dc10                          // fmul          v16.4s, v0.4s, v0.4s
-  .long  0x91011048                          // add           x8, x2, #0x44
-  .long  0x4f921015                          // fmla          v21.4s, v0.4s, v18.s[0]
-  .long  0x4e30ceb6                          // fmla          v22.4s, v21.4s, v16.4s
-  .long  0x4d40c910                          // ld1r          {v16.4s}, [x8]
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x6e21dc34                          // fmul          v20.4s, v1.4s, v1.4s
-  .long  0x4f921037                          // fmla          v23.4s, v1.4s, v18.s[0]
-  .long  0x4f939015                          // fmul          v21.4s, v0.4s, v19.s[0]
-  .long  0x4f939032                          // fmul          v18.4s, v1.4s, v19.s[0]
-  .long  0x4f939053                          // fmul          v19.4s, v2.4s, v19.s[0]
-  .long  0x6ea0e600                          // fcmgt         v0.4s, v16.4s, v0.4s
-  .long  0x6ea1e601                          // fcmgt         v1.4s, v16.4s, v1.4s
-  .long  0x6ea2e602                          // fcmgt         v2.4s, v16.4s, v2.4s
-  .long  0x4e34cef8                          // fmla          v24.4s, v23.4s, v20.4s
-  .long  0x6e761ea0                          // bsl           v0.16b, v21.16b, v22.16b
-  .long  0x6e781e41                          // bsl           v1.16b, v18.16b, v24.16b
-  .long  0x6e711e62                          // bsl           v2.16b, v19.16b, v17.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_to_srgb_aarch64
-_sk_to_srgb_aarch64:
-  .long  0x6ea1d811                          // frsqrte       v17.4s, v0.4s
-  .long  0x6ea1d835                          // frsqrte       v21.4s, v1.4s
-  .long  0x6e31de37                          // fmul          v23.4s, v17.4s, v17.4s
-  .long  0x6ea1d856                          // frsqrte       v22.4s, v2.4s
-  .long  0x6e35deb9                          // fmul          v25.4s, v21.4s, v21.4s
-  .long  0x4eb7fc17                          // frsqrts       v23.4s, v0.4s, v23.4s
-  .long  0x91015048                          // add           x8, x2, #0x54
-  .long  0x6e36deda                          // fmul          v26.4s, v22.4s, v22.4s
-  .long  0x4eb9fc39                          // frsqrts       v25.4s, v1.4s, v25.4s
-  .long  0x6e37de31                          // fmul          v17.4s, v17.4s, v23.4s
-  .long  0x4d40c914                          // ld1r          {v20.4s}, [x8]
-  .long  0x4ebafc5a                          // frsqrts       v26.4s, v2.4s, v26.4s
-  .long  0x6e39deb5                          // fmul          v21.4s, v21.4s, v25.4s
-  .long  0x4ea1da37                          // frecpe        v23.4s, v17.4s
-  .long  0xbd405053                          // ldr           s19, [x2,#80]
-  .long  0x91016048                          // add           x8, x2, #0x58
-  .long  0x6e3aded6                          // fmul          v22.4s, v22.4s, v26.4s
-  .long  0x4ea1dabb                          // frecpe        v27.4s, v21.4s
-  .long  0x4e37fe3d                          // frecps        v29.4s, v17.4s, v23.4s
-  .long  0x2d494052                          // ldp           s18, s16, [x2,#72]
-  .long  0x4d40c918                          // ld1r          {v24.4s}, [x8]
-  .long  0x4ea1dadc                          // frecpe        v28.4s, v22.4s
-  .long  0x6e3ddef7                          // fmul          v23.4s, v23.4s, v29.4s
-  .long  0x4e3bfebd                          // frecps        v29.4s, v21.4s, v27.4s
-  .long  0x6e3ddf7b                          // fmul          v27.4s, v27.4s, v29.4s
-  .long  0x4e3cfedd                          // frecps        v29.4s, v22.4s, v28.4s
-  .long  0x6e3ddf9c                          // fmul          v28.4s, v28.4s, v29.4s
-  .long  0x4eb41e9d                          // mov           v29.16b, v20.16b
-  .long  0x6ea1da39                          // frsqrte       v25.4s, v17.4s
-  .long  0x4f9312fd                          // fmla          v29.4s, v23.4s, v19.s[0]
-  .long  0x4eb41e97                          // mov           v23.16b, v20.16b
-  .long  0x4f92901a                          // fmul          v26.4s, v0.4s, v18.s[0]
-  .long  0x4f931377                          // fmla          v23.4s, v27.4s, v19.s[0]
-  .long  0x4f931394                          // fmla          v20.4s, v28.4s, v19.s[0]
-  .long  0x4f929033                          // fmul          v19.4s, v1.4s, v18.s[0]
-  .long  0x4f929052                          // fmul          v18.4s, v2.4s, v18.s[0]
-  .long  0x6ea0e700                          // fcmgt         v0.4s, v24.4s, v0.4s
-  .long  0x6ea1e701                          // fcmgt         v1.4s, v24.4s, v1.4s
-  .long  0x6ea2e702                          // fcmgt         v2.4s, v24.4s, v2.4s
-  .long  0x6e39df38                          // fmul          v24.4s, v25.4s, v25.4s
-  .long  0x6ea1dabb                          // frsqrte       v27.4s, v21.4s
-  .long  0x4eb8fe31                          // frsqrts       v17.4s, v17.4s, v24.4s
-  .long  0x6ea1dadc                          // frsqrte       v28.4s, v22.4s
-  .long  0x6e3bdf78                          // fmul          v24.4s, v27.4s, v27.4s
-  .long  0x6e31df31                          // fmul          v17.4s, v25.4s, v17.4s
-  .long  0x4eb8feb5                          // frsqrts       v21.4s, v21.4s, v24.4s
-  .long  0x6e3cdf98                          // fmul          v24.4s, v28.4s, v28.4s
-  .long  0x4f90123d                          // fmla          v29.4s, v17.4s, v16.s[0]
-  .long  0x4d40c851                          // ld1r          {v17.4s}, [x2]
-  .long  0x4eb8fed6                          // frsqrts       v22.4s, v22.4s, v24.4s
-  .long  0x6e35df75                          // fmul          v21.4s, v27.4s, v21.4s
-  .long  0x6e36df96                          // fmul          v22.4s, v28.4s, v22.4s
-  .long  0xf8408423                          // ldr           x3, [x1],#8
-  .long  0x4f9012b7                          // fmla          v23.4s, v21.4s, v16.s[0]
-  .long  0x4f9012d4                          // fmla          v20.4s, v22.4s, v16.s[0]
-  .long  0x4ebdf630                          // fmin          v16.4s, v17.4s, v29.4s
-  .long  0x4eb7f635                          // fmin          v21.4s, v17.4s, v23.4s
-  .long  0x4eb4f631                          // fmin          v17.4s, v17.4s, v20.4s
-  .long  0x6e701f40                          // bsl           v0.16b, v26.16b, v16.16b
-  .long  0x6e751e61                          // bsl           v1.16b, v19.16b, v21.16b
-  .long  0x6e711e42                          // bsl           v2.16b, v18.16b, v17.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_scale_1_float_aarch64
-_sk_scale_1_float_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xbd400110                          // ldr           s16, [x8]
-  .long  0x4f909000                          // fmul          v0.4s, v0.4s, v16.s[0]
-  .long  0x4f909021                          // fmul          v1.4s, v1.4s, v16.s[0]
-  .long  0x4f909042                          // fmul          v2.4s, v2.4s, v16.s[0]
-  .long  0x4f909063                          // fmul          v3.4s, v3.4s, v16.s[0]
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_scale_u8_aarch64
-_sk_scale_u8_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xbd400c51                          // ldr           s17, [x2,#12]
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x8b000108                          // add           x8, x8, x0
-  .long  0x39400109                          // ldrb          w9, [x8]
-  .long  0x3940050a                          // ldrb          w10, [x8,#1]
-  .long  0x3940090b                          // ldrb          w11, [x8,#2]
-  .long  0x39400d08                          // ldrb          w8, [x8,#3]
-  .long  0x4e021d30                          // mov           v16.h[0], w9
-  .long  0x4e061d50                          // mov           v16.h[1], w10
-  .long  0x4e0a1d70                          // mov           v16.h[2], w11
-  .long  0x4e0e1d10                          // mov           v16.h[3], w8
-  .long  0x2f07b7f0                          // bic           v16.4h, #0xff, lsl #8
-  .long  0x2f10a610                          // uxtl          v16.4s, v16.4h
-  .long  0x6e21da10                          // ucvtf         v16.4s, v16.4s
-  .long  0x4f919210                          // fmul          v16.4s, v16.4s, v17.s[0]
-  .long  0x6e20de00                          // fmul          v0.4s, v16.4s, v0.4s
-  .long  0x6e21de01                          // fmul          v1.4s, v16.4s, v1.4s
-  .long  0x6e22de02                          // fmul          v2.4s, v16.4s, v2.4s
-  .long  0x6e23de03                          // fmul          v3.4s, v16.4s, v3.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_lerp_1_float_aarch64
-_sk_lerp_1_float_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0x4ea4d411                          // fsub          v17.4s, v0.4s, v4.4s
-  .long  0x4ea41c80                          // mov           v0.16b, v4.16b
-  .long  0x4ea5d432                          // fsub          v18.4s, v1.4s, v5.4s
-  .long  0xbd400110                          // ldr           s16, [x8]
-  .long  0x4ea51ca1                          // mov           v1.16b, v5.16b
-  .long  0x4f901220                          // fmla          v0.4s, v17.4s, v16.s[0]
-  .long  0x4ea6d451                          // fsub          v17.4s, v2.4s, v6.4s
-  .long  0x4f901241                          // fmla          v1.4s, v18.4s, v16.s[0]
-  .long  0x4ea61cc2                          // mov           v2.16b, v6.16b
-  .long  0x4ea7d472                          // fsub          v18.4s, v3.4s, v7.4s
-  .long  0x4ea71ce3                          // mov           v3.16b, v7.16b
-  .long  0x4f901222                          // fmla          v2.4s, v17.4s, v16.s[0]
-  .long  0x4f901243                          // fmla          v3.4s, v18.4s, v16.s[0]
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_lerp_u8_aarch64
-_sk_lerp_u8_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xbd400c51                          // ldr           s17, [x2,#12]
-  .long  0x4ea4d412                          // fsub          v18.4s, v0.4s, v4.4s
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x8b000108                          // add           x8, x8, x0
-  .long  0x39400109                          // ldrb          w9, [x8]
-  .long  0x3940050a                          // ldrb          w10, [x8,#1]
-  .long  0x3940090b                          // ldrb          w11, [x8,#2]
-  .long  0x39400d08                          // ldrb          w8, [x8,#3]
-  .long  0x4e021d30                          // mov           v16.h[0], w9
-  .long  0x4e061d50                          // mov           v16.h[1], w10
-  .long  0x4e0a1d70                          // mov           v16.h[2], w11
-  .long  0x4e0e1d10                          // mov           v16.h[3], w8
-  .long  0x2f07b7f0                          // bic           v16.4h, #0xff, lsl #8
-  .long  0x2f10a600                          // uxtl          v0.4s, v16.4h
-  .long  0x6e21d800                          // ucvtf         v0.4s, v0.4s
-  .long  0x4f919010                          // fmul          v16.4s, v0.4s, v17.s[0]
-  .long  0x4ea41c80                          // mov           v0.16b, v4.16b
-  .long  0x4ea5d431                          // fsub          v17.4s, v1.4s, v5.4s
-  .long  0x4ea51ca1                          // mov           v1.16b, v5.16b
-  .long  0x4e32ce00                          // fmla          v0.4s, v16.4s, v18.4s
-  .long  0x4ea6d452                          // fsub          v18.4s, v2.4s, v6.4s
-  .long  0x4e31ce01                          // fmla          v1.4s, v16.4s, v17.4s
-  .long  0x4ea61cc2                          // mov           v2.16b, v6.16b
-  .long  0x4ea7d471                          // fsub          v17.4s, v3.4s, v7.4s
-  .long  0x4ea71ce3                          // mov           v3.16b, v7.16b
-  .long  0x4e32ce02                          // fmla          v2.4s, v16.4s, v18.4s
-  .long  0x4e31ce03                          // fmla          v3.4s, v16.4s, v17.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_lerp_565_aarch64
-_sk_lerp_565_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xd37ff809                          // lsl           x9, x0, #1
-  .long  0x2d4ec851                          // ldp           s17, s18, [x2,#116]
-  .long  0x4ea4d413                          // fsub          v19.4s, v0.4s, v4.4s
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x4ea41c80                          // mov           v0.16b, v4.16b
-  .long  0xfc696903                          // ldr           d3, [x8,x9]
-  .long  0x9101a048                          // add           x8, x2, #0x68
-  .long  0x4d40c910                          // ld1r          {v16.4s}, [x8]
-  .long  0x9101b048                          // add           x8, x2, #0x6c
-  .long  0x2f10a463                          // uxtl          v3.4s, v3.4h
-  .long  0x4e231e10                          // and           v16.16b, v16.16b, v3.16b
-  .long  0x4e21da10                          // scvtf         v16.4s, v16.4s
-  .long  0x4f919210                          // fmul          v16.4s, v16.4s, v17.s[0]
-  .long  0x4d40c911                          // ld1r          {v17.4s}, [x8]
-  .long  0x9101c048                          // add           x8, x2, #0x70
-  .long  0x4e33ce00                          // fmla          v0.4s, v16.4s, v19.4s
-  .long  0x4ea5d430                          // fsub          v16.4s, v1.4s, v5.4s
-  .long  0x4e231e31                          // and           v17.16b, v17.16b, v3.16b
-  .long  0x4e21da31                          // scvtf         v17.4s, v17.4s
-  .long  0x4f929231                          // fmul          v17.4s, v17.4s, v18.s[0]
-  .long  0x4d40c912                          // ld1r          {v18.4s}, [x8]
-  .long  0x4ea51ca1                          // mov           v1.16b, v5.16b
-  .long  0x4e30ce21                          // fmla          v1.4s, v17.4s, v16.4s
-  .long  0xbd407c50                          // ldr           s16, [x2,#124]
-  .long  0x4e231e52                          // and           v18.16b, v18.16b, v3.16b
-  .long  0x4d40c843                          // ld1r          {v3.4s}, [x2]
-  .long  0x4e21da52                          // scvtf         v18.4s, v18.4s
-  .long  0x4ea6d451                          // fsub          v17.4s, v2.4s, v6.4s
-  .long  0x4ea61cc2                          // mov           v2.16b, v6.16b
-  .long  0x4f909250                          // fmul          v16.4s, v18.4s, v16.s[0]
-  .long  0x4e31ce02                          // fmla          v2.4s, v16.4s, v17.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_load_tables_aarch64
-_sk_load_tables_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0x9100404b                          // add           x11, x2, #0x10
-  .long  0x4d40c960                          // ld1r          {v0.4s}, [x11]
-  .long  0xd37ef409                          // lsl           x9, x0, #2
-  .long  0xa9402d0a                          // ldp           x10, x11, [x8]
-  .long  0x3ce96942                          // ldr           q2, [x10,x9]
-  .long  0xa9412109                          // ldp           x9, x8, [x8,#16]
-  .long  0x4e221c01                          // and           v1.16b, v0.16b, v2.16b
-  .long  0x0e143c2c                          // mov           w12, v1.s[2]
-  .long  0xbc6c5971                          // ldr           s17, [x11,w12,uxtw #2]
-  .long  0x1e26002c                          // fmov          w12, s1
-  .long  0x6f380443                          // ushr          v3.4s, v2.4s, #8
-  .long  0x6f300450                          // ushr          v16.4s, v2.4s, #16
-  .long  0x8b2c496c                          // add           x12, x11, w12, uxtw #2
-  .long  0x0e0c3c2a                          // mov           w10, v1.s[1]
-  .long  0x0e1c3c2d                          // mov           w13, v1.s[3]
-  .long  0x4e231c01                          // and           v1.16b, v0.16b, v3.16b
-  .long  0x4e301c03                          // and           v3.16b, v0.16b, v16.16b
-  .long  0x0d408180                          // ld1           {v0.s}[0], [x12]
-  .long  0x0e143c2c                          // mov           w12, v1.s[2]
-  .long  0xbc6c5932                          // ldr           s18, [x9,w12,uxtw #2]
-  .long  0x1e26002c                          // fmov          w12, s1
-  .long  0x8b2a496a                          // add           x10, x11, w10, uxtw #2
-  .long  0xbc6d5970                          // ldr           s16, [x11,w13,uxtw #2]
-  .long  0x0e0c3c2b                          // mov           w11, v1.s[1]
-  .long  0x0e1c3c2d                          // mov           w13, v1.s[3]
-  .long  0x8b2c492c                          // add           x12, x9, w12, uxtw #2
-  .long  0xbc6d5933                          // ldr           s19, [x9,w13,uxtw #2]
-  .long  0x0e0c3c6d                          // mov           w13, v3.s[1]
-  .long  0x8b2b4929                          // add           x9, x9, w11, uxtw #2
-  .long  0x0e143c6b                          // mov           w11, v3.s[2]
-  .long  0x0d408181                          // ld1           {v1.s}[0], [x12]
-  .long  0x0e1c3c6c                          // mov           w12, v3.s[3]
-  .long  0x0d409140                          // ld1           {v0.s}[1], [x10]
-  .long  0x1e26006a                          // fmov          w10, s3
-  .long  0xbd400c43                          // ldr           s3, [x2,#12]
-  .long  0x6f280442                          // ushr          v2.4s, v2.4s, #24
-  .long  0x4e21d842                          // scvtf         v2.4s, v2.4s
-  .long  0x8b2a490a                          // add           x10, x8, w10, uxtw #2
-  .long  0x4f839043                          // fmul          v3.4s, v2.4s, v3.s[0]
-  .long  0x0d408142                          // ld1           {v2.s}[0], [x10]
-  .long  0x8b2d490a                          // add           x10, x8, w13, uxtw #2
-  .long  0x6e140620                          // mov           v0.s[2], v17.s[0]
-  .long  0xbc6b5911                          // ldr           s17, [x8,w11,uxtw #2]
-  .long  0x0d409121                          // ld1           {v1.s}[1], [x9]
-  .long  0x0d409142                          // ld1           {v2.s}[1], [x10]
-  .long  0x6e1c0600                          // mov           v0.s[3], v16.s[0]
-  .long  0xbc6c5910                          // ldr           s16, [x8,w12,uxtw #2]
-  .long  0x6e140641                          // mov           v1.s[2], v18.s[0]
-  .long  0x6e140622                          // mov           v2.s[2], v17.s[0]
-  .long  0x6e1c0661                          // mov           v1.s[3], v19.s[0]
-  .long  0x6e1c0602                          // mov           v2.s[3], v16.s[0]
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_load_a8_aarch64
-_sk_load_a8_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xbd400c43                          // ldr           s3, [x2,#12]
-  .long  0x6f00e400                          // movi          v0.2d, #0x0
-  .long  0x6f00e401                          // movi          v1.2d, #0x0
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x8b000108                          // add           x8, x8, x0
-  .long  0x39400109                          // ldrb          w9, [x8]
-  .long  0x3940050a                          // ldrb          w10, [x8,#1]
-  .long  0x3940090b                          // ldrb          w11, [x8,#2]
-  .long  0x39400d08                          // ldrb          w8, [x8,#3]
-  .long  0x4e021d22                          // mov           v2.h[0], w9
-  .long  0x4e061d42                          // mov           v2.h[1], w10
-  .long  0x4e0a1d62                          // mov           v2.h[2], w11
-  .long  0x4e0e1d02                          // mov           v2.h[3], w8
-  .long  0x2f07b7e2                          // bic           v2.4h, #0xff, lsl #8
-  .long  0x2f10a442                          // uxtl          v2.4s, v2.4h
-  .long  0x6e21d842                          // ucvtf         v2.4s, v2.4s
-  .long  0x4f839043                          // fmul          v3.4s, v2.4s, v3.s[0]
-  .long  0x6f00e402                          // movi          v2.2d, #0x0
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_store_a8_aarch64
-_sk_store_a8_aarch64:
-  .long  0xf9400028                          // ldr           x8, [x1]
-  .long  0xbd400850                          // ldr           s16, [x2,#8]
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x4f909070                          // fmul          v16.4s, v3.4s, v16.s[0]
-  .long  0x6e21aa10                          // fcvtnu        v16.4s, v16.4s
-  .long  0x0e612a10                          // xtn           v16.4h, v16.4s
-  .long  0x0e0e3e09                          // umov          w9, v16.h[3]
-  .long  0x8b000108                          // add           x8, x8, x0
-  .long  0x39000d09                          // strb          w9, [x8,#3]
-  .long  0x0e0a3e09                          // umov          w9, v16.h[2]
-  .long  0x39000909                          // strb          w9, [x8,#2]
-  .long  0x0e063e09                          // umov          w9, v16.h[1]
-  .long  0x39000509                          // strb          w9, [x8,#1]
-  .long  0x0e023e09                          // umov          w9, v16.h[0]
-  .long  0x39000109                          // strb          w9, [x8]
-  .long  0xf9400423                          // ldr           x3, [x1,#8]
-  .long  0x91004021                          // add           x1, x1, #0x10
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_load_565_aarch64
-_sk_load_565_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xd37ff809                          // lsl           x9, x0, #1
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0xfc696900                          // ldr           d0, [x8,x9]
-  .long  0x9101a048                          // add           x8, x2, #0x68
-  .long  0x4d40c901                          // ld1r          {v1.4s}, [x8]
-  .long  0x9101b048                          // add           x8, x2, #0x6c
-  .long  0x4d40c902                          // ld1r          {v2.4s}, [x8]
-  .long  0x9101c048                          // add           x8, x2, #0x70
-  .long  0x4d40c903                          // ld1r          {v3.4s}, [x8]
-  .long  0x2f10a400                          // uxtl          v0.4s, v0.4h
-  .long  0x4e201c21                          // and           v1.16b, v1.16b, v0.16b
-  .long  0x4e201c42                          // and           v2.16b, v2.16b, v0.16b
-  .long  0x4e201c71                          // and           v17.16b, v3.16b, v0.16b
-  .long  0x2d4e8c50                          // ldp           s16, s3, [x2,#116]
-  .long  0x4e21d820                          // scvtf         v0.4s, v1.4s
-  .long  0x4e21d841                          // scvtf         v1.4s, v2.4s
-  .long  0x4e21da22                          // scvtf         v2.4s, v17.4s
-  .long  0x4f909000                          // fmul          v0.4s, v0.4s, v16.s[0]
-  .long  0xbd407c50                          // ldr           s16, [x2,#124]
-  .long  0x4f839021                          // fmul          v1.4s, v1.4s, v3.s[0]
-  .long  0x4d40c843                          // ld1r          {v3.4s}, [x2]
-  .long  0x4f909042                          // fmul          v2.4s, v2.4s, v16.s[0]
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_store_565_aarch64
-_sk_store_565_aarch64:
-  .long  0x2d504450                          // ldp           s16, s17, [x2,#128]
-  .long  0xf9400028                          // ldr           x8, [x1]
-  .long  0xd37ff809                          // lsl           x9, x0, #1
-  .long  0x4f909012                          // fmul          v18.4s, v0.4s, v16.s[0]
-  .long  0x4f919031                          // fmul          v17.4s, v1.4s, v17.s[0]
-  .long  0x6e21aa52                          // fcvtnu        v18.4s, v18.4s
-  .long  0x6e21aa31                          // fcvtnu        v17.4s, v17.4s
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x4f909050                          // fmul          v16.4s, v2.4s, v16.s[0]
-  .long  0x4f2b5652                          // shl           v18.4s, v18.4s, #11
-  .long  0x4f255631                          // shl           v17.4s, v17.4s, #5
-  .long  0x4eb21e31                          // orr           v17.16b, v17.16b, v18.16b
-  .long  0x6e21aa10                          // fcvtnu        v16.4s, v16.4s
-  .long  0x4eb01e30                          // orr           v16.16b, v17.16b, v16.16b
-  .long  0x0e612a10                          // xtn           v16.4h, v16.4s
-  .long  0xfc296910                          // str           d16, [x8,x9]
-  .long  0xf9400423                          // ldr           x3, [x1,#8]
-  .long  0x91004021                          // add           x1, x1, #0x10
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_load_8888_aarch64
-_sk_load_8888_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xd37ef409                          // lsl           x9, x0, #2
-  .long  0xbd400c42                          // ldr           s2, [x2,#12]
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x3ce96900                          // ldr           q0, [x8,x9]
-  .long  0x91004048                          // add           x8, x2, #0x10
-  .long  0x4d40c901                          // ld1r          {v1.4s}, [x8]
-  .long  0x6f380410                          // ushr          v16.4s, v0.4s, #8
-  .long  0x6f300411                          // ushr          v17.4s, v0.4s, #16
-  .long  0x4e201c23                          // and           v3.16b, v1.16b, v0.16b
-  .long  0x6f280400                          // ushr          v0.4s, v0.4s, #24
-  .long  0x4e301c30                          // and           v16.16b, v1.16b, v16.16b
-  .long  0x4e311c21                          // and           v1.16b, v1.16b, v17.16b
-  .long  0x4e21d863                          // scvtf         v3.4s, v3.4s
-  .long  0x4e21d811                          // scvtf         v17.4s, v0.4s
-  .long  0x4e21da10                          // scvtf         v16.4s, v16.4s
-  .long  0x4e21d832                          // scvtf         v18.4s, v1.4s
-  .long  0x4f829060                          // fmul          v0.4s, v3.4s, v2.s[0]
-  .long  0x4f829223                          // fmul          v3.4s, v17.4s, v2.s[0]
-  .long  0x4f829201                          // fmul          v1.4s, v16.4s, v2.s[0]
-  .long  0x4f829242                          // fmul          v2.4s, v18.4s, v2.s[0]
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_store_8888_aarch64
-_sk_store_8888_aarch64:
-  .long  0xbd400850                          // ldr           s16, [x2,#8]
-  .long  0xf9400028                          // ldr           x8, [x1]
-  .long  0xd37ef409                          // lsl           x9, x0, #2
-  .long  0x4f909032                          // fmul          v18.4s, v1.4s, v16.s[0]
-  .long  0x4f909011                          // fmul          v17.4s, v0.4s, v16.s[0]
-  .long  0x6e21aa52                          // fcvtnu        v18.4s, v18.4s
-  .long  0x6e21aa31                          // fcvtnu        v17.4s, v17.4s
-  .long  0x4f285652                          // shl           v18.4s, v18.4s, #8
-  .long  0x4eb11e51                          // orr           v17.16b, v18.16b, v17.16b
-  .long  0x4f909052                          // fmul          v18.4s, v2.4s, v16.s[0]
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x4f909070                          // fmul          v16.4s, v3.4s, v16.s[0]
-  .long  0x6e21aa52                          // fcvtnu        v18.4s, v18.4s
-  .long  0x6e21aa10                          // fcvtnu        v16.4s, v16.4s
-  .long  0x4f305652                          // shl           v18.4s, v18.4s, #16
-  .long  0x4eb21e31                          // orr           v17.16b, v17.16b, v18.16b
-  .long  0x4f385610                          // shl           v16.4s, v16.4s, #24
-  .long  0x4eb01e30                          // orr           v16.16b, v17.16b, v16.16b
-  .long  0x3ca96910                          // str           q16, [x8,x9]
-  .long  0xf9400423                          // ldr           x3, [x1,#8]
-  .long  0x91004021                          // add           x1, x1, #0x10
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_load_f16_aarch64
-_sk_load_f16_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x8b000d08                          // add           x8, x8, x0, lsl #3
-  .long  0x0c400510                          // ld4           {v16.4h-v19.4h}, [x8]
-  .long  0x0e217a00                          // fcvtl         v0.4s, v16.4h
-  .long  0x0e217a21                          // fcvtl         v1.4s, v17.4h
-  .long  0x0e217a42                          // fcvtl         v2.4s, v18.4h
-  .long  0x0e217a63                          // fcvtl         v3.4s, v19.4h
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_store_f16_aarch64
-_sk_store_f16_aarch64:
-  .long  0xf9400028                          // ldr           x8, [x1]
-  .long  0x0e216810                          // fcvtn         v16.4h, v0.4s
-  .long  0x0e216831                          // fcvtn         v17.4h, v1.4s
-  .long  0x0e216852                          // fcvtn         v18.4h, v2.4s
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x0e216873                          // fcvtn         v19.4h, v3.4s
-  .long  0x8b000d08                          // add           x8, x8, x0, lsl #3
-  .long  0x0c000510                          // st4           {v16.4h-v19.4h}, [x8]
-  .long  0xf9400423                          // ldr           x3, [x1,#8]
-  .long  0x91004021                          // add           x1, x1, #0x10
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_store_f32_aarch64
-_sk_store_f32_aarch64:
-  .long  0xf9400028                          // ldr           x8, [x1]
-  .long  0xf9400108                          // ldr           x8, [x8]
-  .long  0x8b001108                          // add           x8, x8, x0, lsl #4
-  .long  0x4c000900                          // st4           {v0.4s-v3.4s}, [x8]
-  .long  0xf9400423                          // ldr           x3, [x1,#8]
-  .long  0x91004021                          // add           x1, x1, #0x10
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_clamp_x_aarch64
-_sk_clamp_x_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0x6f00e411                          // movi          v17.2d, #0x0
-  .long  0x4e20f620                          // fmax          v0.4s, v17.4s, v0.4s
-  .long  0x6f07e7f1                          // movi          v17.2d, #0xffffffffffffffff
-  .long  0x4d40c910                          // ld1r          {v16.4s}, [x8]
-  .long  0x4eb18610                          // add           v16.4s, v16.4s, v17.4s
-  .long  0x4eb0f400                          // fmin          v0.4s, v0.4s, v16.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_clamp_y_aarch64
-_sk_clamp_y_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0x6f00e411                          // movi          v17.2d, #0x0
-  .long  0x4e21f621                          // fmax          v1.4s, v17.4s, v1.4s
-  .long  0x6f07e7f1                          // movi          v17.2d, #0xffffffffffffffff
-  .long  0x4d40c910                          // ld1r          {v16.4s}, [x8]
-  .long  0x4eb18610                          // add           v16.4s, v16.4s, v17.4s
-  .long  0x4eb0f421                          // fmin          v1.4s, v1.4s, v16.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_repeat_x_aarch64
-_sk_repeat_x_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0x6f07e7f1                          // movi          v17.2d, #0xffffffffffffffff
-  .long  0xbd400110                          // ldr           s16, [x8]
-  .long  0x4e040612                          // dup           v18.4s, v16.s[0]
-  .long  0x4eb18651                          // add           v17.4s, v18.4s, v17.4s
-  .long  0x6e32fc12                          // fdiv          v18.4s, v0.4s, v18.4s
-  .long  0x4e219a52                          // frintm        v18.4s, v18.4s
-  .long  0x4f905240                          // fmls          v0.4s, v18.4s, v16.s[0]
-  .long  0x4eb1f400                          // fmin          v0.4s, v0.4s, v17.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_repeat_y_aarch64
-_sk_repeat_y_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0x6f07e7f1                          // movi          v17.2d, #0xffffffffffffffff
-  .long  0xbd400110                          // ldr           s16, [x8]
-  .long  0x4e040612                          // dup           v18.4s, v16.s[0]
-  .long  0x4eb18651                          // add           v17.4s, v18.4s, v17.4s
-  .long  0x6e32fc32                          // fdiv          v18.4s, v1.4s, v18.4s
-  .long  0x4e219a52                          // frintm        v18.4s, v18.4s
-  .long  0x4f905241                          // fmls          v1.4s, v18.4s, v16.s[0]
-  .long  0x4eb1f421                          // fmin          v1.4s, v1.4s, v17.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_mirror_x_aarch64
-_sk_mirror_x_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xbd400110                          // ldr           s16, [x8]
-  .long  0x4e040611                          // dup           v17.4s, v16.s[0]
-  .long  0x1e302a10                          // fadd          s16, s16, s16
-  .long  0x4eb1d400                          // fsub          v0.4s, v0.4s, v17.4s
-  .long  0x4e040612                          // dup           v18.4s, v16.s[0]
-  .long  0x6e32fc12                          // fdiv          v18.4s, v0.4s, v18.4s
-  .long  0x4e219a52                          // frintm        v18.4s, v18.4s
-  .long  0x4f905240                          // fmls          v0.4s, v18.4s, v16.s[0]
-  .long  0x6f07e7f0                          // movi          v16.2d, #0xffffffffffffffff
-  .long  0x4eb1d400                          // fsub          v0.4s, v0.4s, v17.4s
-  .long  0x4eb08630                          // add           v16.4s, v17.4s, v16.4s
-  .long  0x4ea0f800                          // fabs          v0.4s, v0.4s
-  .long  0x4eb0f400                          // fmin          v0.4s, v0.4s, v16.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_mirror_y_aarch64
-_sk_mirror_y_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xbd400110                          // ldr           s16, [x8]
-  .long  0x4e040611                          // dup           v17.4s, v16.s[0]
-  .long  0x1e302a10                          // fadd          s16, s16, s16
-  .long  0x4eb1d421                          // fsub          v1.4s, v1.4s, v17.4s
-  .long  0x4e040612                          // dup           v18.4s, v16.s[0]
-  .long  0x6e32fc32                          // fdiv          v18.4s, v1.4s, v18.4s
-  .long  0x4e219a52                          // frintm        v18.4s, v18.4s
-  .long  0x4f905241                          // fmls          v1.4s, v18.4s, v16.s[0]
-  .long  0x6f07e7f0                          // movi          v16.2d, #0xffffffffffffffff
-  .long  0x4eb1d421                          // fsub          v1.4s, v1.4s, v17.4s
-  .long  0x4eb08630                          // add           v16.4s, v17.4s, v16.4s
-  .long  0x4ea0f821                          // fabs          v1.4s, v1.4s
-  .long  0x4eb0f421                          // fmin          v1.4s, v1.4s, v16.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_matrix_2x3_aarch64
-_sk_matrix_2x3_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xaa0803e9                          // mov           x9, x8
-  .long  0x9100410a                          // add           x10, x8, #0x10
-  .long  0x4ddfc932                          // ld1r          {v18.4s}, [x9], #4
-  .long  0x4d40c950                          // ld1r          {v16.4s}, [x10]
-  .long  0x2d415113                          // ldp           s19, s20, [x8,#8]
-  .long  0x9100510a                          // add           x10, x8, #0x14
-  .long  0x4d40c951                          // ld1r          {v17.4s}, [x10]
-  .long  0x4f931030                          // fmla          v16.4s, v1.4s, v19.s[0]
-  .long  0xbd400133                          // ldr           s19, [x9]
-  .long  0x4f941031                          // fmla          v17.4s, v1.4s, v20.s[0]
-  .long  0x4e20ce50                          // fmla          v16.4s, v18.4s, v0.4s
-  .long  0x4f931011                          // fmla          v17.4s, v0.4s, v19.s[0]
-  .long  0x4eb01e00                          // mov           v0.16b, v16.16b
-  .long  0x4eb11e21                          // mov           v1.16b, v17.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_matrix_3x4_aarch64
-_sk_matrix_3x4_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xaa0803e9                          // mov           x9, x8
-  .long  0x9100910a                          // add           x10, x8, #0x24
-  .long  0x4ddfc933                          // ld1r          {v19.4s}, [x9], #4
-  .long  0x4d40c950                          // ld1r          {v16.4s}, [x10]
-  .long  0x9100a10a                          // add           x10, x8, #0x28
-  .long  0x4d40c951                          // ld1r          {v17.4s}, [x10]
-  .long  0x9100b10a                          // add           x10, x8, #0x2c
-  .long  0x2d435514                          // ldp           s20, s21, [x8,#24]
-  .long  0xbd402116                          // ldr           s22, [x8,#32]
-  .long  0x4d40c952                          // ld1r          {v18.4s}, [x10]
-  .long  0x4f941050                          // fmla          v16.4s, v2.4s, v20.s[0]
-  .long  0x4f951051                          // fmla          v17.4s, v2.4s, v21.s[0]
-  .long  0x4f961052                          // fmla          v18.4s, v2.4s, v22.s[0]
-  .long  0x2d425502                          // ldp           s2, s21, [x8,#16]
-  .long  0x2d415d14                          // ldp           s20, s23, [x8,#8]
-  .long  0x4f821031                          // fmla          v17.4s, v1.4s, v2.s[0]
-  .long  0xbd400122                          // ldr           s2, [x9]
-  .long  0x4f971030                          // fmla          v16.4s, v1.4s, v23.s[0]
-  .long  0x4f951032                          // fmla          v18.4s, v1.4s, v21.s[0]
-  .long  0x4e20ce70                          // fmla          v16.4s, v19.4s, v0.4s
-  .long  0x4f941012                          // fmla          v18.4s, v0.4s, v20.s[0]
-  .long  0x4f821011                          // fmla          v17.4s, v0.4s, v2.s[0]
-  .long  0x4eb01e00                          // mov           v0.16b, v16.16b
-  .long  0x4eb11e21                          // mov           v1.16b, v17.16b
-  .long  0x4eb21e42                          // mov           v2.16b, v18.16b
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_matrix_perspective_aarch64
-_sk_matrix_perspective_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xaa0803e9                          // mov           x9, x8
-  .long  0x9100510a                          // add           x10, x8, #0x14
-  .long  0x4ddfc930                          // ld1r          {v16.4s}, [x9], #4
-  .long  0x4d40c951                          // ld1r          {v17.4s}, [x10]
-  .long  0x9100810a                          // add           x10, x8, #0x20
-  .long  0x4d40c952                          // ld1r          {v18.4s}, [x10]
-  .long  0x2d41d113                          // ldp           s19, s20, [x8,#12]
-  .long  0x2d435915                          // ldp           s21, s22, [x8,#24]
-  .long  0x91002108                          // add           x8, x8, #0x8
-  .long  0x4f941031                          // fmla          v17.4s, v1.4s, v20.s[0]
-  .long  0x4d40c914                          // ld1r          {v20.4s}, [x8]
-  .long  0x4f961032                          // fmla          v18.4s, v1.4s, v22.s[0]
-  .long  0xbd400136                          // ldr           s22, [x9]
-  .long  0x4f951012                          // fmla          v18.4s, v0.4s, v21.s[0]
-  .long  0x4f931011                          // fmla          v17.4s, v0.4s, v19.s[0]
-  .long  0x4f961034                          // fmla          v20.4s, v1.4s, v22.s[0]
-  .long  0x4ea1da41                          // frecpe        v1.4s, v18.4s
-  .long  0x4e21fe52                          // frecps        v18.4s, v18.4s, v1.4s
-  .long  0x6e32dc32                          // fmul          v18.4s, v1.4s, v18.4s
-  .long  0x4e20ce14                          // fmla          v20.4s, v16.4s, v0.4s
-  .long  0x6e32de21                          // fmul          v1.4s, v17.4s, v18.4s
-  .long  0x6e32de80                          // fmul          v0.4s, v20.4s, v18.4s
-  .long  0xd61f0060                          // br            x3
-
-.globl _sk_linear_gradient_2stops_aarch64
-_sk_linear_gradient_2stops_aarch64:
-  .long  0xa8c10c28                          // ldp           x8, x3, [x1],#16
-  .long  0xad404503                          // ldp           q3, q17, [x8]
-  .long  0x4e040470                          // dup           v16.4s, v3.s[0]
-  .long  0x4e0c0461                          // dup           v1.4s, v3.s[1]
-  .long  0x4e140462                          // dup           v2.4s, v3.s[2]
-  .long  0x4e1c0463                          // dup           v3.4s, v3.s[3]
-  .long  0x4f911010                          // fmla          v16.4s, v0.4s, v17.s[0]
-  .long  0x4fb11001                          // fmla          v1.4s, v0.4s, v17.s[1]
-  .long  0x4f911802                          // fmla          v2.4s, v0.4s, v17.s[2]
-  .long  0x4fb11803                          // fmla          v3.4s, v0.4s, v17.s[3]
-  .long  0x4eb01e00                          // mov           v0.16b, v16.16b
-  .long  0xd61f0060                          // br            x3
-#elif defined(__arm__)
-.balign 4
-
-.globl _sk_start_pipeline_vfp4
-_sk_start_pipeline_vfp4:
-  .long  0xe92d41f0                          // push          {r4, r5, r6, r7, r8, lr}
-  .long  0xe1a07001                          // mov           r7, r1
-  .long  0xe1a04000                          // mov           r4, r0
-  .long  0xe1a05003                          // mov           r5, r3
-  .long  0xe1a08002                          // mov           r8, r2
-  .long  0xe4976004                          // ldr           r6, [r7], #4
-  .long  0xe2840002                          // add           r0, r4, #2
-  .long  0xea00000d                          // b             58 <sk_start_pipeline_vfp4+0x58>
-  .long  0xf2800010                          // vmov.i32      d0, #0
-  .long  0xe1a00004                          // mov           r0, r4
-  .long  0xf2801010                          // vmov.i32      d1, #0
-  .long  0xe1a01007                          // mov           r1, r7
-  .long  0xf2802010                          // vmov.i32      d2, #0
-  .long  0xe1a02008                          // mov           r2, r8
-  .long  0xf2803010                          // vmov.i32      d3, #0
-  .long  0xf2804010                          // vmov.i32      d4, #0
-  .long  0xf2805010                          // vmov.i32      d5, #0
-  .long  0xf2806010                          // vmov.i32      d6, #0
-  .long  0xf2807010                          // vmov.i32      d7, #0
-  .long  0xe12fff36                          // blx           r6
-  .long  0xe2840004                          // add           r0, r4, #4
-  .long  0xe2844002                          // add           r4, r4, #2
-  .long  0xe1500005                          // cmp           r0, r5
-  .long  0x9affffef                          // bls           20 <sk_start_pipeline_vfp4+0x20>
-  .long  0xe1a00004                          // mov           r0, r4
-  .long  0xe8bd81f0                          // pop           {r4, r5, r6, r7, r8, pc}
-
-.globl _sk_just_return_vfp4
-_sk_just_return_vfp4:
-  .long  0xe12fff1e                          // bx            lr
-
-.globl _sk_seed_shader_vfp4
-_sk_seed_shader_vfp4:
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xee800b90                          // vdup.32       d16, r0
-  .long  0xf3fb0620                          // vcvt.f32.s32  d16, d16
-  .long  0xedd23b05                          // vldr          d19, [r2, #20]
-  .long  0xf2803010                          // vmov.i32      d3, #0
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe2823004                          // add           r3, r2, #4
-  .long  0xf3fb1621                          // vcvt.f32.s32  d17, d17
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xf2804010                          // vmov.i32      d4, #0
-  .long  0xf2400da2                          // vadd.f32      d16, d16, d18
-  .long  0xf2805010                          // vmov.i32      d5, #0
-  .long  0xf4a22c9f                          // vld1.32       {d2[]}, [r2 :32]
-  .long  0xf2011da2                          // vadd.f32      d1, d17, d18
-  .long  0xf2806010                          // vmov.i32      d6, #0
-  .long  0xf2030da0                          // vadd.f32      d0, d19, d16
-  .long  0xf2807010                          // vmov.i32      d7, #0
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_constant_color_vfp4
-_sk_constant_color_vfp4:
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xf4630a0f                          // vld1.8        {d16-d17}, [r3]
-  .long  0xf3b40c20                          // vdup.32       d0, d16[0]
-  .long  0xf3bc1c20                          // vdup.32       d1, d16[1]
-  .long  0xf3b42c21                          // vdup.32       d2, d17[0]
-  .long  0xf3bc3c21                          // vdup.32       d3, d17[1]
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_clear_vfp4
-_sk_clear_vfp4:
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2800010                          // vmov.i32      d0, #0
-  .long  0xf2801010                          // vmov.i32      d1, #0
-  .long  0xf2802010                          // vmov.i32      d2, #0
-  .long  0xf2803010                          // vmov.i32      d3, #0
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_plus__vfp4
-_sk_plus__vfp4:
-  .long  0xf2000d04                          // vadd.f32      d0, d0, d4
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2011d05                          // vadd.f32      d1, d1, d5
-  .long  0xf2022d06                          // vadd.f32      d2, d2, d6
-  .long  0xf2033d07                          // vadd.f32      d3, d3, d7
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_srcover_vfp4
-_sk_srcover_vfp4:
-  .long  0xf4e20c9f                          // vld1.32       {d16[]}, [r2 :32]
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2600d83                          // vsub.f32      d16, d16, d3
-  .long  0xf2040c30                          // vfma.f32      d0, d4, d16
-  .long  0xf2051c30                          // vfma.f32      d1, d5, d16
-  .long  0xf2062c30                          // vfma.f32      d2, d6, d16
-  .long  0xf2073c30                          // vfma.f32      d3, d7, d16
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_dstover_vfp4
-_sk_dstover_vfp4:
-  .long  0xf4e20c9f                          // vld1.32       {d16[]}, [r2 :32]
-  .long  0xf2651115                          // vorr          d17, d5, d5
-  .long  0xf2604d87                          // vsub.f32      d20, d16, d7
-  .long  0xf2640114                          // vorr          d16, d4, d4
-  .long  0xf2662116                          // vorr          d18, d6, d6
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2673117                          // vorr          d19, d7, d7
-  .long  0xf2400c34                          // vfma.f32      d16, d0, d20
-  .long  0xf2411c34                          // vfma.f32      d17, d1, d20
-  .long  0xf2422c34                          // vfma.f32      d18, d2, d20
-  .long  0xf2433c34                          // vfma.f32      d19, d3, d20
-  .long  0xf22001b0                          // vorr          d0, d16, d16
-  .long  0xf22111b1                          // vorr          d1, d17, d17
-  .long  0xf22221b2                          // vorr          d2, d18, d18
-  .long  0xf22331b3                          // vorr          d3, d19, d19
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_clamp_0_vfp4
-_sk_clamp_0_vfp4:
-  .long  0xf2c00010                          // vmov.i32      d16, #0
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2000f20                          // vmax.f32      d0, d0, d16
-  .long  0xf2011f20                          // vmax.f32      d1, d1, d16
-  .long  0xf2022f20                          // vmax.f32      d2, d2, d16
-  .long  0xf2033f20                          // vmax.f32      d3, d3, d16
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_clamp_1_vfp4
-_sk_clamp_1_vfp4:
-  .long  0xf4e20c9f                          // vld1.32       {d16[]}, [r2 :32]
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2200f20                          // vmin.f32      d0, d0, d16
-  .long  0xf2211f20                          // vmin.f32      d1, d1, d16
-  .long  0xf2222f20                          // vmin.f32      d2, d2, d16
-  .long  0xf2233f20                          // vmin.f32      d3, d3, d16
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_clamp_a_vfp4
-_sk_clamp_a_vfp4:
-  .long  0xf4e20c9f                          // vld1.32       {d16[]}, [r2 :32]
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2233f20                          // vmin.f32      d3, d3, d16
-  .long  0xf2200f03                          // vmin.f32      d0, d0, d3
-  .long  0xf2211f03                          // vmin.f32      d1, d1, d3
-  .long  0xf2222f03                          // vmin.f32      d2, d2, d3
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_set_rgb_vfp4
-_sk_set_rgb_vfp4:
-  .long  0xe92d4800                          // push          {fp, lr}
-  .long  0xe591e000                          // ldr           lr, [r1]
-  .long  0xe591c004                          // ldr           ip, [r1, #4]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe28e3008                          // add           r3, lr, #8
-  .long  0xf4ae0c9f                          // vld1.32       {d0[]}, [lr :32]
-  .long  0xf4a32c9f                          // vld1.32       {d2[]}, [r3 :32]
-  .long  0xe28e3004                          // add           r3, lr, #4
-  .long  0xf4a31c9f                          // vld1.32       {d1[]}, [r3 :32]
-  .long  0xe8bd4800                          // pop           {fp, lr}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_swap_rb_vfp4
-_sk_swap_rb_vfp4:
-  .long  0xeef00b40                          // vmov.f64      d16, d0
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xeeb00b42                          // vmov.f64      d0, d2
-  .long  0xeeb02b60                          // vmov.f64      d2, d16
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_swap_vfp4
-_sk_swap_vfp4:
-  .long  0xeef00b43                          // vmov.f64      d16, d3
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xeef01b42                          // vmov.f64      d17, d2
-  .long  0xeef02b41                          // vmov.f64      d18, d1
-  .long  0xeef03b40                          // vmov.f64      d19, d0
-  .long  0xeeb00b44                          // vmov.f64      d0, d4
-  .long  0xeeb01b45                          // vmov.f64      d1, d5
-  .long  0xeeb02b46                          // vmov.f64      d2, d6
-  .long  0xeeb03b47                          // vmov.f64      d3, d7
-  .long  0xeeb04b63                          // vmov.f64      d4, d19
-  .long  0xeeb05b62                          // vmov.f64      d5, d18
-  .long  0xeeb06b61                          // vmov.f64      d6, d17
-  .long  0xeeb07b60                          // vmov.f64      d7, d16
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_move_src_dst_vfp4
-_sk_move_src_dst_vfp4:
-  .long  0xeeb04b40                          // vmov.f64      d4, d0
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xeeb05b41                          // vmov.f64      d5, d1
-  .long  0xeeb06b42                          // vmov.f64      d6, d2
-  .long  0xeeb07b43                          // vmov.f64      d7, d3
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_move_dst_src_vfp4
-_sk_move_dst_src_vfp4:
-  .long  0xeeb00b44                          // vmov.f64      d0, d4
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xeeb01b45                          // vmov.f64      d1, d5
-  .long  0xeeb02b46                          // vmov.f64      d2, d6
-  .long  0xeeb03b47                          // vmov.f64      d3, d7
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_premul_vfp4
-_sk_premul_vfp4:
-  .long  0xf3000d13                          // vmul.f32      d0, d0, d3
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf3011d13                          // vmul.f32      d1, d1, d3
-  .long  0xf3022d13                          // vmul.f32      d2, d2, d3
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_unpremul_vfp4
-_sk_unpremul_vfp4:
-  .long  0xed2d8b04                          // vpush         {d8-d9}
-  .long  0xed928a00                          // vldr          s16, [r2]
-  .long  0xf2c00010                          // vmov.i32      d16, #0
-  .long  0xf3f91503                          // vceq.f32      d17, d3, #0
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xeec89a23                          // vdiv.f32      s19, s16, s7
-  .long  0xee889a03                          // vdiv.f32      s18, s16, s6
-  .long  0xf3501199                          // vbsl          d17, d16, d9
-  .long  0xf3010d90                          // vmul.f32      d0, d17, d0
-  .long  0xf3011d91                          // vmul.f32      d1, d17, d1
-  .long  0xf3012d92                          // vmul.f32      d2, d17, d2
-  .long  0xecbd8b04                          // vpop          {d8-d9}
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_from_srgb_vfp4
-_sk_from_srgb_vfp4:
-  .long  0xed2d8b02                          // vpush         {d8}
-  .long  0xe282303c                          // add           r3, r2, #60
-  .long  0xed928a10                          // vldr          s16, [r2, #64]
-  .long  0xf3402d10                          // vmul.f32      d18, d0, d0
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xe2823038                          // add           r3, r2, #56
-  .long  0xf3413d11                          // vmul.f32      d19, d1, d1
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe2823044                          // add           r3, r2, #68
-  .long  0xf26141b1                          // vorr          d20, d17, d17
-  .long  0xf26171b1                          // vorr          d23, d17, d17
-  .long  0xf4e38c9f                          // vld1.32       {d24[]}, [r3 :32]
-  .long  0xf2404c30                          // vfma.f32      d20, d0, d16
-  .long  0xe2823034                          // add           r3, r2, #52
-  .long  0xf2417c30                          // vfma.f32      d23, d1, d16
-  .long  0xf2421c30                          // vfma.f32      d17, d2, d16
-  .long  0xf3425d12                          // vmul.f32      d21, d2, d2
-  .long  0xf2e16948                          // vmul.f32      d22, d1, d8[0]
-  .long  0xf2e00948                          // vmul.f32      d16, d0, d8[0]
-  .long  0xf2e29948                          // vmul.f32      d25, d2, d8[0]
-  .long  0xf3282e82                          // vcgt.f32      d2, d24, d2
-  .long  0xf3281e81                          // vcgt.f32      d1, d24, d1
-  .long  0xf3280e80                          // vcgt.f32      d0, d24, d0
-  .long  0xf4e38c9f                          // vld1.32       {d24[]}, [r3 :32]
-  .long  0xf268a1b8                          // vorr          d26, d24, d24
-  .long  0xf242acb4                          // vfma.f32      d26, d18, d20
-  .long  0xf26821b8                          // vorr          d18, d24, d24
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf2432cb7                          // vfma.f32      d18, d19, d23
-  .long  0xf2458cb1                          // vfma.f32      d24, d21, d17
-  .long  0xf31001ba                          // vbsl          d0, d16, d26
-  .long  0xf31611b2                          // vbsl          d1, d22, d18
-  .long  0xf31921b8                          // vbsl          d2, d25, d24
-  .long  0xecbd8b02                          // vpop          {d8}
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_to_srgb_vfp4
-_sk_to_srgb_vfp4:
-  .long  0xed2d8b02                          // vpush         {d8}
-  .long  0xf3fb0580                          // vrsqrte.f32   d16, d0
-  .long  0xe2823050                          // add           r3, r2, #80
-  .long  0xf3fb1581                          // vrsqrte.f32   d17, d1
-  .long  0xed928a12                          // vldr          s16, [r2, #72]
-  .long  0xf3fb2582                          // vrsqrte.f32   d18, d2
-  .long  0xf3403db0                          // vmul.f32      d19, d16, d16
-  .long  0xf3414db1                          // vmul.f32      d20, d17, d17
-  .long  0xf3425db2                          // vmul.f32      d21, d18, d18
-  .long  0xf2603f33                          // vrsqrts.f32   d19, d0, d19
-  .long  0xf2614f34                          // vrsqrts.f32   d20, d1, d20
-  .long  0xf2625f35                          // vrsqrts.f32   d21, d2, d21
-  .long  0xf3400db3                          // vmul.f32      d16, d16, d19
-  .long  0xf3411db4                          // vmul.f32      d17, d17, d20
-  .long  0xf3422db5                          // vmul.f32      d18, d18, d21
-  .long  0xf3fb3520                          // vrecpe.f32    d19, d16
-  .long  0xf3fb4521                          // vrecpe.f32    d20, d17
-  .long  0xf3fb6522                          // vrecpe.f32    d22, d18
-  .long  0xf3fb55a2                          // vrsqrte.f32   d21, d18
-  .long  0xf3fb75a0                          // vrsqrte.f32   d23, d16
-  .long  0xf3fb85a1                          // vrsqrte.f32   d24, d17
-  .long  0xf2409fb3                          // vrecps.f32    d25, d16, d19
-  .long  0xf241afb4                          // vrecps.f32    d26, d17, d20
-  .long  0xf242bfb6                          // vrecps.f32    d27, d18, d22
-  .long  0xf345cdb5                          // vmul.f32      d28, d21, d21
-  .long  0xf347ddb7                          // vmul.f32      d29, d23, d23
-  .long  0xf348edb8                          // vmul.f32      d30, d24, d24
-  .long  0xf2622fbc                          // vrsqrts.f32   d18, d18, d28
-  .long  0xf2600fbd                          // vrsqrts.f32   d16, d16, d29
-  .long  0xf2611fbe                          // vrsqrts.f32   d17, d17, d30
-  .long  0xf3433db9                          // vmul.f32      d19, d19, d25
-  .long  0xf4e39c9f                          // vld1.32       {d25[]}, [r3 :32]
-  .long  0xe2823054                          // add           r3, r2, #84
-  .long  0xf3444dba                          // vmul.f32      d20, d20, d26
-  .long  0xf3466dbb                          // vmul.f32      d22, d22, d27
-  .long  0xf4e3ac9f                          // vld1.32       {d26[]}, [r3 :32]
-  .long  0xe282304c                          // add           r3, r2, #76
-  .long  0xf26ab1ba                          // vorr          d27, d26, d26
-  .long  0xf249bcb3                          // vfma.f32      d27, d25, d19
-  .long  0xf26a31ba                          // vorr          d19, d26, d26
-  .long  0xf2493cb4                          // vfma.f32      d19, d25, d20
-  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32]
-  .long  0xf249acb6                          // vfma.f32      d26, d25, d22
-  .long  0xe2823058                          // add           r3, r2, #88
-  .long  0xf3452db2                          // vmul.f32      d18, d21, d18
-  .long  0xf3470db0                          // vmul.f32      d16, d23, d16
-  .long  0xf3481db1                          // vmul.f32      d17, d24, d17
-  .long  0xf2e05948                          // vmul.f32      d21, d0, d8[0]
-  .long  0xf244bcb0                          // vfma.f32      d27, d20, d16
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xf2443cb1                          // vfma.f32      d19, d20, d17
-  .long  0xf244acb2                          // vfma.f32      d26, d20, d18
-  .long  0xf4e24c9f                          // vld1.32       {d20[]}, [r2 :32]
-  .long  0xf2e11948                          // vmul.f32      d17, d1, d8[0]
-  .long  0xf2e22948                          // vmul.f32      d18, d2, d8[0]
-  .long  0xf3201e81                          // vcgt.f32      d1, d16, d1
-  .long  0xe4913004                          // ldr           r3, [r1], #4
-  .long  0xf3200e80                          // vcgt.f32      d0, d16, d0
-  .long  0xf3202e82                          // vcgt.f32      d2, d16, d2
-  .long  0xf2640fab                          // vmin.f32      d16, d20, d27
-  .long  0xf2643fa3                          // vmin.f32      d19, d20, d19
-  .long  0xf2644faa                          // vmin.f32      d20, d20, d26
-  .long  0xf31501b0                          // vbsl          d0, d21, d16
-  .long  0xf31111b3                          // vbsl          d1, d17, d19
-  .long  0xf31221b4                          // vbsl          d2, d18, d20
-  .long  0xecbd8b02                          // vpop          {d8}
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_scale_1_float_vfp4
-_sk_scale_1_float_vfp4:
-  .long  0xed2d8b02                          // vpush         {d8}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xed938a00                          // vldr          s16, [r3]
-  .long  0xf2a00948                          // vmul.f32      d0, d0, d8[0]
-  .long  0xf2a11948                          // vmul.f32      d1, d1, d8[0]
-  .long  0xf2a22948                          // vmul.f32      d2, d2, d8[0]
-  .long  0xf2a33948                          // vmul.f32      d3, d3, d8[0]
-  .long  0xecbd8b02                          // vpop          {d8}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_scale_u8_vfp4
-_sk_scale_u8_vfp4:
-  .long  0xed2d8b02                          // vpush         {d8}
-  .long  0xe24dd008                          // sub           sp, sp, #8
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe0833000                          // add           r3, r3, r0
-  .long  0xe1d330b0                          // ldrh          r3, [r3]
-  .long  0xe1cd30b4                          // strh          r3, [sp, #4]
-  .long  0xe28d3004                          // add           r3, sp, #4
-  .long  0xed928a03                          // vldr          s16, [r2, #12]
-  .long  0xf4e3041f                          // vld1.16       {d16[0]}, [r3 :16]
-  .long  0xf3c80a30                          // vmovl.u8      q8, d16
-  .long  0xf3d00a30                          // vmovl.u16     q8, d16
-  .long  0xf3fb06a0                          // vcvt.f32.u32  d16, d16
-  .long  0xf2e009c8                          // vmul.f32      d16, d16, d8[0]
-  .long  0xf3000d90                          // vmul.f32      d0, d16, d0
-  .long  0xf3001d91                          // vmul.f32      d1, d16, d1
-  .long  0xf3002d92                          // vmul.f32      d2, d16, d2
-  .long  0xf3003d93                          // vmul.f32      d3, d16, d3
-  .long  0xe28dd008                          // add           sp, sp, #8
-  .long  0xecbd8b02                          // vpop          {d8}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_lerp_1_float_vfp4
-_sk_lerp_1_float_vfp4:
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2600d04                          // vsub.f32      d16, d0, d4
-  .long  0xf2611d05                          // vsub.f32      d17, d1, d5
-  .long  0xf2622d06                          // vsub.f32      d18, d2, d6
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xf2633d07                          // vsub.f32      d19, d3, d7
-  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32]
-  .long  0xf2240114                          // vorr          d0, d4, d4
-  .long  0xf2251115                          // vorr          d1, d5, d5
-  .long  0xf2262116                          // vorr          d2, d6, d6
-  .long  0xf2273117                          // vorr          d3, d7, d7
-  .long  0xf2000cb4                          // vfma.f32      d0, d16, d20
-  .long  0xf2011cb4                          // vfma.f32      d1, d17, d20
-  .long  0xf2022cb4                          // vfma.f32      d2, d18, d20
-  .long  0xf2033cb4                          // vfma.f32      d3, d19, d20
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_lerp_u8_vfp4
-_sk_lerp_u8_vfp4:
-  .long  0xed2d8b02                          // vpush         {d8}
-  .long  0xe24dd008                          // sub           sp, sp, #8
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2612d05                          // vsub.f32      d18, d1, d5
-  .long  0xf2623d06                          // vsub.f32      d19, d2, d6
-  .long  0xf2634d07                          // vsub.f32      d20, d3, d7
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xf2251115                          // vorr          d1, d5, d5
-  .long  0xf2262116                          // vorr          d2, d6, d6
-  .long  0xe0833000                          // add           r3, r3, r0
-  .long  0xf2273117                          // vorr          d3, d7, d7
-  .long  0xe1d330b0                          // ldrh          r3, [r3]
-  .long  0xe1cd30b4                          // strh          r3, [sp, #4]
-  .long  0xe28d3004                          // add           r3, sp, #4
-  .long  0xed928a03                          // vldr          s16, [r2, #12]
-  .long  0xf4e3041f                          // vld1.16       {d16[0]}, [r3 :16]
-  .long  0xf3c80a30                          // vmovl.u8      q8, d16
-  .long  0xf3d00a30                          // vmovl.u16     q8, d16
-  .long  0xf3fb06a0                          // vcvt.f32.u32  d16, d16
-  .long  0xf2601d04                          // vsub.f32      d17, d0, d4
-  .long  0xf2240114                          // vorr          d0, d4, d4
-  .long  0xf2e009c8                          // vmul.f32      d16, d16, d8[0]
-  .long  0xf2010cb0                          // vfma.f32      d0, d17, d16
-  .long  0xf2021cb0                          // vfma.f32      d1, d18, d16
-  .long  0xf2032cb0                          // vfma.f32      d2, d19, d16
-  .long  0xf2043cb0                          // vfma.f32      d3, d20, d16
-  .long  0xe28dd008                          // add           sp, sp, #8
-  .long  0xecbd8b02                          // vpop          {d8}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_lerp_565_vfp4
-_sk_lerp_565_vfp4:
-  .long  0xed2d8b04                          // vpush         {d8-d9}
-  .long  0xe24dd008                          // sub           sp, sp, #8
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2603d04                          // vsub.f32      d19, d0, d4
-  .long  0xf2240114                          // vorr          d0, d4, d4
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe7933080                          // ldr           r3, [r3, r0, lsl #1]
-  .long  0xe58d3004                          // str           r3, [sp, #4]
-  .long  0xe28d3004                          // add           r3, sp, #4
-  .long  0xed923a1d                          // vldr          s6, [r2, #116]
-  .long  0xf4e3083f                          // vld1.32       {d16[0]}, [r3 :32]
-  .long  0xe282306c                          // add           r3, r2, #108
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe2823068                          // add           r3, r2, #104
-  .long  0xf3d04a30                          // vmovl.u16     q10, d16
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xe2823070                          // add           r3, r2, #112
-  .long  0xf24201b4                          // vand          d16, d18, d20
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xf24221b4                          // vand          d18, d18, d20
-  .long  0xf24111b4                          // vand          d17, d17, d20
-  .long  0xf3fb0620                          // vcvt.f32.s32  d16, d16
-  .long  0xed928a1e                          // vldr          s16, [r2, #120]
-  .long  0xf3fb1621                          // vcvt.f32.s32  d17, d17
-  .long  0xed929a1f                          // vldr          s18, [r2, #124]
-  .long  0xf3fb2622                          // vcvt.f32.s32  d18, d18
-  .long  0xf2614d05                          // vsub.f32      d20, d1, d5
-  .long  0xf2e009c3                          // vmul.f32      d16, d16, d3[0]
-  .long  0xf4a23c9f                          // vld1.32       {d3[]}, [r2 :32]
-  .long  0xf2625d06                          // vsub.f32      d21, d2, d6
-  .long  0xf2e119c8                          // vmul.f32      d17, d17, d8[0]
-  .long  0xf2e229c9                          // vmul.f32      d18, d18, d9[0]
-  .long  0xf2251115                          // vorr          d1, d5, d5
-  .long  0xf2262116                          // vorr          d2, d6, d6
-  .long  0xf2030cb0                          // vfma.f32      d0, d19, d16
-  .long  0xf2041cb1                          // vfma.f32      d1, d20, d17
-  .long  0xf2052cb2                          // vfma.f32      d2, d21, d18
-  .long  0xe28dd008                          // add           sp, sp, #8
-  .long  0xecbd8b04                          // vpop          {d8-d9}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_load_tables_vfp4
-_sk_load_tables_vfp4:
-  .long  0xe92d48f0                          // push          {r4, r5, r6, r7, fp, lr}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2826010                          // add           r6, r2, #16
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe593e000                          // ldr           lr, [r3]
-  .long  0xe99300b0                          // ldmib         r3, {r4, r5, r7}
-  .long  0xf4e60c9f                          // vld1.32       {d16[]}, [r6 :32]
-  .long  0xe08e6100                          // add           r6, lr, r0, lsl #2
-  .long  0xedd61b00                          // vldr          d17, [r6]
-  .long  0xf24021b1                          // vand          d18, d16, d17
-  .long  0xed922a03                          // vldr          s4, [r2, #12]
-  .long  0xf3f03031                          // vshr.u32      d19, d17, #16
-  .long  0xee326b90                          // vmov.32       r6, d18[1]
-  .long  0xe0846106                          // add           r6, r4, r6, lsl #2
-  .long  0xedd60a00                          // vldr          s1, [r6]
-  .long  0xee126b90                          // vmov.32       r6, d18[0]
-  .long  0xf3f82031                          // vshr.u32      d18, d17, #8
-  .long  0xf24021b2                          // vand          d18, d16, d18
-  .long  0xf24001b3                          // vand          d16, d16, d19
-  .long  0xee103b90                          // vmov.32       r3, d16[0]
-  .long  0xe0846106                          // add           r6, r4, r6, lsl #2
-  .long  0xee304b90                          // vmov.32       r4, d16[1]
-  .long  0xf3e80031                          // vshr.u32      d16, d17, #24
-  .long  0xed960a00                          // vldr          s0, [r6]
-  .long  0xee326b90                          // vmov.32       r6, d18[1]
-  .long  0xf3fb0620                          // vcvt.f32.s32  d16, d16
-  .long  0xe0873103                          // add           r3, r7, r3, lsl #2
-  .long  0xf2a039c2                          // vmul.f32      d3, d16, d2[0]
-  .long  0xe0874104                          // add           r4, r7, r4, lsl #2
-  .long  0xedd42a00                          // vldr          s5, [r4]
-  .long  0xe0856106                          // add           r6, r5, r6, lsl #2
-  .long  0xed932a00                          // vldr          s4, [r3]
-  .long  0xedd61a00                          // vldr          s3, [r6]
-  .long  0xee126b90                          // vmov.32       r6, d18[0]
-  .long  0xe0856106                          // add           r6, r5, r6, lsl #2
-  .long  0xed961a00                          // vldr          s2, [r6]
-  .long  0xe8bd48f0                          // pop           {r4, r5, r6, r7, fp, lr}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_load_a8_vfp4
-_sk_load_a8_vfp4:
-  .long  0xe24dd004                          // sub           sp, sp, #4
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xf2801010                          // vmov.i32      d1, #0
-  .long  0xf2802010                          // vmov.i32      d2, #0
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe0833000                          // add           r3, r3, r0
-  .long  0xe1d330b0                          // ldrh          r3, [r3]
-  .long  0xe1cd30b0                          // strh          r3, [sp]
-  .long  0xe1a0300d                          // mov           r3, sp
-  .long  0xf4e3041f                          // vld1.16       {d16[0]}, [r3 :16]
-  .long  0xed920a03                          // vldr          s0, [r2, #12]
-  .long  0xf3c80a30                          // vmovl.u8      q8, d16
-  .long  0xf3d00a30                          // vmovl.u16     q8, d16
-  .long  0xf3fb06a0                          // vcvt.f32.u32  d16, d16
-  .long  0xf2a039c0                          // vmul.f32      d3, d16, d0[0]
-  .long  0xf2800010                          // vmov.i32      d0, #0
-  .long  0xe28dd004                          // add           sp, sp, #4
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_store_a8_vfp4
-_sk_store_a8_vfp4:
-  .long  0xe92d4800                          // push          {fp, lr}
-  .long  0xe2823008                          // add           r3, r2, #8
-  .long  0xf2c3061f                          // vmov.i32      d16, #1056964608
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe5913000                          // ldr           r3, [r1]
-  .long  0xf2430c31                          // vfma.f32      d16, d3, d17
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xf3fb07a0                          // vcvt.u32.f32  d16, d16
-  .long  0xee10eb90                          // vmov.32       lr, d16[0]
-  .long  0xee30cb90                          // vmov.32       ip, d16[1]
-  .long  0xe7e3e000                          // strb          lr, [r3, r0]!
-  .long  0xe5c3c001                          // strb          ip, [r3, #1]
-  .long  0xe5913004                          // ldr           r3, [r1, #4]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe8bd4800                          // pop           {fp, lr}
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_load_565_vfp4
-_sk_load_565_vfp4:
-  .long  0xe24dd004                          // sub           sp, sp, #4
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe7933080                          // ldr           r3, [r3, r0, lsl #1]
-  .long  0xe58d3000                          // str           r3, [sp]
-  .long  0xe1a0300d                          // mov           r3, sp
-  .long  0xf4e3083f                          // vld1.32       {d16[0]}, [r3 :32]
-  .long  0xe282306c                          // add           r3, r2, #108
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe2823068                          // add           r3, r2, #104
-  .long  0xf3d04a30                          // vmovl.u16     q10, d16
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xe2823070                          // add           r3, r2, #112
-  .long  0xf24201b4                          // vand          d16, d18, d20
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xf24111b4                          // vand          d17, d17, d20
-  .long  0xf24221b4                          // vand          d18, d18, d20
-  .long  0xf4a23c9f                          // vld1.32       {d3[]}, [r2 :32]
-  .long  0xf3fb0620                          // vcvt.f32.s32  d16, d16
-  .long  0xf3fb1621                          // vcvt.f32.s32  d17, d17
-  .long  0xf3fb2622                          // vcvt.f32.s32  d18, d18
-  .long  0xed920a1d                          // vldr          s0, [r2, #116]
-  .long  0xed921a1e                          // vldr          s2, [r2, #120]
-  .long  0xed922a1f                          // vldr          s4, [r2, #124]
-  .long  0xf2a009c0                          // vmul.f32      d0, d16, d0[0]
-  .long  0xf2a119c1                          // vmul.f32      d1, d17, d1[0]
-  .long  0xf2a229c2                          // vmul.f32      d2, d18, d2[0]
-  .long  0xe28dd004                          // add           sp, sp, #4
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_store_565_vfp4
-_sk_store_565_vfp4:
-  .long  0xe2823080                          // add           r3, r2, #128
-  .long  0xf2c3361f                          // vmov.i32      d19, #1056964608
-  .long  0xf2c3461f                          // vmov.i32      d20, #1056964608
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe2823084                          // add           r3, r2, #132
-  .long  0xf2403c31                          // vfma.f32      d19, d0, d17
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xf2c3061f                          // vmov.i32      d16, #1056964608
-  .long  0xf2414c32                          // vfma.f32      d20, d1, d18
-  .long  0xf2420c31                          // vfma.f32      d16, d2, d17
-  .long  0xe5913000                          // ldr           r3, [r1]
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xf3fb17a3                          // vcvt.u32.f32  d17, d19
-  .long  0xe0833080                          // add           r3, r3, r0, lsl #1
-  .long  0xf3fb27a4                          // vcvt.u32.f32  d18, d20
-  .long  0xf3fb07a0                          // vcvt.u32.f32  d16, d16
-  .long  0xf2eb1531                          // vshl.s32      d17, d17, #11
-  .long  0xf2e52532                          // vshl.s32      d18, d18, #5
-  .long  0xf26101b0                          // vorr          d16, d17, d16
-  .long  0xf26001b2                          // vorr          d16, d16, d18
-  .long  0xf3f60121                          // vuzp.16       d16, d17
-  .long  0xf4c3080f                          // vst1.32       {d16[0]}, [r3]
-  .long  0xe5913004                          // ldr           r3, [r1, #4]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_load_8888_vfp4
-_sk_load_8888_vfp4:
-  .long  0xe92d4800                          // push          {fp, lr}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xed922a03                          // vldr          s4, [r2, #12]
-  .long  0xe593e000                          // ldr           lr, [r3]
-  .long  0xe2823010                          // add           r3, r2, #16
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xe08e3100                          // add           r3, lr, r0, lsl #2
-  .long  0xedd31b00                          // vldr          d17, [r3]
-  .long  0xf24021b1                          // vand          d18, d16, d17
-  .long  0xf3f83031                          // vshr.u32      d19, d17, #8
-  .long  0xf3e84031                          // vshr.u32      d20, d17, #24
-  .long  0xf3f01031                          // vshr.u32      d17, d17, #16
-  .long  0xf24031b3                          // vand          d19, d16, d19
-  .long  0xf24001b1                          // vand          d16, d16, d17
-  .long  0xf3fb2622                          // vcvt.f32.s32  d18, d18
-  .long  0xf3fb4624                          // vcvt.f32.s32  d20, d20
-  .long  0xf3fb1623                          // vcvt.f32.s32  d17, d19
-  .long  0xf3fb0620                          // vcvt.f32.s32  d16, d16
-  .long  0xf2a209c2                          // vmul.f32      d0, d18, d2[0]
-  .long  0xf2a439c2                          // vmul.f32      d3, d20, d2[0]
-  .long  0xf2a119c2                          // vmul.f32      d1, d17, d2[0]
-  .long  0xf2a029c2                          // vmul.f32      d2, d16, d2[0]
-  .long  0xe8bd4800                          // pop           {fp, lr}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_store_8888_vfp4
-_sk_store_8888_vfp4:
-  .long  0xe2823008                          // add           r3, r2, #8
-  .long  0xf2c3261f                          // vmov.i32      d18, #1056964608
-  .long  0xf2c3361f                          // vmov.i32      d19, #1056964608
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xf2c3061f                          // vmov.i32      d16, #1056964608
-  .long  0xf2412c31                          // vfma.f32      d18, d1, d17
-  .long  0xf2423c31                          // vfma.f32      d19, d2, d17
-  .long  0xf2c3461f                          // vmov.i32      d20, #1056964608
-  .long  0xe5913000                          // ldr           r3, [r1]
-  .long  0xf2400c31                          // vfma.f32      d16, d0, d17
-  .long  0xf2434c31                          // vfma.f32      d20, d3, d17
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe0833100                          // add           r3, r3, r0, lsl #2
-  .long  0xf3fb17a2                          // vcvt.u32.f32  d17, d18
-  .long  0xf3fb27a3                          // vcvt.u32.f32  d18, d19
-  .long  0xf3fb07a0                          // vcvt.u32.f32  d16, d16
-  .long  0xf3fb37a4                          // vcvt.u32.f32  d19, d20
-  .long  0xf2e81531                          // vshl.s32      d17, d17, #8
-  .long  0xf2f02532                          // vshl.s32      d18, d18, #16
-  .long  0xf26101b0                          // vorr          d16, d17, d16
-  .long  0xf2f81533                          // vshl.s32      d17, d19, #24
-  .long  0xf26001b2                          // vorr          d16, d16, d18
-  .long  0xf26001b1                          // vorr          d16, d16, d17
-  .long  0xedc30b00                          // vstr          d16, [r3]
-  .long  0xe5913004                          // ldr           r3, [r1, #4]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe12fff13                          // bx            r3
-
-.globl _sk_load_f16_vfp4
-_sk_load_f16_vfp4:
-  .long  0xed2d8b04                          // vpush         {d8-d9}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe0833180                          // add           r3, r3, r0, lsl #3
-  .long  0xf463084f                          // vld2.16       {d16-d17}, [r3]
-  .long  0xf3b62720                          // vcvt.f32.f16  q1, d16
-  .long  0xf3b68721                          // vcvt.f32.f16  q4, d17
-  .long  0xf2220112                          // vorr          d0, d2, d2
-  .long  0xeef00a43                          // vmov.f32      s1, s6
-  .long  0xf2281118                          // vorr          d1, d8, d8
-  .long  0xeeb03a62                          // vmov.f32      s6, s5
-  .long  0xeef01a49                          // vmov.f32      s3, s18
-  .long  0xeeb09a68                          // vmov.f32      s18, s17
-  .long  0xeeb02b43                          // vmov.f64      d2, d3
-  .long  0xeeb03b49                          // vmov.f64      d3, d9
-  .long  0xecbd8b04                          // vpop          {d8-d9}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_store_f16_vfp4
-_sk_store_f16_vfp4:
-  .long  0xeef00b41                          // vmov.f64      d16, d1
-  .long  0xeef03b42                          // vmov.f64      d19, d2
-  .long  0xf2631113                          // vorr          d17, d3, d3
-  .long  0xf2602110                          // vorr          d18, d0, d0
-  .long  0xf3fa00a1                          // vtrn.32       d16, d17
-  .long  0xf3f61620                          // vcvt.f16.f32  d17, q8
-  .long  0xf3fa20a3                          // vtrn.32       d18, d19
-  .long  0xe5913000                          // ldr           r3, [r1]
-  .long  0xf3f60622                          // vcvt.f16.f32  d16, q9
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe0833180                          // add           r3, r3, r0, lsl #3
-  .long  0xf443084f                          // vst2.16       {d16-d17}, [r3]
-  .long  0xe2813008                          // add           r3, r1, #8
-  .long  0xe591c004                          // ldr           ip, [r1, #4]
-  .long  0xe1a01003                          // mov           r1, r3
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_store_f32_vfp4
-_sk_store_f32_vfp4:
-  .long  0xe5913000                          // ldr           r3, [r1]
-  .long  0xe5933000                          // ldr           r3, [r3]
-  .long  0xe0833200                          // add           r3, r3, r0, lsl #4
-  .long  0xf403008f                          // vst4.32       {d0-d3}, [r3]
-  .long  0xe2813008                          // add           r3, r1, #8
-  .long  0xe591c004                          // ldr           ip, [r1, #4]
-  .long  0xe1a01003                          // mov           r1, r3
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_clamp_x_vfp4
-_sk_clamp_x_vfp4:
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2c00010                          // vmov.i32      d16, #0
-  .long  0xf3c71e1f                          // vmov.i8       d17, #255
-  .long  0xf2400f80                          // vmax.f32      d16, d16, d0
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xf26218a1                          // vadd.i32      d17, d18, d17
-  .long  0xf2200fa1                          // vmin.f32      d0, d16, d17
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_clamp_y_vfp4
-_sk_clamp_y_vfp4:
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2c00010                          // vmov.i32      d16, #0
-  .long  0xf3c71e1f                          // vmov.i8       d17, #255
-  .long  0xf2400f81                          // vmax.f32      d16, d16, d1
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xf26218a1                          // vadd.i32      d17, d18, d17
-  .long  0xf2201fa1                          // vmin.f32      d1, d16, d17
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_repeat_x_vfp4
-_sk_repeat_x_vfp4:
-  .long  0xed2d8b04                          // vpush         {d8-d9}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2c02010                          // vmov.i32      d18, #0
-  .long  0xf4e23c9f                          // vld1.32       {d19[]}, [r2 :32]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xed938a00                          // vldr          s16, [r3]
-  .long  0xeec09a88                          // vdiv.f32      s19, s1, s16
-  .long  0xee809a08                          // vdiv.f32      s18, s0, s16
-  .long  0xf3fb0709                          // vcvt.s32.f32  d16, d9
-  .long  0xf3fb0620                          // vcvt.f32.s32  d16, d16
-  .long  0xf3601e89                          // vcgt.f32      d17, d16, d9
-  .long  0xf35311b2                          // vbsl          d17, d19, d18
-  .long  0xf3f42c08                          // vdup.32       d18, d8[0]
-  .long  0xf2600da1                          // vsub.f32      d16, d16, d17
-  .long  0xf3c71e1f                          // vmov.i8       d17, #255
-  .long  0xf26218a1                          // vadd.i32      d17, d18, d17
-  .long  0xf2e009c8                          // vmul.f32      d16, d16, d8[0]
-  .long  0xf2600d20                          // vsub.f32      d16, d0, d16
-  .long  0xf2200fa1                          // vmin.f32      d0, d16, d17
-  .long  0xecbd8b04                          // vpop          {d8-d9}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_repeat_y_vfp4
-_sk_repeat_y_vfp4:
-  .long  0xed2d8b04                          // vpush         {d8-d9}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2c02010                          // vmov.i32      d18, #0
-  .long  0xf4e23c9f                          // vld1.32       {d19[]}, [r2 :32]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xed938a00                          // vldr          s16, [r3]
-  .long  0xeec19a88                          // vdiv.f32      s19, s3, s16
-  .long  0xee819a08                          // vdiv.f32      s18, s2, s16
-  .long  0xf3fb0709                          // vcvt.s32.f32  d16, d9
-  .long  0xf3fb0620                          // vcvt.f32.s32  d16, d16
-  .long  0xf3601e89                          // vcgt.f32      d17, d16, d9
-  .long  0xf35311b2                          // vbsl          d17, d19, d18
-  .long  0xf3f42c08                          // vdup.32       d18, d8[0]
-  .long  0xf2600da1                          // vsub.f32      d16, d16, d17
-  .long  0xf3c71e1f                          // vmov.i8       d17, #255
-  .long  0xf26218a1                          // vadd.i32      d17, d18, d17
-  .long  0xf2e009c8                          // vmul.f32      d16, d16, d8[0]
-  .long  0xf2610d20                          // vsub.f32      d16, d1, d16
-  .long  0xf2201fa1                          // vmin.f32      d1, d16, d17
-  .long  0xecbd8b04                          // vpop          {d8-d9}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_mirror_x_vfp4
-_sk_mirror_x_vfp4:
-  .long  0xed2d8b04                          // vpush         {d8-d9}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2c03010                          // vmov.i32      d19, #0
-  .long  0xf4e24c9f                          // vld1.32       {d20[]}, [r2 :32]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xed938a00                          // vldr          s16, [r3]
-  .long  0xee389a08                          // vadd.f32      s18, s16, s16
-  .long  0xf3f40c08                          // vdup.32       d16, d8[0]
-  .long  0xf2200d20                          // vsub.f32      d0, d0, d16
-  .long  0xeec08a89                          // vdiv.f32      s17, s1, s18
-  .long  0xee808a09                          // vdiv.f32      s16, s0, s18
-  .long  0xf3fb1708                          // vcvt.s32.f32  d17, d8
-  .long  0xf3fb1621                          // vcvt.f32.s32  d17, d17
-  .long  0xf3612e88                          // vcgt.f32      d18, d17, d8
-  .long  0xf35421b3                          // vbsl          d18, d20, d19
-  .long  0xf2611da2                          // vsub.f32      d17, d17, d18
-  .long  0xf3c72e1f                          // vmov.i8       d18, #255
-  .long  0xf2e119c9                          // vmul.f32      d17, d17, d9[0]
-  .long  0xf2601d21                          // vsub.f32      d17, d0, d17
-  .long  0xf2611da0                          // vsub.f32      d17, d17, d16
-  .long  0xf26008a2                          // vadd.i32      d16, d16, d18
-  .long  0xf3f91721                          // vabs.f32      d17, d17
-  .long  0xf2210fa0                          // vmin.f32      d0, d17, d16
-  .long  0xecbd8b04                          // vpop          {d8-d9}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_mirror_y_vfp4
-_sk_mirror_y_vfp4:
-  .long  0xed2d8b04                          // vpush         {d8-d9}
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xf2c03010                          // vmov.i32      d19, #0
-  .long  0xf4e24c9f                          // vld1.32       {d20[]}, [r2 :32]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xed938a00                          // vldr          s16, [r3]
-  .long  0xee389a08                          // vadd.f32      s18, s16, s16
-  .long  0xf3f40c08                          // vdup.32       d16, d8[0]
-  .long  0xf2211d20                          // vsub.f32      d1, d1, d16
-  .long  0xeec18a89                          // vdiv.f32      s17, s3, s18
-  .long  0xee818a09                          // vdiv.f32      s16, s2, s18
-  .long  0xf3fb1708                          // vcvt.s32.f32  d17, d8
-  .long  0xf3fb1621                          // vcvt.f32.s32  d17, d17
-  .long  0xf3612e88                          // vcgt.f32      d18, d17, d8
-  .long  0xf35421b3                          // vbsl          d18, d20, d19
-  .long  0xf2611da2                          // vsub.f32      d17, d17, d18
-  .long  0xf3c72e1f                          // vmov.i8       d18, #255
-  .long  0xf2e119c9                          // vmul.f32      d17, d17, d9[0]
-  .long  0xf2611d21                          // vsub.f32      d17, d1, d17
-  .long  0xf2611da0                          // vsub.f32      d17, d17, d16
-  .long  0xf26008a2                          // vadd.i32      d16, d16, d18
-  .long  0xf3f91721                          // vabs.f32      d17, d17
-  .long  0xf2211fa0                          // vmin.f32      d1, d17, d16
-  .long  0xecbd8b04                          // vpop          {d8-d9}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_matrix_2x3_vfp4
-_sk_matrix_2x3_vfp4:
-  .long  0xe92d4800                          // push          {fp, lr}
-  .long  0xe591e000                          // ldr           lr, [r1]
-  .long  0xe591c004                          // ldr           ip, [r1, #4]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe28e300c                          // add           r3, lr, #12
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xe28e3008                          // add           r3, lr, #8
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe28e3010                          // add           r3, lr, #16
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xe28e3014                          // add           r3, lr, #20
-  .long  0xf2410c31                          // vfma.f32      d16, d1, d17
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe28e3004                          // add           r3, lr, #4
-  .long  0xf2411c32                          // vfma.f32      d17, d1, d18
-  .long  0xf4ee2c9f                          // vld1.32       {d18[]}, [lr :32]
-  .long  0xf4e33c9f                          // vld1.32       {d19[]}, [r3 :32]
-  .long  0xf2400c32                          // vfma.f32      d16, d0, d18
-  .long  0xf2401c33                          // vfma.f32      d17, d0, d19
-  .long  0xf22001b0                          // vorr          d0, d16, d16
-  .long  0xf22111b1                          // vorr          d1, d17, d17
-  .long  0xe8bd4800                          // pop           {fp, lr}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_matrix_3x4_vfp4
-_sk_matrix_3x4_vfp4:
-  .long  0xe92d4800                          // push          {fp, lr}
-  .long  0xe591e000                          // ldr           lr, [r1]
-  .long  0xe591c004                          // ldr           ip, [r1, #4]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe28e3020                          // add           r3, lr, #32
-  .long  0xf4e33c9f                          // vld1.32       {d19[]}, [r3 :32]
-  .long  0xe28e302c                          // add           r3, lr, #44
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xe28e301c                          // add           r3, lr, #28
-  .long  0xf2420c33                          // vfma.f32      d16, d2, d19
-  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32]
-  .long  0xe28e3018                          // add           r3, lr, #24
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xe28e3024                          // add           r3, lr, #36
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe28e3028                          // add           r3, lr, #40
-  .long  0xf2421c32                          // vfma.f32      d17, d2, d18
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xe28e3010                          // add           r3, lr, #16
-  .long  0xf2422c34                          // vfma.f32      d18, d2, d20
-  .long  0xf4e33c9f                          // vld1.32       {d19[]}, [r3 :32]
-  .long  0xe28e300c                          // add           r3, lr, #12
-  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32]
-  .long  0xe28e3014                          // add           r3, lr, #20
-  .long  0xf2411c34                          // vfma.f32      d17, d1, d20
-  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32]
-  .long  0xf2410c34                          // vfma.f32      d16, d1, d20
-  .long  0xe28e3004                          // add           r3, lr, #4
-  .long  0xf2412c33                          // vfma.f32      d18, d1, d19
-  .long  0xf4ee3c9f                          // vld1.32       {d19[]}, [lr :32]
-  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32]
-  .long  0xe28e3008                          // add           r3, lr, #8
-  .long  0xf2401c33                          // vfma.f32      d17, d0, d19
-  .long  0xf4e33c9f                          // vld1.32       {d19[]}, [r3 :32]
-  .long  0xf2400c33                          // vfma.f32      d16, d0, d19
-  .long  0xf2402c34                          // vfma.f32      d18, d0, d20
-  .long  0xf22101b1                          // vorr          d0, d17, d17
-  .long  0xf22021b0                          // vorr          d2, d16, d16
-  .long  0xf22211b2                          // vorr          d1, d18, d18
-  .long  0xe8bd4800                          // pop           {fp, lr}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_matrix_perspective_vfp4
-_sk_matrix_perspective_vfp4:
-  .long  0xe92d4800                          // push          {fp, lr}
-  .long  0xe591e000                          // ldr           lr, [r1]
-  .long  0xe591c004                          // ldr           ip, [r1, #4]
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xe28e301c                          // add           r3, lr, #28
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xe28e3020                          // add           r3, lr, #32
-  .long  0xf4e31c9f                          // vld1.32       {d17[]}, [r3 :32]
-  .long  0xe28e3018                          // add           r3, lr, #24
-  .long  0xf2411c30                          // vfma.f32      d17, d1, d16
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xe28e3010                          // add           r3, lr, #16
-  .long  0xf2401c30                          // vfma.f32      d17, d0, d16
-  .long  0xf4e30c9f                          // vld1.32       {d16[]}, [r3 :32]
-  .long  0xe28e3004                          // add           r3, lr, #4
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xe28e3008                          // add           r3, lr, #8
-  .long  0xf4e34c9f                          // vld1.32       {d20[]}, [r3 :32]
-  .long  0xe28e3014                          // add           r3, lr, #20
-  .long  0xf2414c32                          // vfma.f32      d20, d1, d18
-  .long  0xf4e32c9f                          // vld1.32       {d18[]}, [r3 :32]
-  .long  0xe28e300c                          // add           r3, lr, #12
-  .long  0xf3fb3521                          // vrecpe.f32    d19, d17
-  .long  0xf2412c30                          // vfma.f32      d18, d1, d16
-  .long  0xf4e35c9f                          // vld1.32       {d21[]}, [r3 :32]
-  .long  0xf2410fb3                          // vrecps.f32    d16, d17, d19
-  .long  0xf4ee1c9f                          // vld1.32       {d17[]}, [lr :32]
-  .long  0xf2404c31                          // vfma.f32      d20, d0, d17
-  .long  0xf2402c35                          // vfma.f32      d18, d0, d21
-  .long  0xf3430db0                          // vmul.f32      d16, d19, d16
-  .long  0xf3040db0                          // vmul.f32      d0, d20, d16
-  .long  0xf3021db0                          // vmul.f32      d1, d18, d16
-  .long  0xe8bd4800                          // pop           {fp, lr}
-  .long  0xe12fff1c                          // bx            ip
-
-.globl _sk_linear_gradient_2stops_vfp4
-_sk_linear_gradient_2stops_vfp4:
-  .long  0xe8911008                          // ldm           r1, {r3, ip}
-  .long  0xe2811008                          // add           r1, r1, #8
-  .long  0xf4632a0d                          // vld1.8        {d18-d19}, [r3]!
-  .long  0xf4634a0f                          // vld1.8        {d20-d21}, [r3]
-  .long  0xf3f40c22                          // vdup.32       d16, d18[0]
-  .long  0xf3f41c24                          // vdup.32       d17, d20[0]
-  .long  0xf2400c31                          // vfma.f32      d16, d0, d17
-  .long  0xf3fc6c24                          // vdup.32       d22, d20[1]
-  .long  0xf3bc1c22                          // vdup.32       d1, d18[1]
-  .long  0xf3b42c23                          // vdup.32       d2, d19[0]
-  .long  0xf2001c36                          // vfma.f32      d1, d0, d22
-  .long  0xf3f41c25                          // vdup.32       d17, d21[0]
-  .long  0xf3fc4c25                          // vdup.32       d20, d21[1]
-  .long  0xf2002c31                          // vfma.f32      d2, d0, d17
-  .long  0xf3bc3c23                          // vdup.32       d3, d19[1]
-  .long  0xf2003c34                          // vfma.f32      d3, d0, d20
-  .long  0xf22001b0                          // vorr          d0, d16, d16
-  .long  0xe12fff1c                          // bx            ip
-#elif defined(__x86_64__)
-
-.globl _sk_start_pipeline_hsw
-_sk_start_pipeline_hsw:
-  .byte  65,87                               // push          %r15
-  .byte  65,86                               // push          %r14
-  .byte  65,85                               // push          %r13
-  .byte  65,84                               // push          %r12
-  .byte  83                                  // push          %rbx
-  .byte  73,137,205                          // mov           %rcx,%r13
-  .byte  73,137,214                          // mov           %rdx,%r14
-  .byte  72,137,251                          // mov           %rdi,%rbx
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  73,137,199                          // mov           %rax,%r15
-  .byte  73,137,244                          // mov           %rsi,%r12
-  .byte  72,141,67,8                         // lea           0x8(%rbx),%rax
-  .byte  76,57,232                           // cmp           %r13,%rax
-  .byte  118,5                               // jbe           28 <_sk_start_pipeline_hsw+0x28>
-  .byte  72,137,223                          // mov           %rbx,%rdi
-  .byte  235,65                              // jmp           69 <_sk_start_pipeline_hsw+0x69>
-  .byte  185,0,0,0,0                         // mov           $0x0,%ecx
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  197,220,87,228                      // vxorps        %ymm4,%ymm4,%ymm4
-  .byte  197,212,87,237                      // vxorps        %ymm5,%ymm5,%ymm5
-  .byte  197,204,87,246                      // vxorps        %ymm6,%ymm6,%ymm6
-  .byte  197,196,87,255                      // vxorps        %ymm7,%ymm7,%ymm7
-  .byte  72,137,223                          // mov           %rbx,%rdi
-  .byte  76,137,230                          // mov           %r12,%rsi
-  .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  65,255,215                          // callq         *%r15
-  .byte  72,141,123,8                        // lea           0x8(%rbx),%rdi
-  .byte  72,131,195,16                       // add           $0x10,%rbx
-  .byte  76,57,235                           // cmp           %r13,%rbx
-  .byte  72,137,251                          // mov           %rdi,%rbx
-  .byte  118,191                             // jbe           28 <_sk_start_pipeline_hsw+0x28>
-  .byte  76,137,233                          // mov           %r13,%rcx
-  .byte  72,41,249                           // sub           %rdi,%rcx
-  .byte  116,41                              // je            9a <_sk_start_pipeline_hsw+0x9a>
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  197,220,87,228                      // vxorps        %ymm4,%ymm4,%ymm4
-  .byte  197,212,87,237                      // vxorps        %ymm5,%ymm5,%ymm5
-  .byte  197,204,87,246                      // vxorps        %ymm6,%ymm6,%ymm6
-  .byte  197,196,87,255                      // vxorps        %ymm7,%ymm7,%ymm7
-  .byte  76,137,230                          // mov           %r12,%rsi
-  .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  65,255,215                          // callq         *%r15
-  .byte  76,137,232                          // mov           %r13,%rax
-  .byte  91                                  // pop           %rbx
-  .byte  65,92                               // pop           %r12
-  .byte  65,93                               // pop           %r13
-  .byte  65,94                               // pop           %r14
-  .byte  65,95                               // pop           %r15
-  .byte  197,248,119                         // vzeroupper
-  .byte  195                                 // retq
-
-.globl _sk_just_return_hsw
-_sk_just_return_hsw:
-  .byte  195                                 // retq
-
-.globl _sk_seed_shader_hsw
-_sk_seed_shader_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,249,110,199                     // vmovd         %edi,%xmm0
-  .byte  196,226,125,24,192                  // vbroadcastss  %xmm0,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,226,125,24,74,4                 // vbroadcastss  0x4(%rdx),%ymm1
-  .byte  197,252,88,193                      // vaddps        %ymm1,%ymm0,%ymm0
-  .byte  197,252,88,66,20                    // vaddps        0x14(%rdx),%ymm0,%ymm0
-  .byte  196,226,125,24,16                   // vbroadcastss  (%rax),%ymm2
-  .byte  197,252,91,210                      // vcvtdq2ps     %ymm2,%ymm2
-  .byte  197,236,88,201                      // vaddps        %ymm1,%ymm2,%ymm1
-  .byte  196,226,125,24,18                   // vbroadcastss  (%rdx),%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  197,220,87,228                      // vxorps        %ymm4,%ymm4,%ymm4
-  .byte  197,212,87,237                      // vxorps        %ymm5,%ymm5,%ymm5
-  .byte  197,204,87,246                      // vxorps        %ymm6,%ymm6,%ymm6
-  .byte  197,196,87,255                      // vxorps        %ymm7,%ymm7,%ymm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_constant_color_hsw
-_sk_constant_color_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,226,125,24,0                    // vbroadcastss  (%rax),%ymm0
-  .byte  196,226,125,24,72,4                 // vbroadcastss  0x4(%rax),%ymm1
-  .byte  196,226,125,24,80,8                 // vbroadcastss  0x8(%rax),%ymm2
-  .byte  196,226,125,24,88,12                // vbroadcastss  0xc(%rax),%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clear_hsw
-_sk_clear_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_plus__hsw
-_sk_plus__hsw:
-  .byte  197,252,88,196                      // vaddps        %ymm4,%ymm0,%ymm0
-  .byte  197,244,88,205                      // vaddps        %ymm5,%ymm1,%ymm1
-  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
-  .byte  197,228,88,223                      // vaddps        %ymm7,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_srcover_hsw
-_sk_srcover_hsw:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  197,60,92,195                       // vsubps        %ymm3,%ymm8,%ymm8
-  .byte  196,194,93,184,192                  // vfmadd231ps   %ymm8,%ymm4,%ymm0
-  .byte  196,194,85,184,200                  // vfmadd231ps   %ymm8,%ymm5,%ymm1
-  .byte  196,194,77,184,208                  // vfmadd231ps   %ymm8,%ymm6,%ymm2
-  .byte  196,194,69,184,216                  // vfmadd231ps   %ymm8,%ymm7,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_dstover_hsw
-_sk_dstover_hsw:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  197,60,92,199                       // vsubps        %ymm7,%ymm8,%ymm8
-  .byte  196,226,61,168,196                  // vfmadd213ps   %ymm4,%ymm8,%ymm0
-  .byte  196,226,61,168,205                  // vfmadd213ps   %ymm5,%ymm8,%ymm1
-  .byte  196,226,61,168,214                  // vfmadd213ps   %ymm6,%ymm8,%ymm2
-  .byte  196,226,61,168,223                  // vfmadd213ps   %ymm7,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_0_hsw
-_sk_clamp_0_hsw:
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,193,124,95,192                  // vmaxps        %ymm8,%ymm0,%ymm0
-  .byte  196,193,116,95,200                  // vmaxps        %ymm8,%ymm1,%ymm1
-  .byte  196,193,108,95,208                  // vmaxps        %ymm8,%ymm2,%ymm2
-  .byte  196,193,100,95,216                  // vmaxps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_1_hsw
-_sk_clamp_1_hsw:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  196,193,124,93,192                  // vminps        %ymm8,%ymm0,%ymm0
-  .byte  196,193,116,93,200                  // vminps        %ymm8,%ymm1,%ymm1
-  .byte  196,193,108,93,208                  // vminps        %ymm8,%ymm2,%ymm2
-  .byte  196,193,100,93,216                  // vminps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_a_hsw
-_sk_clamp_a_hsw:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  196,193,100,93,216                  // vminps        %ymm8,%ymm3,%ymm3
-  .byte  197,252,93,195                      // vminps        %ymm3,%ymm0,%ymm0
-  .byte  197,244,93,203                      // vminps        %ymm3,%ymm1,%ymm1
-  .byte  197,236,93,211                      // vminps        %ymm3,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_set_rgb_hsw
-_sk_set_rgb_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,226,125,24,0                    // vbroadcastss  (%rax),%ymm0
-  .byte  196,226,125,24,72,4                 // vbroadcastss  0x4(%rax),%ymm1
-  .byte  196,226,125,24,80,8                 // vbroadcastss  0x8(%rax),%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_rb_hsw
-_sk_swap_rb_hsw:
-  .byte  197,124,40,192                      // vmovaps       %ymm0,%ymm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,194                      // vmovaps       %ymm2,%ymm0
-  .byte  197,124,41,194                      // vmovaps       %ymm8,%ymm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_hsw
-_sk_swap_hsw:
-  .byte  197,124,40,195                      // vmovaps       %ymm3,%ymm8
-  .byte  197,124,40,202                      // vmovaps       %ymm2,%ymm9
-  .byte  197,124,40,209                      // vmovaps       %ymm1,%ymm10
-  .byte  197,124,40,216                      // vmovaps       %ymm0,%ymm11
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,196                      // vmovaps       %ymm4,%ymm0
-  .byte  197,252,40,205                      // vmovaps       %ymm5,%ymm1
-  .byte  197,252,40,214                      // vmovaps       %ymm6,%ymm2
-  .byte  197,252,40,223                      // vmovaps       %ymm7,%ymm3
-  .byte  197,124,41,220                      // vmovaps       %ymm11,%ymm4
-  .byte  197,124,41,213                      // vmovaps       %ymm10,%ymm5
-  .byte  197,124,41,206                      // vmovaps       %ymm9,%ymm6
-  .byte  197,124,41,199                      // vmovaps       %ymm8,%ymm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_src_dst_hsw
-_sk_move_src_dst_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,224                      // vmovaps       %ymm0,%ymm4
-  .byte  197,252,40,233                      // vmovaps       %ymm1,%ymm5
-  .byte  197,252,40,242                      // vmovaps       %ymm2,%ymm6
-  .byte  197,252,40,251                      // vmovaps       %ymm3,%ymm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_dst_src_hsw
-_sk_move_dst_src_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,196                      // vmovaps       %ymm4,%ymm0
-  .byte  197,252,40,205                      // vmovaps       %ymm5,%ymm1
-  .byte  197,252,40,214                      // vmovaps       %ymm6,%ymm2
-  .byte  197,252,40,223                      // vmovaps       %ymm7,%ymm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_premul_hsw
-_sk_premul_hsw:
-  .byte  197,252,89,195                      // vmulps        %ymm3,%ymm0,%ymm0
-  .byte  197,244,89,203                      // vmulps        %ymm3,%ymm1,%ymm1
-  .byte  197,236,89,211                      // vmulps        %ymm3,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_unpremul_hsw
-_sk_unpremul_hsw:
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,65,100,194,200,0                // vcmpeqps      %ymm8,%ymm3,%ymm9
-  .byte  196,98,125,24,18                    // vbroadcastss  (%rdx),%ymm10
-  .byte  197,44,94,211                       // vdivps        %ymm3,%ymm10,%ymm10
-  .byte  196,67,45,74,192,144                // vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_from_srgb_hsw
-_sk_from_srgb_hsw:
-  .byte  196,98,125,24,66,64                 // vbroadcastss  0x40(%rdx),%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  197,124,89,208                      // vmulps        %ymm0,%ymm0,%ymm10
-  .byte  196,98,125,24,90,60                 // vbroadcastss  0x3c(%rdx),%ymm11
-  .byte  196,98,125,24,98,56                 // vbroadcastss  0x38(%rdx),%ymm12
-  .byte  196,65,124,40,235                   // vmovaps       %ymm11,%ymm13
-  .byte  196,66,125,168,236                  // vfmadd213ps   %ymm12,%ymm0,%ymm13
-  .byte  196,98,125,24,114,52                // vbroadcastss  0x34(%rdx),%ymm14
-  .byte  196,66,45,168,238                   // vfmadd213ps   %ymm14,%ymm10,%ymm13
-  .byte  196,98,125,24,82,68                 // vbroadcastss  0x44(%rdx),%ymm10
-  .byte  196,193,124,194,194,1               // vcmpltps      %ymm10,%ymm0,%ymm0
-  .byte  196,195,21,74,193,0                 // vblendvps     %ymm0,%ymm9,%ymm13,%ymm0
-  .byte  197,60,89,201                       // vmulps        %ymm1,%ymm8,%ymm9
-  .byte  197,116,89,233                      // vmulps        %ymm1,%ymm1,%ymm13
-  .byte  196,65,124,40,251                   // vmovaps       %ymm11,%ymm15
-  .byte  196,66,117,168,252                  // vfmadd213ps   %ymm12,%ymm1,%ymm15
-  .byte  196,66,21,168,254                   // vfmadd213ps   %ymm14,%ymm13,%ymm15
-  .byte  196,193,116,194,202,1               // vcmpltps      %ymm10,%ymm1,%ymm1
-  .byte  196,195,5,74,201,16                 // vblendvps     %ymm1,%ymm9,%ymm15,%ymm1
-  .byte  197,60,89,194                       // vmulps        %ymm2,%ymm8,%ymm8
-  .byte  197,108,89,202                      // vmulps        %ymm2,%ymm2,%ymm9
-  .byte  196,66,109,168,220                  // vfmadd213ps   %ymm12,%ymm2,%ymm11
-  .byte  196,66,53,168,222                   // vfmadd213ps   %ymm14,%ymm9,%ymm11
-  .byte  196,193,108,194,210,1               // vcmpltps      %ymm10,%ymm2,%ymm2
-  .byte  196,195,37,74,208,32                // vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_to_srgb_hsw
-_sk_to_srgb_hsw:
-  .byte  197,124,82,192                      // vrsqrtps      %ymm0,%ymm8
-  .byte  196,65,124,83,200                   // vrcpps        %ymm8,%ymm9
-  .byte  196,65,124,82,208                   // vrsqrtps      %ymm8,%ymm10
-  .byte  196,98,125,24,66,72                 // vbroadcastss  0x48(%rdx),%ymm8
-  .byte  197,60,89,216                       // vmulps        %ymm0,%ymm8,%ymm11
-  .byte  196,98,125,24,34                    // vbroadcastss  (%rdx),%ymm12
-  .byte  196,98,125,24,106,76                // vbroadcastss  0x4c(%rdx),%ymm13
-  .byte  196,98,125,24,114,80                // vbroadcastss  0x50(%rdx),%ymm14
-  .byte  196,98,125,24,122,84                // vbroadcastss  0x54(%rdx),%ymm15
-  .byte  196,66,13,168,207                   // vfmadd213ps   %ymm15,%ymm14,%ymm9
-  .byte  196,66,21,184,202                   // vfmadd231ps   %ymm10,%ymm13,%ymm9
-  .byte  196,65,28,93,201                    // vminps        %ymm9,%ymm12,%ymm9
-  .byte  196,98,125,24,82,88                 // vbroadcastss  0x58(%rdx),%ymm10
-  .byte  196,193,124,194,194,1               // vcmpltps      %ymm10,%ymm0,%ymm0
-  .byte  196,195,53,74,195,0                 // vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
-  .byte  197,124,82,201                      // vrsqrtps      %ymm1,%ymm9
-  .byte  196,65,124,83,217                   // vrcpps        %ymm9,%ymm11
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,66,13,168,223                   // vfmadd213ps   %ymm15,%ymm14,%ymm11
-  .byte  196,66,21,184,217                   // vfmadd231ps   %ymm9,%ymm13,%ymm11
-  .byte  197,60,89,201                       // vmulps        %ymm1,%ymm8,%ymm9
-  .byte  196,65,28,93,219                    // vminps        %ymm11,%ymm12,%ymm11
-  .byte  196,193,116,194,202,1               // vcmpltps      %ymm10,%ymm1,%ymm1
-  .byte  196,195,37,74,201,16                // vblendvps     %ymm1,%ymm9,%ymm11,%ymm1
-  .byte  197,124,82,202                      // vrsqrtps      %ymm2,%ymm9
-  .byte  196,65,124,83,217                   // vrcpps        %ymm9,%ymm11
-  .byte  196,66,13,168,223                   // vfmadd213ps   %ymm15,%ymm14,%ymm11
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,66,21,184,217                   // vfmadd231ps   %ymm9,%ymm13,%ymm11
-  .byte  196,65,28,93,203                    // vminps        %ymm11,%ymm12,%ymm9
-  .byte  197,60,89,194                       // vmulps        %ymm2,%ymm8,%ymm8
-  .byte  196,193,108,194,210,1               // vcmpltps      %ymm10,%ymm2,%ymm2
-  .byte  196,195,53,74,208,32                // vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_1_float_hsw
-_sk_scale_1_float_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  197,188,89,219                      // vmulps        %ymm3,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_u8_hsw
-_sk_scale_u8_hsw:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,1,248                            // add           %rdi,%rax
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,48                              // jne           41a <_sk_scale_u8_hsw+0x40>
-  .byte  197,123,16,0                        // vmovsd        (%rax),%xmm8
-  .byte  196,66,125,49,192                   // vpmovzxbd     %xmm8,%ymm8
-  .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
-  .byte  196,98,125,24,74,12                 // vbroadcastss  0xc(%rdx),%ymm9
-  .byte  196,65,60,89,193                    // vmulps        %ymm9,%ymm8,%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  197,188,89,219                      // vmulps        %ymm3,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  49,201                              // xor           %ecx,%ecx
-  .byte  77,137,194                          // mov           %r8,%r10
-  .byte  69,49,201                           // xor           %r9d,%r9d
-  .byte  68,15,182,24                        // movzbl        (%rax),%r11d
-  .byte  72,255,192                          // inc           %rax
-  .byte  73,211,227                          // shl           %cl,%r11
-  .byte  77,9,217                            // or            %r11,%r9
-  .byte  72,131,193,8                        // add           $0x8,%rcx
-  .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           422 <_sk_scale_u8_hsw+0x48>
-  .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  235,175                             // jmp           3ee <_sk_scale_u8_hsw+0x14>
-
-.globl _sk_lerp_1_float_hsw
-_sk_lerp_1_float_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  197,252,92,196                      // vsubps        %ymm4,%ymm0,%ymm0
-  .byte  196,226,61,168,196                  // vfmadd213ps   %ymm4,%ymm8,%ymm0
-  .byte  197,244,92,205                      // vsubps        %ymm5,%ymm1,%ymm1
-  .byte  196,226,61,168,205                  // vfmadd213ps   %ymm5,%ymm8,%ymm1
-  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
-  .byte  196,226,61,168,214                  // vfmadd213ps   %ymm6,%ymm8,%ymm2
-  .byte  197,228,92,223                      // vsubps        %ymm7,%ymm3,%ymm3
-  .byte  196,226,61,168,223                  // vfmadd213ps   %ymm7,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_u8_hsw
-_sk_lerp_u8_hsw:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,1,248                            // add           %rdi,%rax
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,68                              // jne           4c2 <_sk_lerp_u8_hsw+0x54>
-  .byte  197,123,16,0                        // vmovsd        (%rax),%xmm8
-  .byte  196,66,125,49,192                   // vpmovzxbd     %xmm8,%ymm8
-  .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
-  .byte  196,98,125,24,74,12                 // vbroadcastss  0xc(%rdx),%ymm9
-  .byte  196,65,60,89,193                    // vmulps        %ymm9,%ymm8,%ymm8
-  .byte  197,252,92,196                      // vsubps        %ymm4,%ymm0,%ymm0
-  .byte  196,226,61,168,196                  // vfmadd213ps   %ymm4,%ymm8,%ymm0
-  .byte  197,244,92,205                      // vsubps        %ymm5,%ymm1,%ymm1
-  .byte  196,226,61,168,205                  // vfmadd213ps   %ymm5,%ymm8,%ymm1
-  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
-  .byte  196,226,61,168,214                  // vfmadd213ps   %ymm6,%ymm8,%ymm2
-  .byte  197,228,92,223                      // vsubps        %ymm7,%ymm3,%ymm3
-  .byte  196,226,61,168,223                  // vfmadd213ps   %ymm7,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  49,201                              // xor           %ecx,%ecx
-  .byte  77,137,194                          // mov           %r8,%r10
-  .byte  69,49,201                           // xor           %r9d,%r9d
-  .byte  68,15,182,24                        // movzbl        (%rax),%r11d
-  .byte  72,255,192                          // inc           %rax
-  .byte  73,211,227                          // shl           %cl,%r11
-  .byte  77,9,217                            // or            %r11,%r9
-  .byte  72,131,193,8                        // add           $0x8,%rcx
-  .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           4ca <_sk_lerp_u8_hsw+0x5c>
-  .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  235,155                             // jmp           482 <_sk_lerp_u8_hsw+0x14>
-
-.globl _sk_lerp_565_hsw
-_sk_lerp_565_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,16                           // mov           (%rax),%r10
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,123                             // jne           56c <_sk_lerp_565_hsw+0x85>
-  .byte  196,193,122,111,28,122              // vmovdqu       (%r10,%rdi,2),%xmm3
-  .byte  196,226,125,51,219                  // vpmovzxwd     %xmm3,%ymm3
-  .byte  196,98,125,88,66,104                // vpbroadcastd  0x68(%rdx),%ymm8
-  .byte  197,61,219,195                      // vpand         %ymm3,%ymm8,%ymm8
-  .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
-  .byte  196,98,125,24,74,116                // vbroadcastss  0x74(%rdx),%ymm9
-  .byte  196,65,52,89,192                    // vmulps        %ymm8,%ymm9,%ymm8
-  .byte  196,98,125,88,74,108                // vpbroadcastd  0x6c(%rdx),%ymm9
-  .byte  197,53,219,203                      // vpand         %ymm3,%ymm9,%ymm9
-  .byte  196,65,124,91,201                   // vcvtdq2ps     %ymm9,%ymm9
-  .byte  196,98,125,24,82,120                // vbroadcastss  0x78(%rdx),%ymm10
-  .byte  196,65,44,89,201                    // vmulps        %ymm9,%ymm10,%ymm9
-  .byte  196,98,125,88,82,112                // vpbroadcastd  0x70(%rdx),%ymm10
-  .byte  197,173,219,219                     // vpand         %ymm3,%ymm10,%ymm3
-  .byte  197,252,91,219                      // vcvtdq2ps     %ymm3,%ymm3
-  .byte  196,98,125,24,82,124                // vbroadcastss  0x7c(%rdx),%ymm10
-  .byte  197,172,89,219                      // vmulps        %ymm3,%ymm10,%ymm3
-  .byte  197,252,92,196                      // vsubps        %ymm4,%ymm0,%ymm0
-  .byte  196,226,61,168,196                  // vfmadd213ps   %ymm4,%ymm8,%ymm0
-  .byte  197,244,92,205                      // vsubps        %ymm5,%ymm1,%ymm1
-  .byte  196,226,53,168,205                  // vfmadd213ps   %ymm5,%ymm9,%ymm1
-  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
-  .byte  196,226,101,168,214                 // vfmadd213ps   %ymm6,%ymm3,%ymm2
-  .byte  196,226,125,24,26                   // vbroadcastss  (%rdx),%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  65,137,200                          // mov           %ecx,%r8d
-  .byte  65,128,224,7                        // and           $0x7,%r8b
-  .byte  197,225,239,219                     // vpxor         %xmm3,%xmm3,%xmm3
-  .byte  65,254,200                          // dec           %r8b
-  .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,111,255,255,255              // ja            4f7 <_sk_lerp_565_hsw+0x10>
-  .byte  76,141,13,73,0,0,0                  // lea           0x49(%rip),%r9        # 5d8 <_sk_lerp_565_hsw+0xf1>
-  .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
-  .byte  76,1,200                            // add           %r9,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,225,239,219                     // vpxor         %xmm3,%xmm3,%xmm3
-  .byte  196,193,97,196,92,122,12,6          // vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  196,193,97,196,92,122,10,5          // vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  196,193,97,196,92,122,8,4           // vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  196,193,97,196,92,122,6,3           // vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  196,193,97,196,92,122,4,2           // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  196,193,97,196,92,122,2,1           // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  196,193,97,196,28,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
-  .byte  233,31,255,255,255                  // jmpq          4f7 <_sk_lerp_565_hsw+0x10>
-  .byte  244                                 // hlt
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  236                                 // in            (%dx),%al
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,228                             // jmpq          *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  220,255                             // fdivr         %st,%st(7)
-  .byte  255                                 // (bad)
-  .byte  255,212                             // callq         *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,204                             // dec           %esp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,192                             // inc           %eax
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_tables_hsw
-_sk_load_tables_hsw:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
-  .byte  76,3,8                              // add           (%rax),%r9
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,106                             // jne           673 <_sk_load_tables_hsw+0x7f>
-  .byte  196,193,126,111,25                  // vmovdqu       (%r9),%ymm3
-  .byte  196,226,125,88,82,16                // vpbroadcastd  0x10(%rdx),%ymm2
-  .byte  197,237,219,203                     // vpand         %ymm3,%ymm2,%ymm1
-  .byte  196,65,61,118,192                   // vpcmpeqd      %ymm8,%ymm8,%ymm8
-  .byte  72,139,72,8                         // mov           0x8(%rax),%rcx
-  .byte  76,139,72,16                        // mov           0x10(%rax),%r9
-  .byte  196,65,53,118,201                   // vpcmpeqd      %ymm9,%ymm9,%ymm9
-  .byte  196,226,53,146,4,137                // vgatherdps    %ymm9,(%rcx,%ymm1,4),%ymm0
-  .byte  197,245,114,211,8                   // vpsrld        $0x8,%ymm3,%ymm1
-  .byte  197,109,219,201                     // vpand         %ymm1,%ymm2,%ymm9
-  .byte  196,65,45,118,210                   // vpcmpeqd      %ymm10,%ymm10,%ymm10
-  .byte  196,130,45,146,12,137               // vgatherdps    %ymm10,(%r9,%ymm9,4),%ymm1
-  .byte  72,139,64,24                        // mov           0x18(%rax),%rax
-  .byte  197,181,114,211,16                  // vpsrld        $0x10,%ymm3,%ymm9
-  .byte  196,65,109,219,201                  // vpand         %ymm9,%ymm2,%ymm9
-  .byte  196,162,61,146,20,136               // vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
-  .byte  197,229,114,211,24                  // vpsrld        $0x18,%ymm3,%ymm3
-  .byte  197,252,91,219                      // vcvtdq2ps     %ymm3,%ymm3
-  .byte  196,98,125,24,66,12                 // vbroadcastss  0xc(%rdx),%ymm8
-  .byte  196,193,100,89,216                  // vmulps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  185,8,0,0,0                         // mov           $0x8,%ecx
-  .byte  68,41,193                           // sub           %r8d,%ecx
-  .byte  192,225,3                           // shl           $0x3,%cl
-  .byte  73,199,194,255,255,255,255          // mov           $0xffffffffffffffff,%r10
-  .byte  73,211,234                          // shr           %cl,%r10
-  .byte  196,193,249,110,194                 // vmovq         %r10,%xmm0
-  .byte  196,226,125,33,192                  // vpmovsxbd     %xmm0,%ymm0
-  .byte  196,194,125,140,25                  // vpmaskmovd    (%r9),%ymm0,%ymm3
-  .byte  233,114,255,255,255                 // jmpq          60e <_sk_load_tables_hsw+0x1a>
-
-.globl _sk_load_a8_hsw
-_sk_load_a8_hsw:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,1,248                            // add           %rdi,%rax
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,42                              // jne           6d6 <_sk_load_a8_hsw+0x3a>
-  .byte  197,251,16,0                        // vmovsd        (%rax),%xmm0
-  .byte  196,226,125,49,192                  // vpmovzxbd     %xmm0,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,226,125,24,74,12                // vbroadcastss  0xc(%rdx),%ymm1
-  .byte  197,252,89,217                      // vmulps        %ymm1,%ymm0,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  49,201                              // xor           %ecx,%ecx
-  .byte  77,137,194                          // mov           %r8,%r10
-  .byte  69,49,201                           // xor           %r9d,%r9d
-  .byte  68,15,182,24                        // movzbl        (%rax),%r11d
-  .byte  72,255,192                          // inc           %rax
-  .byte  73,211,227                          // shl           %cl,%r11
-  .byte  77,9,217                            // or            %r11,%r9
-  .byte  72,131,193,8                        // add           $0x8,%rcx
-  .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           6de <_sk_load_a8_hsw+0x42>
-  .byte  196,193,249,110,193                 // vmovq         %r9,%xmm0
-  .byte  235,181                             // jmp           6b0 <_sk_load_a8_hsw+0x14>
-
-.globl _sk_store_a8_hsw
-_sk_store_a8_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,8                            // mov           (%rax),%r9
-  .byte  196,98,125,24,66,8                  // vbroadcastss  0x8(%rdx),%ymm8
-  .byte  197,60,89,195                       // vmulps        %ymm3,%ymm8,%ymm8
-  .byte  196,65,125,91,192                   // vcvtps2dq     %ymm8,%ymm8
-  .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
-  .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
-  .byte  196,65,57,103,192                   // vpackuswb     %xmm8,%xmm8,%xmm8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           72e <_sk_store_a8_hsw+0x33>
-  .byte  196,65,123,17,4,57                  // vmovsd        %xmm8,(%r9,%rdi,1)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  137,200                             // mov           %ecx,%eax
-  .byte  36,7                                // and           $0x7,%al
-  .byte  254,200                             // dec           %al
-  .byte  68,15,182,192                       // movzbl        %al,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            72a <_sk_store_a8_hsw+0x2f>
-  .byte  196,66,121,48,192                   // vpmovzxbw     %xmm8,%xmm8
-  .byte  76,141,21,66,0,0,0                  // lea           0x42(%rip),%r10        # 78c <_sk_store_a8_hsw+0x91>
-  .byte  75,99,4,130                         // movslq        (%r10,%r8,4),%rax
-  .byte  76,1,208                            // add           %r10,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,67,121,20,68,57,6,12            // vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,5,10            // vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,4,8             // vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,3,6             // vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,2,4             // vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,1,2             // vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
-  .byte  196,67,121,20,4,57,0                // vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  .byte  235,158                             // jmp           72a <_sk_store_a8_hsw+0x2f>
-  .byte  247,255                             // idiv          %edi
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  239                                 // out           %eax,(%dx)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,231                             // jmpq          *%rdi
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  223,255                             // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,215                             // callq         *%rdi
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,207                             // dec           %edi
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,199                             // inc           %edi
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_565_hsw
-_sk_load_565_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,16                           // mov           (%rax),%r10
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,92                              // jne           80e <_sk_load_565_hsw+0x66>
-  .byte  196,193,122,111,4,122               // vmovdqu       (%r10,%rdi,2),%xmm0
-  .byte  196,226,125,51,208                  // vpmovzxwd     %xmm0,%ymm2
-  .byte  196,226,125,88,66,104               // vpbroadcastd  0x68(%rdx),%ymm0
-  .byte  197,253,219,194                     // vpand         %ymm2,%ymm0,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,226,125,24,74,116               // vbroadcastss  0x74(%rdx),%ymm1
-  .byte  197,244,89,192                      // vmulps        %ymm0,%ymm1,%ymm0
-  .byte  196,226,125,88,74,108               // vpbroadcastd  0x6c(%rdx),%ymm1
-  .byte  197,245,219,202                     // vpand         %ymm2,%ymm1,%ymm1
-  .byte  197,252,91,201                      // vcvtdq2ps     %ymm1,%ymm1
-  .byte  196,226,125,24,90,120               // vbroadcastss  0x78(%rdx),%ymm3
-  .byte  197,228,89,201                      // vmulps        %ymm1,%ymm3,%ymm1
-  .byte  196,226,125,88,90,112               // vpbroadcastd  0x70(%rdx),%ymm3
-  .byte  197,229,219,210                     // vpand         %ymm2,%ymm3,%ymm2
-  .byte  197,252,91,210                      // vcvtdq2ps     %ymm2,%ymm2
-  .byte  196,226,125,24,90,124               // vbroadcastss  0x7c(%rdx),%ymm3
-  .byte  197,228,89,210                      // vmulps        %ymm2,%ymm3,%ymm2
-  .byte  196,226,125,24,26                   // vbroadcastss  (%rdx),%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  65,137,200                          // mov           %ecx,%r8d
-  .byte  65,128,224,7                        // and           $0x7,%r8b
-  .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
-  .byte  65,254,200                          // dec           %r8b
-  .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,146                             // ja            7b8 <_sk_load_565_hsw+0x10>
-  .byte  76,141,13,75,0,0,0                  // lea           0x4b(%rip),%r9        # 878 <_sk_load_565_hsw+0xd0>
-  .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
-  .byte  76,1,200                            // add           %r9,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,12,6         // vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,10,5         // vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,8,4          // vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,6,3          // vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,4,2          // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,2,1          // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,4,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  233,66,255,255,255                  // jmpq          7b8 <_sk_load_565_hsw+0x10>
-  .byte  102,144                             // xchg          %ax,%ax
-  .byte  242,255                             // repnz         (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  234                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,226                             // jmpq          *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  218,255                             // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,210                             // callq         *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,202                             // dec           %edx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  190                                 // .byte         0xbe
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_store_565_hsw
-_sk_store_565_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,8                            // mov           (%rax),%r9
-  .byte  196,98,125,24,130,128,0,0,0         // vbroadcastss  0x80(%rdx),%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  196,65,125,91,201                   // vcvtps2dq     %ymm9,%ymm9
-  .byte  196,193,53,114,241,11               // vpslld        $0xb,%ymm9,%ymm9
-  .byte  196,98,125,24,146,132,0,0,0         // vbroadcastss  0x84(%rdx),%ymm10
-  .byte  197,44,89,209                       // vmulps        %ymm1,%ymm10,%ymm10
-  .byte  196,65,125,91,210                   // vcvtps2dq     %ymm10,%ymm10
-  .byte  196,193,45,114,242,5                // vpslld        $0x5,%ymm10,%ymm10
-  .byte  196,65,45,235,201                   // vpor          %ymm9,%ymm10,%ymm9
-  .byte  197,60,89,194                       // vmulps        %ymm2,%ymm8,%ymm8
-  .byte  196,65,125,91,192                   // vcvtps2dq     %ymm8,%ymm8
-  .byte  196,65,53,235,192                   // vpor          %ymm8,%ymm9,%ymm8
-  .byte  196,67,125,57,193,1                 // vextracti128  $0x1,%ymm8,%xmm9
-  .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           8f6 <_sk_store_565_hsw+0x62>
-  .byte  196,65,122,127,4,121                // vmovdqu       %xmm8,(%r9,%rdi,2)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  137,200                             // mov           %ecx,%eax
-  .byte  36,7                                // and           $0x7,%al
-  .byte  254,200                             // dec           %al
-  .byte  68,15,182,192                       // movzbl        %al,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            8f2 <_sk_store_565_hsw+0x5e>
-  .byte  76,141,21,71,0,0,0                  // lea           0x47(%rip),%r10        # 954 <_sk_store_565_hsw+0xc0>
-  .byte  75,99,4,130                         // movslq        (%r10,%r8,4),%rax
-  .byte  76,1,208                            // add           %r10,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,67,121,21,68,121,12,6           // vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,10,5           // vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,8,4            // vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,6,3            // vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,4,2            // vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,2,1            // vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
-  .byte  197,121,126,192                     // vmovd         %xmm8,%eax
-  .byte  102,65,137,4,121                    // mov           %ax,(%r9,%rdi,2)
-  .byte  235,161                             // jmp           8f2 <_sk_store_565_hsw+0x5e>
-  .byte  15,31,0                             // nopl          (%rax)
-  .byte  242,255                             // repnz         (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  234                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,226                             // jmpq          *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  218,255                             // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,210                             // callq         *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,202                             // dec           %edx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,194                             // inc           %edx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_8888_hsw
-_sk_load_8888_hsw:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
-  .byte  76,3,8                              // add           (%rax),%r9
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,85                              // jne           9da <_sk_load_8888_hsw+0x6a>
-  .byte  196,193,126,111,25                  // vmovdqu       (%r9),%ymm3
-  .byte  196,226,125,88,82,16                // vpbroadcastd  0x10(%rdx),%ymm2
-  .byte  197,237,219,195                     // vpand         %ymm3,%ymm2,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,98,125,24,66,12                 // vbroadcastss  0xc(%rdx),%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,245,114,211,8                   // vpsrld        $0x8,%ymm3,%ymm1
-  .byte  197,237,219,201                     // vpand         %ymm1,%ymm2,%ymm1
-  .byte  197,252,91,201                      // vcvtdq2ps     %ymm1,%ymm1
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,181,114,211,16                  // vpsrld        $0x10,%ymm3,%ymm9
-  .byte  196,193,109,219,209                 // vpand         %ymm9,%ymm2,%ymm2
-  .byte  197,252,91,210                      // vcvtdq2ps     %ymm2,%ymm2
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  197,229,114,211,24                  // vpsrld        $0x18,%ymm3,%ymm3
-  .byte  197,252,91,219                      // vcvtdq2ps     %ymm3,%ymm3
-  .byte  196,193,100,89,216                  // vmulps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  185,8,0,0,0                         // mov           $0x8,%ecx
-  .byte  68,41,193                           // sub           %r8d,%ecx
-  .byte  192,225,3                           // shl           $0x3,%cl
-  .byte  72,199,192,255,255,255,255          // mov           $0xffffffffffffffff,%rax
-  .byte  72,211,232                          // shr           %cl,%rax
-  .byte  196,225,249,110,192                 // vmovq         %rax,%xmm0
-  .byte  196,226,125,33,192                  // vpmovsxbd     %xmm0,%ymm0
-  .byte  196,194,125,140,25                  // vpmaskmovd    (%r9),%ymm0,%ymm3
-  .byte  235,138                             // jmp           98a <_sk_load_8888_hsw+0x1a>
-
-.globl _sk_store_8888_hsw
-_sk_store_8888_hsw:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,141,12,189,0,0,0,0               // lea           0x0(,%rdi,4),%r9
-  .byte  76,3,8                              // add           (%rax),%r9
-  .byte  196,98,125,24,66,8                  // vbroadcastss  0x8(%rdx),%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  196,65,125,91,201                   // vcvtps2dq     %ymm9,%ymm9
-  .byte  197,60,89,209                       // vmulps        %ymm1,%ymm8,%ymm10
-  .byte  196,65,125,91,210                   // vcvtps2dq     %ymm10,%ymm10
-  .byte  196,193,45,114,242,8                // vpslld        $0x8,%ymm10,%ymm10
-  .byte  196,65,45,235,201                   // vpor          %ymm9,%ymm10,%ymm9
-  .byte  197,60,89,210                       // vmulps        %ymm2,%ymm8,%ymm10
-  .byte  196,65,125,91,210                   // vcvtps2dq     %ymm10,%ymm10
-  .byte  196,193,45,114,242,16               // vpslld        $0x10,%ymm10,%ymm10
-  .byte  197,60,89,195                       // vmulps        %ymm3,%ymm8,%ymm8
-  .byte  196,65,125,91,192                   // vcvtps2dq     %ymm8,%ymm8
-  .byte  196,193,61,114,240,24               // vpslld        $0x18,%ymm8,%ymm8
-  .byte  196,65,45,235,192                   // vpor          %ymm8,%ymm10,%ymm8
-  .byte  196,65,53,235,192                   // vpor          %ymm8,%ymm9,%ymm8
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,12                              // jne           a6c <_sk_store_8888_hsw+0x6c>
-  .byte  196,65,126,127,1                    // vmovdqu       %ymm8,(%r9)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  185,8,0,0,0                         // mov           $0x8,%ecx
-  .byte  68,41,193                           // sub           %r8d,%ecx
-  .byte  192,225,3                           // shl           $0x3,%cl
-  .byte  72,199,192,255,255,255,255          // mov           $0xffffffffffffffff,%rax
-  .byte  72,211,232                          // shr           %cl,%rax
-  .byte  196,97,249,110,200                  // vmovq         %rax,%xmm9
-  .byte  196,66,125,33,201                   // vpmovsxbd     %xmm9,%ymm9
-  .byte  196,66,53,142,1                     // vpmaskmovd    %ymm8,%ymm9,(%r9)
-  .byte  235,211                             // jmp           a65 <_sk_store_8888_hsw+0x65>
-
-.globl _sk_load_f16_hsw
-_sk_load_f16_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,97                              // jne           afd <_sk_load_f16_hsw+0x6b>
-  .byte  197,249,16,12,248                   // vmovupd       (%rax,%rdi,8),%xmm1
-  .byte  197,249,16,84,248,16                // vmovupd       0x10(%rax,%rdi,8),%xmm2
-  .byte  197,249,16,92,248,32                // vmovupd       0x20(%rax,%rdi,8),%xmm3
-  .byte  197,121,16,68,248,48                // vmovupd       0x30(%rax,%rdi,8),%xmm8
-  .byte  197,241,97,194                      // vpunpcklwd    %xmm2,%xmm1,%xmm0
-  .byte  197,241,105,202                     // vpunpckhwd    %xmm2,%xmm1,%xmm1
-  .byte  196,193,97,97,208                   // vpunpcklwd    %xmm8,%xmm3,%xmm2
-  .byte  196,193,97,105,216                  // vpunpckhwd    %xmm8,%xmm3,%xmm3
-  .byte  197,121,97,193                      // vpunpcklwd    %xmm1,%xmm0,%xmm8
-  .byte  197,121,105,201                     // vpunpckhwd    %xmm1,%xmm0,%xmm9
-  .byte  197,233,97,203                      // vpunpcklwd    %xmm3,%xmm2,%xmm1
-  .byte  197,233,105,219                     // vpunpckhwd    %xmm3,%xmm2,%xmm3
-  .byte  197,185,108,193                     // vpunpcklqdq   %xmm1,%xmm8,%xmm0
-  .byte  196,226,125,19,192                  // vcvtph2ps     %xmm0,%ymm0
-  .byte  197,185,109,201                     // vpunpckhqdq   %xmm1,%xmm8,%xmm1
-  .byte  196,226,125,19,201                  // vcvtph2ps     %xmm1,%ymm1
-  .byte  197,177,108,211                     // vpunpcklqdq   %xmm3,%xmm9,%xmm2
-  .byte  196,226,125,19,210                  // vcvtph2ps     %xmm2,%ymm2
-  .byte  197,177,109,219                     // vpunpckhqdq   %xmm3,%xmm9,%xmm3
-  .byte  196,226,125,19,219                  // vcvtph2ps     %xmm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,251,16,12,248                   // vmovsd        (%rax,%rdi,8),%xmm1
-  .byte  196,65,57,87,192                    // vxorpd        %xmm8,%xmm8,%xmm8
-  .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  117,6                               // jne           b13 <_sk_load_f16_hsw+0x81>
-  .byte  197,250,126,201                     // vmovq         %xmm1,%xmm1
-  .byte  235,30                              // jmp           b31 <_sk_load_f16_hsw+0x9f>
-  .byte  197,241,22,76,248,8                 // vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,18                              // jb            b31 <_sk_load_f16_hsw+0x9f>
-  .byte  197,251,16,84,248,16                // vmovsd        0x10(%rax,%rdi,8),%xmm2
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  117,19                              // jne           b3e <_sk_load_f16_hsw+0xac>
-  .byte  197,250,126,210                     // vmovq         %xmm2,%xmm2
-  .byte  235,46                              // jmp           b5f <_sk_load_f16_hsw+0xcd>
-  .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,117,255,255,255                 // jmpq          ab3 <_sk_load_f16_hsw+0x21>
-  .byte  197,233,22,84,248,24                // vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,21                              // jb            b5f <_sk_load_f16_hsw+0xcd>
-  .byte  197,251,16,92,248,32                // vmovsd        0x20(%rax,%rdi,8),%xmm3
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  117,18                              // jne           b68 <_sk_load_f16_hsw+0xd6>
-  .byte  197,250,126,219                     // vmovq         %xmm3,%xmm3
-  .byte  233,84,255,255,255                  // jmpq          ab3 <_sk_load_f16_hsw+0x21>
-  .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,75,255,255,255                  // jmpq          ab3 <_sk_load_f16_hsw+0x21>
-  .byte  197,225,22,92,248,40                // vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
-  .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,59,255,255,255               // jb            ab3 <_sk_load_f16_hsw+0x21>
-  .byte  197,123,16,68,248,48                // vmovsd        0x30(%rax,%rdi,8),%xmm8
-  .byte  233,48,255,255,255                  // jmpq          ab3 <_sk_load_f16_hsw+0x21>
-
-.globl _sk_store_f16_hsw
-_sk_store_f16_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  196,195,125,29,192,4                // vcvtps2ph     $0x4,%ymm0,%xmm8
-  .byte  196,195,125,29,201,4                // vcvtps2ph     $0x4,%ymm1,%xmm9
-  .byte  196,195,125,29,210,4                // vcvtps2ph     $0x4,%ymm2,%xmm10
-  .byte  196,195,125,29,219,4                // vcvtps2ph     $0x4,%ymm3,%xmm11
-  .byte  196,65,57,97,225                    // vpunpcklwd    %xmm9,%xmm8,%xmm12
-  .byte  196,65,57,105,193                   // vpunpckhwd    %xmm9,%xmm8,%xmm8
-  .byte  196,65,41,97,203                    // vpunpcklwd    %xmm11,%xmm10,%xmm9
-  .byte  196,65,41,105,235                   // vpunpckhwd    %xmm11,%xmm10,%xmm13
-  .byte  196,65,25,98,217                    // vpunpckldq    %xmm9,%xmm12,%xmm11
-  .byte  196,65,25,106,209                   // vpunpckhdq    %xmm9,%xmm12,%xmm10
-  .byte  196,65,57,98,205                    // vpunpckldq    %xmm13,%xmm8,%xmm9
-  .byte  196,65,57,106,197                   // vpunpckhdq    %xmm13,%xmm8,%xmm8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,27                              // jne           be8 <_sk_store_f16_hsw+0x65>
-  .byte  197,120,17,28,248                   // vmovups       %xmm11,(%rax,%rdi,8)
-  .byte  197,120,17,84,248,16                // vmovups       %xmm10,0x10(%rax,%rdi,8)
-  .byte  197,120,17,76,248,32                // vmovups       %xmm9,0x20(%rax,%rdi,8)
-  .byte  197,122,127,68,248,48               // vmovdqu       %xmm8,0x30(%rax,%rdi,8)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,121,214,28,248                  // vmovq         %xmm11,(%rax,%rdi,8)
-  .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,241                             // je            be4 <_sk_store_f16_hsw+0x61>
-  .byte  197,121,23,92,248,8                 // vmovhpd       %xmm11,0x8(%rax,%rdi,8)
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,229                             // jb            be4 <_sk_store_f16_hsw+0x61>
-  .byte  197,121,214,84,248,16               // vmovq         %xmm10,0x10(%rax,%rdi,8)
-  .byte  116,221                             // je            be4 <_sk_store_f16_hsw+0x61>
-  .byte  197,121,23,84,248,24                // vmovhpd       %xmm10,0x18(%rax,%rdi,8)
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,209                             // jb            be4 <_sk_store_f16_hsw+0x61>
-  .byte  197,121,214,76,248,32               // vmovq         %xmm9,0x20(%rax,%rdi,8)
-  .byte  116,201                             // je            be4 <_sk_store_f16_hsw+0x61>
-  .byte  197,121,23,76,248,40                // vmovhpd       %xmm9,0x28(%rax,%rdi,8)
-  .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,189                             // jb            be4 <_sk_store_f16_hsw+0x61>
-  .byte  197,121,214,68,248,48               // vmovq         %xmm8,0x30(%rax,%rdi,8)
-  .byte  235,181                             // jmp           be4 <_sk_store_f16_hsw+0x61>
-
-.globl _sk_store_f32_hsw
-_sk_store_f32_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,0                            // mov           (%rax),%r8
-  .byte  72,141,4,189,0,0,0,0                // lea           0x0(,%rdi,4),%rax
-  .byte  197,124,20,193                      // vunpcklps     %ymm1,%ymm0,%ymm8
-  .byte  197,124,21,217                      // vunpckhps     %ymm1,%ymm0,%ymm11
-  .byte  197,108,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm9
-  .byte  197,108,21,227                      // vunpckhps     %ymm3,%ymm2,%ymm12
-  .byte  196,65,61,20,209                    // vunpcklpd     %ymm9,%ymm8,%ymm10
-  .byte  196,65,61,21,201                    // vunpckhpd     %ymm9,%ymm8,%ymm9
-  .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
-  .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           c9c <_sk_store_f32_hsw+0x6d>
-  .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
-  .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
-  .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
-  .byte  196,67,61,6,195,49                  // vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
-  .byte  196,65,125,17,36,128                // vmovupd       %ymm12,(%r8,%rax,4)
-  .byte  196,65,125,17,108,128,32            // vmovupd       %ymm13,0x20(%r8,%rax,4)
-  .byte  196,65,125,17,76,128,64             // vmovupd       %ymm9,0x40(%r8,%rax,4)
-  .byte  196,65,125,17,68,128,96             // vmovupd       %ymm8,0x60(%r8,%rax,4)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
-  .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            c98 <_sk_store_f32_hsw+0x69>
-  .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            c98 <_sk_store_f32_hsw+0x69>
-  .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            c98 <_sk_store_f32_hsw+0x69>
-  .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            c98 <_sk_store_f32_hsw+0x69>
-  .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            c98 <_sk_store_f32_hsw+0x69>
-  .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
-  .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            c98 <_sk_store_f32_hsw+0x69>
-  .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           c98 <_sk_store_f32_hsw+0x69>
-
-.globl _sk_clamp_x_hsw
-_sk_clamp_x_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,188,95,192                      // vmaxps        %ymm0,%ymm8,%ymm0
-  .byte  196,98,125,88,0                     // vpbroadcastd  (%rax),%ymm8
-  .byte  196,65,53,118,201                   // vpcmpeqd      %ymm9,%ymm9,%ymm9
-  .byte  196,65,61,254,193                   // vpaddd        %ymm9,%ymm8,%ymm8
-  .byte  196,193,124,93,192                  // vminps        %ymm8,%ymm0,%ymm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_y_hsw
-_sk_clamp_y_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,188,95,201                      // vmaxps        %ymm1,%ymm8,%ymm1
-  .byte  196,98,125,88,0                     // vpbroadcastd  (%rax),%ymm8
-  .byte  196,65,53,118,201                   // vpcmpeqd      %ymm9,%ymm9,%ymm9
-  .byte  196,65,61,254,193                   // vpaddd        %ymm9,%ymm8,%ymm8
-  .byte  196,193,116,93,200                  // vminps        %ymm8,%ymm1,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_x_hsw
-_sk_repeat_x_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,65,124,94,200                   // vdivps        %ymm8,%ymm0,%ymm9
-  .byte  196,67,125,8,201,1                  // vroundps      $0x1,%ymm9,%ymm9
-  .byte  196,98,61,172,200                   // vfnmadd213ps  %ymm0,%ymm8,%ymm9
-  .byte  197,253,118,192                     // vpcmpeqd      %ymm0,%ymm0,%ymm0
-  .byte  197,189,254,192                     // vpaddd        %ymm0,%ymm8,%ymm0
-  .byte  197,180,93,192                      // vminps        %ymm0,%ymm9,%ymm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_y_hsw
-_sk_repeat_y_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,65,116,94,200                   // vdivps        %ymm8,%ymm1,%ymm9
-  .byte  196,67,125,8,201,1                  // vroundps      $0x1,%ymm9,%ymm9
-  .byte  196,98,61,172,201                   // vfnmadd213ps  %ymm1,%ymm8,%ymm9
-  .byte  197,245,118,201                     // vpcmpeqd      %ymm1,%ymm1,%ymm1
-  .byte  197,189,254,201                     // vpaddd        %ymm1,%ymm8,%ymm1
-  .byte  197,180,93,201                      // vminps        %ymm1,%ymm9,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_x_hsw
-_sk_mirror_x_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,122,16,0                        // vmovss        (%rax),%xmm8
-  .byte  196,66,125,24,200                   // vbroadcastss  %xmm8,%ymm9
-  .byte  196,65,124,92,209                   // vsubps        %ymm9,%ymm0,%ymm10
-  .byte  196,193,58,88,192                   // vaddss        %xmm8,%xmm8,%xmm0
-  .byte  196,226,125,24,192                  // vbroadcastss  %xmm0,%ymm0
-  .byte  197,44,94,192                       // vdivps        %ymm0,%ymm10,%ymm8
-  .byte  196,67,125,8,192,1                  // vroundps      $0x1,%ymm8,%ymm8
-  .byte  196,66,125,172,194                  // vfnmadd213ps  %ymm10,%ymm0,%ymm8
-  .byte  196,193,60,92,193                   // vsubps        %ymm9,%ymm8,%ymm0
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,60,92,192                       // vsubps        %ymm0,%ymm8,%ymm8
-  .byte  197,188,84,192                      // vandps        %ymm0,%ymm8,%ymm0
-  .byte  196,65,61,118,192                   // vpcmpeqd      %ymm8,%ymm8,%ymm8
-  .byte  196,65,53,254,192                   // vpaddd        %ymm8,%ymm9,%ymm8
-  .byte  196,193,124,93,192                  // vminps        %ymm8,%ymm0,%ymm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_y_hsw
-_sk_mirror_y_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,122,16,0                        // vmovss        (%rax),%xmm8
-  .byte  196,66,125,24,200                   // vbroadcastss  %xmm8,%ymm9
-  .byte  196,65,116,92,209                   // vsubps        %ymm9,%ymm1,%ymm10
-  .byte  196,193,58,88,200                   // vaddss        %xmm8,%xmm8,%xmm1
-  .byte  196,226,125,24,201                  // vbroadcastss  %xmm1,%ymm1
-  .byte  197,44,94,193                       // vdivps        %ymm1,%ymm10,%ymm8
-  .byte  196,67,125,8,192,1                  // vroundps      $0x1,%ymm8,%ymm8
-  .byte  196,66,117,172,194                  // vfnmadd213ps  %ymm10,%ymm1,%ymm8
-  .byte  196,193,60,92,201                   // vsubps        %ymm9,%ymm8,%ymm1
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,60,92,193                       // vsubps        %ymm1,%ymm8,%ymm8
-  .byte  197,188,84,201                      // vandps        %ymm1,%ymm8,%ymm1
-  .byte  196,65,61,118,192                   // vpcmpeqd      %ymm8,%ymm8,%ymm8
-  .byte  196,65,53,254,192                   // vpaddd        %ymm8,%ymm9,%ymm8
-  .byte  196,193,116,93,200                  // vminps        %ymm8,%ymm1,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_2x3_hsw
-_sk_matrix_2x3_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,8                     // vbroadcastss  (%rax),%ymm9
-  .byte  196,98,125,24,80,8                  // vbroadcastss  0x8(%rax),%ymm10
-  .byte  196,98,125,24,64,16                 // vbroadcastss  0x10(%rax),%ymm8
-  .byte  196,66,117,184,194                  // vfmadd231ps   %ymm10,%ymm1,%ymm8
-  .byte  196,66,125,184,193                  // vfmadd231ps   %ymm9,%ymm0,%ymm8
-  .byte  196,98,125,24,80,4                  // vbroadcastss  0x4(%rax),%ymm10
-  .byte  196,98,125,24,88,12                 // vbroadcastss  0xc(%rax),%ymm11
-  .byte  196,98,125,24,72,20                 // vbroadcastss  0x14(%rax),%ymm9
-  .byte  196,66,117,184,203                  // vfmadd231ps   %ymm11,%ymm1,%ymm9
-  .byte  196,66,125,184,202                  // vfmadd231ps   %ymm10,%ymm0,%ymm9
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,124,41,192                      // vmovaps       %ymm8,%ymm0
-  .byte  197,124,41,201                      // vmovaps       %ymm9,%ymm1
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_3x4_hsw
-_sk_matrix_3x4_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,8                     // vbroadcastss  (%rax),%ymm9
-  .byte  196,98,125,24,80,12                 // vbroadcastss  0xc(%rax),%ymm10
-  .byte  196,98,125,24,88,24                 // vbroadcastss  0x18(%rax),%ymm11
-  .byte  196,98,125,24,64,36                 // vbroadcastss  0x24(%rax),%ymm8
-  .byte  196,66,109,184,195                  // vfmadd231ps   %ymm11,%ymm2,%ymm8
-  .byte  196,66,117,184,194                  // vfmadd231ps   %ymm10,%ymm1,%ymm8
-  .byte  196,66,125,184,193                  // vfmadd231ps   %ymm9,%ymm0,%ymm8
-  .byte  196,98,125,24,80,4                  // vbroadcastss  0x4(%rax),%ymm10
-  .byte  196,98,125,24,88,16                 // vbroadcastss  0x10(%rax),%ymm11
-  .byte  196,98,125,24,96,28                 // vbroadcastss  0x1c(%rax),%ymm12
-  .byte  196,98,125,24,72,40                 // vbroadcastss  0x28(%rax),%ymm9
-  .byte  196,66,109,184,204                  // vfmadd231ps   %ymm12,%ymm2,%ymm9
-  .byte  196,66,117,184,203                  // vfmadd231ps   %ymm11,%ymm1,%ymm9
-  .byte  196,66,125,184,202                  // vfmadd231ps   %ymm10,%ymm0,%ymm9
-  .byte  196,98,125,24,88,8                  // vbroadcastss  0x8(%rax),%ymm11
-  .byte  196,98,125,24,96,20                 // vbroadcastss  0x14(%rax),%ymm12
-  .byte  196,98,125,24,104,32                // vbroadcastss  0x20(%rax),%ymm13
-  .byte  196,98,125,24,80,44                 // vbroadcastss  0x2c(%rax),%ymm10
-  .byte  196,66,109,184,213                  // vfmadd231ps   %ymm13,%ymm2,%ymm10
-  .byte  196,66,117,184,212                  // vfmadd231ps   %ymm12,%ymm1,%ymm10
-  .byte  196,66,125,184,211                  // vfmadd231ps   %ymm11,%ymm0,%ymm10
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,124,41,192                      // vmovaps       %ymm8,%ymm0
-  .byte  197,124,41,201                      // vmovaps       %ymm9,%ymm1
-  .byte  197,124,41,210                      // vmovaps       %ymm10,%ymm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_perspective_hsw
-_sk_matrix_perspective_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,98,125,24,72,4                  // vbroadcastss  0x4(%rax),%ymm9
-  .byte  196,98,125,24,80,8                  // vbroadcastss  0x8(%rax),%ymm10
-  .byte  196,66,117,184,209                  // vfmadd231ps   %ymm9,%ymm1,%ymm10
-  .byte  196,66,125,184,208                  // vfmadd231ps   %ymm8,%ymm0,%ymm10
-  .byte  196,98,125,24,64,12                 // vbroadcastss  0xc(%rax),%ymm8
-  .byte  196,98,125,24,72,16                 // vbroadcastss  0x10(%rax),%ymm9
-  .byte  196,98,125,24,88,20                 // vbroadcastss  0x14(%rax),%ymm11
-  .byte  196,66,117,184,217                  // vfmadd231ps   %ymm9,%ymm1,%ymm11
-  .byte  196,66,125,184,216                  // vfmadd231ps   %ymm8,%ymm0,%ymm11
-  .byte  196,98,125,24,64,24                 // vbroadcastss  0x18(%rax),%ymm8
-  .byte  196,98,125,24,72,28                 // vbroadcastss  0x1c(%rax),%ymm9
-  .byte  196,98,125,24,96,32                 // vbroadcastss  0x20(%rax),%ymm12
-  .byte  196,66,117,184,225                  // vfmadd231ps   %ymm9,%ymm1,%ymm12
-  .byte  196,66,125,184,224                  // vfmadd231ps   %ymm8,%ymm0,%ymm12
-  .byte  196,193,124,83,204                  // vrcpps        %ymm12,%ymm1
-  .byte  197,172,89,193                      // vmulps        %ymm1,%ymm10,%ymm0
-  .byte  197,164,89,201                      // vmulps        %ymm1,%ymm11,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_linear_gradient_2stops_hsw
-_sk_linear_gradient_2stops_hsw:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,226,125,24,72,16                // vbroadcastss  0x10(%rax),%ymm1
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,98,125,184,193                  // vfmadd231ps   %ymm1,%ymm0,%ymm8
-  .byte  196,226,125,24,80,20                // vbroadcastss  0x14(%rax),%ymm2
-  .byte  196,226,125,24,72,4                 // vbroadcastss  0x4(%rax),%ymm1
-  .byte  196,226,125,184,202                 // vfmadd231ps   %ymm2,%ymm0,%ymm1
-  .byte  196,226,125,24,88,24                // vbroadcastss  0x18(%rax),%ymm3
-  .byte  196,226,125,24,80,8                 // vbroadcastss  0x8(%rax),%ymm2
-  .byte  196,226,125,184,211                 // vfmadd231ps   %ymm3,%ymm0,%ymm2
-  .byte  196,98,125,24,72,28                 // vbroadcastss  0x1c(%rax),%ymm9
-  .byte  196,226,125,24,88,12                // vbroadcastss  0xc(%rax),%ymm3
-  .byte  196,194,125,184,217                 // vfmadd231ps   %ymm9,%ymm0,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,124,41,192                      // vmovaps       %ymm8,%ymm0
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_start_pipeline_avx
-_sk_start_pipeline_avx:
-  .byte  65,87                               // push          %r15
-  .byte  65,86                               // push          %r14
-  .byte  65,85                               // push          %r13
-  .byte  65,84                               // push          %r12
-  .byte  83                                  // push          %rbx
-  .byte  73,137,205                          // mov           %rcx,%r13
-  .byte  73,137,214                          // mov           %rdx,%r14
-  .byte  72,137,251                          // mov           %rdi,%rbx
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  73,137,199                          // mov           %rax,%r15
-  .byte  73,137,244                          // mov           %rsi,%r12
-  .byte  72,141,67,8                         // lea           0x8(%rbx),%rax
-  .byte  76,57,232                           // cmp           %r13,%rax
-  .byte  118,5                               // jbe           28 <_sk_start_pipeline_avx+0x28>
-  .byte  72,137,223                          // mov           %rbx,%rdi
-  .byte  235,65                              // jmp           69 <_sk_start_pipeline_avx+0x69>
-  .byte  185,0,0,0,0                         // mov           $0x0,%ecx
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  197,220,87,228                      // vxorps        %ymm4,%ymm4,%ymm4
-  .byte  197,212,87,237                      // vxorps        %ymm5,%ymm5,%ymm5
-  .byte  197,204,87,246                      // vxorps        %ymm6,%ymm6,%ymm6
-  .byte  197,196,87,255                      // vxorps        %ymm7,%ymm7,%ymm7
-  .byte  72,137,223                          // mov           %rbx,%rdi
-  .byte  76,137,230                          // mov           %r12,%rsi
-  .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  65,255,215                          // callq         *%r15
-  .byte  72,141,123,8                        // lea           0x8(%rbx),%rdi
-  .byte  72,131,195,16                       // add           $0x10,%rbx
-  .byte  76,57,235                           // cmp           %r13,%rbx
-  .byte  72,137,251                          // mov           %rdi,%rbx
-  .byte  118,191                             // jbe           28 <_sk_start_pipeline_avx+0x28>
-  .byte  76,137,233                          // mov           %r13,%rcx
-  .byte  72,41,249                           // sub           %rdi,%rcx
-  .byte  116,41                              // je            9a <_sk_start_pipeline_avx+0x9a>
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  197,220,87,228                      // vxorps        %ymm4,%ymm4,%ymm4
-  .byte  197,212,87,237                      // vxorps        %ymm5,%ymm5,%ymm5
-  .byte  197,204,87,246                      // vxorps        %ymm6,%ymm6,%ymm6
-  .byte  197,196,87,255                      // vxorps        %ymm7,%ymm7,%ymm7
-  .byte  76,137,230                          // mov           %r12,%rsi
-  .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  65,255,215                          // callq         *%r15
-  .byte  76,137,232                          // mov           %r13,%rax
-  .byte  91                                  // pop           %rbx
-  .byte  65,92                               // pop           %r12
-  .byte  65,93                               // pop           %r13
-  .byte  65,94                               // pop           %r14
-  .byte  65,95                               // pop           %r15
-  .byte  197,248,119                         // vzeroupper
-  .byte  195                                 // retq
-
-.globl _sk_just_return_avx
-_sk_just_return_avx:
-  .byte  195                                 // retq
-
-.globl _sk_seed_shader_avx
-_sk_seed_shader_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,249,110,199                     // vmovd         %edi,%xmm0
-  .byte  197,249,112,192,0                   // vpshufd       $0x0,%xmm0,%xmm0
-  .byte  196,227,125,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,226,125,24,74,4                 // vbroadcastss  0x4(%rdx),%ymm1
-  .byte  197,252,88,193                      // vaddps        %ymm1,%ymm0,%ymm0
-  .byte  197,252,88,66,20                    // vaddps        0x14(%rdx),%ymm0,%ymm0
-  .byte  196,226,125,24,16                   // vbroadcastss  (%rax),%ymm2
-  .byte  197,252,91,210                      // vcvtdq2ps     %ymm2,%ymm2
-  .byte  197,236,88,201                      // vaddps        %ymm1,%ymm2,%ymm1
-  .byte  196,226,125,24,18                   // vbroadcastss  (%rdx),%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  197,220,87,228                      // vxorps        %ymm4,%ymm4,%ymm4
-  .byte  197,212,87,237                      // vxorps        %ymm5,%ymm5,%ymm5
-  .byte  197,204,87,246                      // vxorps        %ymm6,%ymm6,%ymm6
-  .byte  197,196,87,255                      // vxorps        %ymm7,%ymm7,%ymm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_constant_color_avx
-_sk_constant_color_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,226,125,24,0                    // vbroadcastss  (%rax),%ymm0
-  .byte  196,226,125,24,72,4                 // vbroadcastss  0x4(%rax),%ymm1
-  .byte  196,226,125,24,80,8                 // vbroadcastss  0x8(%rax),%ymm2
-  .byte  196,226,125,24,88,12                // vbroadcastss  0xc(%rax),%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clear_avx
-_sk_clear_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  197,228,87,219                      // vxorps        %ymm3,%ymm3,%ymm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_plus__avx
-_sk_plus__avx:
-  .byte  197,252,88,196                      // vaddps        %ymm4,%ymm0,%ymm0
-  .byte  197,244,88,205                      // vaddps        %ymm5,%ymm1,%ymm1
-  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
-  .byte  197,228,88,223                      // vaddps        %ymm7,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_srcover_avx
-_sk_srcover_avx:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  197,60,92,195                       // vsubps        %ymm3,%ymm8,%ymm8
-  .byte  197,60,89,204                       // vmulps        %ymm4,%ymm8,%ymm9
-  .byte  197,180,88,192                      // vaddps        %ymm0,%ymm9,%ymm0
-  .byte  197,60,89,205                       // vmulps        %ymm5,%ymm8,%ymm9
-  .byte  197,180,88,201                      // vaddps        %ymm1,%ymm9,%ymm1
-  .byte  197,60,89,206                       // vmulps        %ymm6,%ymm8,%ymm9
-  .byte  197,180,88,210                      // vaddps        %ymm2,%ymm9,%ymm2
-  .byte  197,60,89,199                       // vmulps        %ymm7,%ymm8,%ymm8
-  .byte  197,188,88,219                      // vaddps        %ymm3,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_dstover_avx
-_sk_dstover_avx:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  197,60,92,199                       // vsubps        %ymm7,%ymm8,%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,252,88,196                      // vaddps        %ymm4,%ymm0,%ymm0
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,244,88,205                      // vaddps        %ymm5,%ymm1,%ymm1
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
-  .byte  197,188,89,219                      // vmulps        %ymm3,%ymm8,%ymm3
-  .byte  197,228,88,223                      // vaddps        %ymm7,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_0_avx
-_sk_clamp_0_avx:
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,193,124,95,192                  // vmaxps        %ymm8,%ymm0,%ymm0
-  .byte  196,193,116,95,200                  // vmaxps        %ymm8,%ymm1,%ymm1
-  .byte  196,193,108,95,208                  // vmaxps        %ymm8,%ymm2,%ymm2
-  .byte  196,193,100,95,216                  // vmaxps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_1_avx
-_sk_clamp_1_avx:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  196,193,124,93,192                  // vminps        %ymm8,%ymm0,%ymm0
-  .byte  196,193,116,93,200                  // vminps        %ymm8,%ymm1,%ymm1
-  .byte  196,193,108,93,208                  // vminps        %ymm8,%ymm2,%ymm2
-  .byte  196,193,100,93,216                  // vminps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_a_avx
-_sk_clamp_a_avx:
-  .byte  196,98,125,24,2                     // vbroadcastss  (%rdx),%ymm8
-  .byte  196,193,100,93,216                  // vminps        %ymm8,%ymm3,%ymm3
-  .byte  197,252,93,195                      // vminps        %ymm3,%ymm0,%ymm0
-  .byte  197,244,93,203                      // vminps        %ymm3,%ymm1,%ymm1
-  .byte  197,236,93,211                      // vminps        %ymm3,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_set_rgb_avx
-_sk_set_rgb_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,226,125,24,0                    // vbroadcastss  (%rax),%ymm0
-  .byte  196,226,125,24,72,4                 // vbroadcastss  0x4(%rax),%ymm1
-  .byte  196,226,125,24,80,8                 // vbroadcastss  0x8(%rax),%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_rb_avx
-_sk_swap_rb_avx:
-  .byte  197,124,40,192                      // vmovaps       %ymm0,%ymm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,194                      // vmovaps       %ymm2,%ymm0
-  .byte  197,124,41,194                      // vmovaps       %ymm8,%ymm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_avx
-_sk_swap_avx:
-  .byte  197,124,40,195                      // vmovaps       %ymm3,%ymm8
-  .byte  197,124,40,202                      // vmovaps       %ymm2,%ymm9
-  .byte  197,124,40,209                      // vmovaps       %ymm1,%ymm10
-  .byte  197,124,40,216                      // vmovaps       %ymm0,%ymm11
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,196                      // vmovaps       %ymm4,%ymm0
-  .byte  197,252,40,205                      // vmovaps       %ymm5,%ymm1
-  .byte  197,252,40,214                      // vmovaps       %ymm6,%ymm2
-  .byte  197,252,40,223                      // vmovaps       %ymm7,%ymm3
-  .byte  197,124,41,220                      // vmovaps       %ymm11,%ymm4
-  .byte  197,124,41,213                      // vmovaps       %ymm10,%ymm5
-  .byte  197,124,41,206                      // vmovaps       %ymm9,%ymm6
-  .byte  197,124,41,199                      // vmovaps       %ymm8,%ymm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_src_dst_avx
-_sk_move_src_dst_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,224                      // vmovaps       %ymm0,%ymm4
-  .byte  197,252,40,233                      // vmovaps       %ymm1,%ymm5
-  .byte  197,252,40,242                      // vmovaps       %ymm2,%ymm6
-  .byte  197,252,40,251                      // vmovaps       %ymm3,%ymm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_dst_src_avx
-_sk_move_dst_src_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,40,196                      // vmovaps       %ymm4,%ymm0
-  .byte  197,252,40,205                      // vmovaps       %ymm5,%ymm1
-  .byte  197,252,40,214                      // vmovaps       %ymm6,%ymm2
-  .byte  197,252,40,223                      // vmovaps       %ymm7,%ymm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_premul_avx
-_sk_premul_avx:
-  .byte  197,252,89,195                      // vmulps        %ymm3,%ymm0,%ymm0
-  .byte  197,244,89,203                      // vmulps        %ymm3,%ymm1,%ymm1
-  .byte  197,236,89,211                      // vmulps        %ymm3,%ymm2,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_unpremul_avx
-_sk_unpremul_avx:
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  196,65,100,194,200,0                // vcmpeqps      %ymm8,%ymm3,%ymm9
-  .byte  196,98,125,24,18                    // vbroadcastss  (%rdx),%ymm10
-  .byte  197,44,94,211                       // vdivps        %ymm3,%ymm10,%ymm10
-  .byte  196,67,45,74,192,144                // vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_from_srgb_avx
-_sk_from_srgb_avx:
-  .byte  196,98,125,24,66,64                 // vbroadcastss  0x40(%rdx),%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  197,124,89,208                      // vmulps        %ymm0,%ymm0,%ymm10
-  .byte  196,98,125,24,90,60                 // vbroadcastss  0x3c(%rdx),%ymm11
-  .byte  196,98,125,24,98,56                 // vbroadcastss  0x38(%rdx),%ymm12
-  .byte  197,36,89,232                       // vmulps        %ymm0,%ymm11,%ymm13
-  .byte  196,65,20,88,236                    // vaddps        %ymm12,%ymm13,%ymm13
-  .byte  196,98,125,24,114,52                // vbroadcastss  0x34(%rdx),%ymm14
-  .byte  196,65,44,89,213                    // vmulps        %ymm13,%ymm10,%ymm10
-  .byte  196,65,12,88,210                    // vaddps        %ymm10,%ymm14,%ymm10
-  .byte  196,98,125,24,106,68                // vbroadcastss  0x44(%rdx),%ymm13
-  .byte  196,193,124,194,197,1               // vcmpltps      %ymm13,%ymm0,%ymm0
-  .byte  196,195,45,74,193,0                 // vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
-  .byte  197,60,89,201                       // vmulps        %ymm1,%ymm8,%ymm9
-  .byte  197,116,89,209                      // vmulps        %ymm1,%ymm1,%ymm10
-  .byte  197,36,89,249                       // vmulps        %ymm1,%ymm11,%ymm15
-  .byte  196,65,4,88,252                     // vaddps        %ymm12,%ymm15,%ymm15
-  .byte  196,65,44,89,215                    // vmulps        %ymm15,%ymm10,%ymm10
-  .byte  196,65,12,88,210                    // vaddps        %ymm10,%ymm14,%ymm10
-  .byte  196,193,116,194,205,1               // vcmpltps      %ymm13,%ymm1,%ymm1
-  .byte  196,195,45,74,201,16                // vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
-  .byte  197,60,89,194                       // vmulps        %ymm2,%ymm8,%ymm8
-  .byte  197,108,89,202                      // vmulps        %ymm2,%ymm2,%ymm9
-  .byte  197,36,89,210                       // vmulps        %ymm2,%ymm11,%ymm10
-  .byte  196,65,44,88,212                    // vaddps        %ymm12,%ymm10,%ymm10
-  .byte  196,65,52,89,202                    // vmulps        %ymm10,%ymm9,%ymm9
-  .byte  196,65,12,88,201                    // vaddps        %ymm9,%ymm14,%ymm9
-  .byte  196,193,108,194,213,1               // vcmpltps      %ymm13,%ymm2,%ymm2
-  .byte  196,195,53,74,208,32                // vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_to_srgb_avx
-_sk_to_srgb_avx:
-  .byte  197,124,82,192                      // vrsqrtps      %ymm0,%ymm8
-  .byte  196,65,124,83,200                   // vrcpps        %ymm8,%ymm9
-  .byte  196,65,124,82,208                   // vrsqrtps      %ymm8,%ymm10
-  .byte  196,98,125,24,66,72                 // vbroadcastss  0x48(%rdx),%ymm8
-  .byte  197,60,89,216                       // vmulps        %ymm0,%ymm8,%ymm11
-  .byte  196,98,125,24,34                    // vbroadcastss  (%rdx),%ymm12
-  .byte  196,98,125,24,106,76                // vbroadcastss  0x4c(%rdx),%ymm13
-  .byte  196,98,125,24,114,80                // vbroadcastss  0x50(%rdx),%ymm14
-  .byte  196,98,125,24,122,84                // vbroadcastss  0x54(%rdx),%ymm15
-  .byte  196,65,52,89,206                    // vmulps        %ymm14,%ymm9,%ymm9
-  .byte  196,65,52,88,207                    // vaddps        %ymm15,%ymm9,%ymm9
-  .byte  196,65,44,89,213                    // vmulps        %ymm13,%ymm10,%ymm10
-  .byte  196,65,44,88,201                    // vaddps        %ymm9,%ymm10,%ymm9
-  .byte  196,65,28,93,201                    // vminps        %ymm9,%ymm12,%ymm9
-  .byte  196,98,125,24,82,88                 // vbroadcastss  0x58(%rdx),%ymm10
-  .byte  196,193,124,194,194,1               // vcmpltps      %ymm10,%ymm0,%ymm0
-  .byte  196,195,53,74,195,0                 // vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
-  .byte  197,124,82,201                      // vrsqrtps      %ymm1,%ymm9
-  .byte  196,65,124,83,217                   // vrcpps        %ymm9,%ymm11
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,12,89,219                    // vmulps        %ymm11,%ymm14,%ymm11
-  .byte  196,65,4,88,219                     // vaddps        %ymm11,%ymm15,%ymm11
-  .byte  196,65,20,89,201                    // vmulps        %ymm9,%ymm13,%ymm9
-  .byte  196,65,52,88,203                    // vaddps        %ymm11,%ymm9,%ymm9
-  .byte  197,60,89,217                       // vmulps        %ymm1,%ymm8,%ymm11
-  .byte  196,65,28,93,201                    // vminps        %ymm9,%ymm12,%ymm9
-  .byte  196,193,116,194,202,1               // vcmpltps      %ymm10,%ymm1,%ymm1
-  .byte  196,195,53,74,203,16                // vblendvps     %ymm1,%ymm11,%ymm9,%ymm1
-  .byte  197,124,82,202                      // vrsqrtps      %ymm2,%ymm9
-  .byte  196,65,124,83,217                   // vrcpps        %ymm9,%ymm11
-  .byte  196,65,12,89,219                    // vmulps        %ymm11,%ymm14,%ymm11
-  .byte  196,65,4,88,219                     // vaddps        %ymm11,%ymm15,%ymm11
-  .byte  196,65,124,82,201                   // vrsqrtps      %ymm9,%ymm9
-  .byte  196,65,20,89,201                    // vmulps        %ymm9,%ymm13,%ymm9
-  .byte  196,65,52,88,203                    // vaddps        %ymm11,%ymm9,%ymm9
-  .byte  196,65,28,93,201                    // vminps        %ymm9,%ymm12,%ymm9
-  .byte  197,60,89,194                       // vmulps        %ymm2,%ymm8,%ymm8
-  .byte  196,193,108,194,210,1               // vcmpltps      %ymm10,%ymm2,%ymm2
-  .byte  196,195,53,74,208,32                // vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_1_float_avx
-_sk_scale_1_float_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  197,188,89,219                      // vmulps        %ymm3,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_u8_avx
-_sk_scale_u8_avx:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,1,248                            // add           %rdi,%rax
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,65                              // jne           478 <_sk_scale_u8_avx+0x51>
-  .byte  197,123,16,0                        // vmovsd        (%rax),%xmm8
-  .byte  196,66,121,49,200                   // vpmovzxbd     %xmm8,%xmm9
-  .byte  196,67,121,4,192,229                // vpermilps     $0xe5,%xmm8,%xmm8
-  .byte  196,66,121,49,192                   // vpmovzxbd     %xmm8,%xmm8
-  .byte  196,67,53,24,192,1                  // vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
-  .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
-  .byte  196,98,125,24,74,12                 // vbroadcastss  0xc(%rdx),%ymm9
-  .byte  196,65,60,89,193                    // vmulps        %ymm9,%ymm8,%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  197,188,89,219                      // vmulps        %ymm3,%ymm8,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  49,201                              // xor           %ecx,%ecx
-  .byte  77,137,194                          // mov           %r8,%r10
-  .byte  69,49,201                           // xor           %r9d,%r9d
-  .byte  68,15,182,24                        // movzbl        (%rax),%r11d
-  .byte  72,255,192                          // inc           %rax
-  .byte  73,211,227                          // shl           %cl,%r11
-  .byte  77,9,217                            // or            %r11,%r9
-  .byte  72,131,193,8                        // add           $0x8,%rcx
-  .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           480 <_sk_scale_u8_avx+0x59>
-  .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  235,158                             // jmp           43b <_sk_scale_u8_avx+0x14>
-
-.globl _sk_lerp_1_float_avx
-_sk_lerp_1_float_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  197,252,92,196                      // vsubps        %ymm4,%ymm0,%ymm0
-  .byte  196,193,124,89,192                  // vmulps        %ymm8,%ymm0,%ymm0
-  .byte  197,252,88,196                      // vaddps        %ymm4,%ymm0,%ymm0
-  .byte  197,244,92,205                      // vsubps        %ymm5,%ymm1,%ymm1
-  .byte  196,193,116,89,200                  // vmulps        %ymm8,%ymm1,%ymm1
-  .byte  197,244,88,205                      // vaddps        %ymm5,%ymm1,%ymm1
-  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
-  .byte  196,193,108,89,208                  // vmulps        %ymm8,%ymm2,%ymm2
-  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
-  .byte  197,228,92,223                      // vsubps        %ymm7,%ymm3,%ymm3
-  .byte  196,193,100,89,216                  // vmulps        %ymm8,%ymm3,%ymm3
-  .byte  197,228,88,223                      // vaddps        %ymm7,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_u8_avx
-_sk_lerp_u8_avx:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,1,248                            // add           %rdi,%rax
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,101                             // jne           551 <_sk_lerp_u8_avx+0x75>
-  .byte  197,123,16,0                        // vmovsd        (%rax),%xmm8
-  .byte  196,66,121,49,200                   // vpmovzxbd     %xmm8,%xmm9
-  .byte  196,67,121,4,192,229                // vpermilps     $0xe5,%xmm8,%xmm8
-  .byte  196,66,121,49,192                   // vpmovzxbd     %xmm8,%xmm8
-  .byte  196,67,53,24,192,1                  // vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
-  .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
-  .byte  196,98,125,24,74,12                 // vbroadcastss  0xc(%rdx),%ymm9
-  .byte  196,65,60,89,193                    // vmulps        %ymm9,%ymm8,%ymm8
-  .byte  197,252,92,196                      // vsubps        %ymm4,%ymm0,%ymm0
-  .byte  196,193,124,89,192                  // vmulps        %ymm8,%ymm0,%ymm0
-  .byte  197,252,88,196                      // vaddps        %ymm4,%ymm0,%ymm0
-  .byte  197,244,92,205                      // vsubps        %ymm5,%ymm1,%ymm1
-  .byte  196,193,116,89,200                  // vmulps        %ymm8,%ymm1,%ymm1
-  .byte  197,244,88,205                      // vaddps        %ymm5,%ymm1,%ymm1
-  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
-  .byte  196,193,108,89,208                  // vmulps        %ymm8,%ymm2,%ymm2
-  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
-  .byte  197,228,92,223                      // vsubps        %ymm7,%ymm3,%ymm3
-  .byte  196,193,100,89,216                  // vmulps        %ymm8,%ymm3,%ymm3
-  .byte  197,228,88,223                      // vaddps        %ymm7,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  49,201                              // xor           %ecx,%ecx
-  .byte  77,137,194                          // mov           %r8,%r10
-  .byte  69,49,201                           // xor           %r9d,%r9d
-  .byte  68,15,182,24                        // movzbl        (%rax),%r11d
-  .byte  72,255,192                          // inc           %rax
-  .byte  73,211,227                          // shl           %cl,%r11
-  .byte  77,9,217                            // or            %r11,%r9
-  .byte  72,131,193,8                        // add           $0x8,%rcx
-  .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           559 <_sk_lerp_u8_avx+0x7d>
-  .byte  196,65,249,110,193                  // vmovq         %r9,%xmm8
-  .byte  233,119,255,255,255                 // jmpq          4f0 <_sk_lerp_u8_avx+0x14>
-
-.globl _sk_lerp_565_avx
-_sk_lerp_565_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,16                           // mov           (%rax),%r10
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,148,0,0,0                    // jne           61b <_sk_lerp_565_avx+0xa2>
-  .byte  196,65,122,111,4,122                // vmovdqu       (%r10,%rdi,2),%xmm8
-  .byte  197,225,239,219                     // vpxor         %xmm3,%xmm3,%xmm3
-  .byte  197,185,105,219                     // vpunpckhwd    %xmm3,%xmm8,%xmm3
-  .byte  196,66,121,51,192                   // vpmovzxwd     %xmm8,%xmm8
-  .byte  196,227,61,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  .byte  196,98,125,24,66,104                // vbroadcastss  0x68(%rdx),%ymm8
-  .byte  197,60,84,195                       // vandps        %ymm3,%ymm8,%ymm8
-  .byte  196,65,124,91,192                   // vcvtdq2ps     %ymm8,%ymm8
-  .byte  196,98,125,24,74,116                // vbroadcastss  0x74(%rdx),%ymm9
-  .byte  196,65,52,89,192                    // vmulps        %ymm8,%ymm9,%ymm8
-  .byte  196,98,125,24,74,108                // vbroadcastss  0x6c(%rdx),%ymm9
-  .byte  197,52,84,203                       // vandps        %ymm3,%ymm9,%ymm9
-  .byte  196,65,124,91,201                   // vcvtdq2ps     %ymm9,%ymm9
-  .byte  196,98,125,24,82,120                // vbroadcastss  0x78(%rdx),%ymm10
-  .byte  196,65,44,89,201                    // vmulps        %ymm9,%ymm10,%ymm9
-  .byte  196,98,125,24,82,112                // vbroadcastss  0x70(%rdx),%ymm10
-  .byte  197,172,84,219                      // vandps        %ymm3,%ymm10,%ymm3
-  .byte  197,252,91,219                      // vcvtdq2ps     %ymm3,%ymm3
-  .byte  196,98,125,24,82,124                // vbroadcastss  0x7c(%rdx),%ymm10
-  .byte  197,172,89,219                      // vmulps        %ymm3,%ymm10,%ymm3
-  .byte  197,252,92,196                      // vsubps        %ymm4,%ymm0,%ymm0
-  .byte  196,193,124,89,192                  // vmulps        %ymm8,%ymm0,%ymm0
-  .byte  197,252,88,196                      // vaddps        %ymm4,%ymm0,%ymm0
-  .byte  197,244,92,205                      // vsubps        %ymm5,%ymm1,%ymm1
-  .byte  196,193,116,89,201                  // vmulps        %ymm9,%ymm1,%ymm1
-  .byte  197,244,88,205                      // vaddps        %ymm5,%ymm1,%ymm1
-  .byte  197,236,92,214                      // vsubps        %ymm6,%ymm2,%ymm2
-  .byte  197,236,89,211                      // vmulps        %ymm3,%ymm2,%ymm2
-  .byte  197,236,88,214                      // vaddps        %ymm6,%ymm2,%ymm2
-  .byte  196,226,125,24,26                   // vbroadcastss  (%rdx),%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  65,137,200                          // mov           %ecx,%r8d
-  .byte  65,128,224,7                        // and           $0x7,%r8b
-  .byte  196,65,57,239,192                   // vpxor         %xmm8,%xmm8,%xmm8
-  .byte  65,254,200                          // dec           %r8b
-  .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,85,255,255,255               // ja            58d <_sk_lerp_565_avx+0x14>
-  .byte  76,141,13,73,0,0,0                  // lea           0x49(%rip),%r9        # 688 <_sk_lerp_565_avx+0x10f>
-  .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
-  .byte  76,1,200                            // add           %r9,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,225,239,219                     // vpxor         %xmm3,%xmm3,%xmm3
-  .byte  196,65,97,196,68,122,12,6           // vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
-  .byte  196,65,57,196,68,122,10,5           // vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
-  .byte  196,65,57,196,68,122,8,4            // vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
-  .byte  196,65,57,196,68,122,6,3            // vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
-  .byte  196,65,57,196,68,122,4,2            // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
-  .byte  196,65,57,196,68,122,2,1            // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
-  .byte  196,65,57,196,4,122,0               // vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
-  .byte  233,5,255,255,255                   // jmpq          58d <_sk_lerp_565_avx+0x14>
-  .byte  244                                 // hlt
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  236                                 // in            (%dx),%al
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,228                             // jmpq          *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  220,255                             // fdivr         %st,%st(7)
-  .byte  255                                 // (bad)
-  .byte  255,212                             // callq         *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,204                             // dec           %esp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,192                             // inc           %eax
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_tables_avx
-_sk_load_tables_avx:
-  .byte  85                                  // push          %rbp
-  .byte  65,87                               // push          %r15
-  .byte  65,86                               // push          %r14
-  .byte  65,85                               // push          %r13
-  .byte  65,84                               // push          %r12
-  .byte  83                                  // push          %rbx
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,0                            // mov           (%rax),%r8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,18,2,0,0                     // jne           8ce <_sk_load_tables_avx+0x22a>
-  .byte  196,65,124,16,4,184                 // vmovups       (%r8,%rdi,4),%ymm8
-  .byte  196,98,125,24,74,16                 // vbroadcastss  0x10(%rdx),%ymm9
-  .byte  196,193,52,84,192                   // vandps        %ymm8,%ymm9,%ymm0
-  .byte  196,193,249,126,193                 // vmovq         %xmm0,%r9
-  .byte  69,137,203                          // mov           %r9d,%r11d
-  .byte  196,195,249,22,194,1                // vpextrq       $0x1,%xmm0,%r10
-  .byte  69,137,214                          // mov           %r10d,%r14d
-  .byte  73,193,234,32                       // shr           $0x20,%r10
-  .byte  73,193,233,32                       // shr           $0x20,%r9
-  .byte  196,227,125,25,192,1                // vextractf128  $0x1,%ymm0,%xmm0
-  .byte  196,193,249,126,196                 // vmovq         %xmm0,%r12
-  .byte  69,137,231                          // mov           %r12d,%r15d
-  .byte  196,227,249,22,195,1                // vpextrq       $0x1,%xmm0,%rbx
-  .byte  65,137,221                          // mov           %ebx,%r13d
-  .byte  72,193,235,32                       // shr           $0x20,%rbx
-  .byte  73,193,236,32                       // shr           $0x20,%r12
-  .byte  72,139,104,8                        // mov           0x8(%rax),%rbp
-  .byte  76,139,64,16                        // mov           0x10(%rax),%r8
-  .byte  196,161,122,16,68,189,0             // vmovss        0x0(%rbp,%r15,4),%xmm0
-  .byte  196,163,121,33,68,165,0,16          // vinsertps     $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
-  .byte  196,163,121,33,68,173,0,32          // vinsertps     $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0
-  .byte  197,250,16,76,157,0                 // vmovss        0x0(%rbp,%rbx,4),%xmm1
-  .byte  196,227,121,33,193,48               // vinsertps     $0x30,%xmm1,%xmm0,%xmm0
-  .byte  196,161,122,16,76,157,0             // vmovss        0x0(%rbp,%r11,4),%xmm1
-  .byte  196,163,113,33,76,141,0,16          // vinsertps     $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
-  .byte  196,163,113,33,76,181,0,32          // vinsertps     $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1
-  .byte  196,161,122,16,92,149,0             // vmovss        0x0(%rbp,%r10,4),%xmm3
-  .byte  196,227,113,33,203,48               // vinsertps     $0x30,%xmm3,%xmm1,%xmm1
-  .byte  196,227,117,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
-  .byte  196,193,113,114,208,8               // vpsrld        $0x8,%xmm8,%xmm1
-  .byte  196,67,125,25,194,1                 // vextractf128  $0x1,%ymm8,%xmm10
-  .byte  196,193,105,114,210,8               // vpsrld        $0x8,%xmm10,%xmm2
-  .byte  196,227,117,24,202,1                // vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
-  .byte  197,180,84,201                      // vandps        %ymm1,%ymm9,%ymm1
-  .byte  196,193,249,126,201                 // vmovq         %xmm1,%r9
-  .byte  69,137,203                          // mov           %r9d,%r11d
-  .byte  196,195,249,22,202,1                // vpextrq       $0x1,%xmm1,%r10
-  .byte  69,137,214                          // mov           %r10d,%r14d
-  .byte  73,193,234,32                       // shr           $0x20,%r10
-  .byte  73,193,233,32                       // shr           $0x20,%r9
-  .byte  196,227,125,25,201,1                // vextractf128  $0x1,%ymm1,%xmm1
-  .byte  196,225,249,126,205                 // vmovq         %xmm1,%rbp
-  .byte  65,137,239                          // mov           %ebp,%r15d
-  .byte  196,227,249,22,203,1                // vpextrq       $0x1,%xmm1,%rbx
-  .byte  65,137,220                          // mov           %ebx,%r12d
-  .byte  72,193,235,32                       // shr           $0x20,%rbx
-  .byte  72,193,237,32                       // shr           $0x20,%rbp
-  .byte  196,129,122,16,12,184               // vmovss        (%r8,%r15,4),%xmm1
-  .byte  196,195,113,33,12,168,16            // vinsertps     $0x10,(%r8,%rbp,4),%xmm1,%xmm1
-  .byte  196,129,122,16,20,160               // vmovss        (%r8,%r12,4),%xmm2
-  .byte  196,227,113,33,202,32               // vinsertps     $0x20,%xmm2,%xmm1,%xmm1
-  .byte  196,193,122,16,20,152               // vmovss        (%r8,%rbx,4),%xmm2
-  .byte  196,227,113,33,202,48               // vinsertps     $0x30,%xmm2,%xmm1,%xmm1
-  .byte  196,129,122,16,20,152               // vmovss        (%r8,%r11,4),%xmm2
-  .byte  196,131,105,33,20,136,16            // vinsertps     $0x10,(%r8,%r9,4),%xmm2,%xmm2
-  .byte  196,129,122,16,28,176               // vmovss        (%r8,%r14,4),%xmm3
-  .byte  196,227,105,33,211,32               // vinsertps     $0x20,%xmm3,%xmm2,%xmm2
-  .byte  196,129,122,16,28,144               // vmovss        (%r8,%r10,4),%xmm3
-  .byte  196,227,105,33,211,48               // vinsertps     $0x30,%xmm3,%xmm2,%xmm2
-  .byte  196,227,109,24,201,1                // vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  .byte  72,139,64,24                        // mov           0x18(%rax),%rax
-  .byte  196,193,105,114,208,16              // vpsrld        $0x10,%xmm8,%xmm2
-  .byte  196,193,97,114,210,16               // vpsrld        $0x10,%xmm10,%xmm3
-  .byte  196,227,109,24,211,1                // vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
-  .byte  197,180,84,210                      // vandps        %ymm2,%ymm9,%ymm2
-  .byte  196,193,249,126,208                 // vmovq         %xmm2,%r8
-  .byte  69,137,194                          // mov           %r8d,%r10d
-  .byte  196,195,249,22,209,1                // vpextrq       $0x1,%xmm2,%r9
-  .byte  69,137,203                          // mov           %r9d,%r11d
-  .byte  73,193,233,32                       // shr           $0x20,%r9
-  .byte  73,193,232,32                       // shr           $0x20,%r8
-  .byte  196,227,125,25,210,1                // vextractf128  $0x1,%ymm2,%xmm2
-  .byte  196,225,249,126,213                 // vmovq         %xmm2,%rbp
-  .byte  65,137,238                          // mov           %ebp,%r14d
-  .byte  196,227,249,22,211,1                // vpextrq       $0x1,%xmm2,%rbx
-  .byte  65,137,223                          // mov           %ebx,%r15d
-  .byte  72,193,235,32                       // shr           $0x20,%rbx
-  .byte  72,193,237,32                       // shr           $0x20,%rbp
-  .byte  196,161,122,16,20,176               // vmovss        (%rax,%r14,4),%xmm2
-  .byte  196,227,105,33,20,168,16            // vinsertps     $0x10,(%rax,%rbp,4),%xmm2,%xmm2
-  .byte  196,161,122,16,28,184               // vmovss        (%rax,%r15,4),%xmm3
-  .byte  196,227,105,33,211,32               // vinsertps     $0x20,%xmm3,%xmm2,%xmm2
-  .byte  197,250,16,28,152                   // vmovss        (%rax,%rbx,4),%xmm3
-  .byte  196,99,105,33,203,48                // vinsertps     $0x30,%xmm3,%xmm2,%xmm9
-  .byte  196,161,122,16,28,144               // vmovss        (%rax,%r10,4),%xmm3
-  .byte  196,163,97,33,28,128,16             // vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
-  .byte  196,161,122,16,20,152               // vmovss        (%rax,%r11,4),%xmm2
-  .byte  196,227,97,33,210,32                // vinsertps     $0x20,%xmm2,%xmm3,%xmm2
-  .byte  196,161,122,16,28,136               // vmovss        (%rax,%r9,4),%xmm3
-  .byte  196,227,105,33,211,48               // vinsertps     $0x30,%xmm3,%xmm2,%xmm2
-  .byte  196,195,109,24,209,1                // vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
-  .byte  196,193,57,114,208,24               // vpsrld        $0x18,%xmm8,%xmm8
-  .byte  196,193,97,114,210,24               // vpsrld        $0x18,%xmm10,%xmm3
-  .byte  196,227,61,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  .byte  197,252,91,219                      // vcvtdq2ps     %ymm3,%ymm3
-  .byte  196,98,125,24,66,12                 // vbroadcastss  0xc(%rdx),%ymm8
-  .byte  196,193,100,89,216                  // vmulps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  91                                  // pop           %rbx
-  .byte  65,92                               // pop           %r12
-  .byte  65,93                               // pop           %r13
-  .byte  65,94                               // pop           %r14
-  .byte  65,95                               // pop           %r15
-  .byte  93                                  // pop           %rbp
-  .byte  255,224                             // jmpq          *%rax
-  .byte  65,137,201                          // mov           %ecx,%r9d
-  .byte  65,128,225,7                        // and           $0x7,%r9b
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  65,254,201                          // dec           %r9b
-  .byte  69,15,182,201                       // movzbl        %r9b,%r9d
-  .byte  65,128,249,6                        // cmp           $0x6,%r9b
-  .byte  15,135,215,253,255,255              // ja            6c2 <_sk_load_tables_avx+0x1e>
-  .byte  76,141,21,138,0,0,0                 // lea           0x8a(%rip),%r10        # 97c <_sk_load_tables_avx+0x2d8>
-  .byte  79,99,12,138                        // movslq        (%r10,%r9,4),%r9
-  .byte  77,1,209                            // add           %r10,%r9
-  .byte  65,255,225                          // jmpq          *%r9
-  .byte  196,193,121,110,68,184,24           // vmovd         0x18(%r8,%rdi,4),%xmm0
-  .byte  197,249,112,192,68                  // vpshufd       $0x44,%xmm0,%xmm0
-  .byte  196,227,125,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  196,99,117,12,192,64                // vblendps      $0x40,%ymm0,%ymm1,%ymm8
-  .byte  196,99,125,25,192,1                 // vextractf128  $0x1,%ymm8,%xmm0
-  .byte  196,195,121,34,68,184,20,1          // vpinsrd       $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
-  .byte  196,99,61,24,192,1                  // vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
-  .byte  196,99,125,25,192,1                 // vextractf128  $0x1,%ymm8,%xmm0
-  .byte  196,195,121,34,68,184,16,0          // vpinsrd       $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
-  .byte  196,99,61,24,192,1                  // vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
-  .byte  196,195,57,34,68,184,12,3           // vpinsrd       $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
-  .byte  196,99,61,12,192,15                 // vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  .byte  196,195,57,34,68,184,8,2            // vpinsrd       $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
-  .byte  196,99,61,12,192,15                 // vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  .byte  196,195,57,34,68,184,4,1            // vpinsrd       $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
-  .byte  196,99,61,12,192,15                 // vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  .byte  196,195,57,34,4,184,0               // vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
-  .byte  196,99,61,12,192,15                 // vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  .byte  233,70,253,255,255                  // jmpq          6c2 <_sk_load_tables_avx+0x1e>
-  .byte  238                                 // out           %al,(%dx)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,224                             // jmpq          *%rax
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,210                             // callq         *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,196                             // inc           %esp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,176,255,255,255,156             // pushq         -0x63000001(%rax)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-  .byte  128,255,255                         // cmp           $0xff,%bh
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_a8_avx
-_sk_load_a8_avx:
-  .byte  73,137,200                          // mov           %rcx,%r8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,1,248                            // add           %rdi,%rax
-  .byte  77,133,192                          // test          %r8,%r8
-  .byte  117,59                              // jne           9e3 <_sk_load_a8_avx+0x4b>
-  .byte  197,251,16,0                        // vmovsd        (%rax),%xmm0
-  .byte  196,226,121,49,200                  // vpmovzxbd     %xmm0,%xmm1
-  .byte  196,227,121,4,192,229               // vpermilps     $0xe5,%xmm0,%xmm0
-  .byte  196,226,121,49,192                  // vpmovzxbd     %xmm0,%xmm0
-  .byte  196,227,117,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,226,125,24,74,12                // vbroadcastss  0xc(%rdx),%ymm1
-  .byte  197,252,89,217                      // vmulps        %ymm1,%ymm0,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,252,87,192                      // vxorps        %ymm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  197,236,87,210                      // vxorps        %ymm2,%ymm2,%ymm2
-  .byte  76,137,193                          // mov           %r8,%rcx
-  .byte  255,224                             // jmpq          *%rax
-  .byte  49,201                              // xor           %ecx,%ecx
-  .byte  77,137,194                          // mov           %r8,%r10
-  .byte  69,49,201                           // xor           %r9d,%r9d
-  .byte  68,15,182,24                        // movzbl        (%rax),%r11d
-  .byte  72,255,192                          // inc           %rax
-  .byte  73,211,227                          // shl           %cl,%r11
-  .byte  77,9,217                            // or            %r11,%r9
-  .byte  72,131,193,8                        // add           $0x8,%rcx
-  .byte  73,255,202                          // dec           %r10
-  .byte  117,234                             // jne           9eb <_sk_load_a8_avx+0x53>
-  .byte  196,193,249,110,193                 // vmovq         %r9,%xmm0
-  .byte  235,164                             // jmp           9ac <_sk_load_a8_avx+0x14>
-
-.globl _sk_store_a8_avx
-_sk_store_a8_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,8                            // mov           (%rax),%r9
-  .byte  196,98,125,24,66,8                  // vbroadcastss  0x8(%rdx),%ymm8
-  .byte  197,60,89,195                       // vmulps        %ymm3,%ymm8,%ymm8
-  .byte  196,65,125,91,192                   // vcvtps2dq     %ymm8,%ymm8
-  .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
-  .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
-  .byte  196,65,57,103,192                   // vpackuswb     %xmm8,%xmm8,%xmm8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           a3b <_sk_store_a8_avx+0x33>
-  .byte  196,65,123,17,4,57                  // vmovsd        %xmm8,(%r9,%rdi,1)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  137,200                             // mov           %ecx,%eax
-  .byte  36,7                                // and           $0x7,%al
-  .byte  254,200                             // dec           %al
-  .byte  68,15,182,192                       // movzbl        %al,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            a37 <_sk_store_a8_avx+0x2f>
-  .byte  196,66,121,48,192                   // vpmovzxbw     %xmm8,%xmm8
-  .byte  76,141,21,69,0,0,0                  // lea           0x45(%rip),%r10        # a9c <_sk_store_a8_avx+0x94>
-  .byte  75,99,4,130                         // movslq        (%r10,%r8,4),%rax
-  .byte  76,1,208                            // add           %r10,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,67,121,20,68,57,6,12            // vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,5,10            // vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,4,8             // vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,3,6             // vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,2,4             // vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
-  .byte  196,67,121,20,68,57,1,2             // vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
-  .byte  196,67,121,20,4,57,0                // vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  .byte  235,158                             // jmp           a37 <_sk_store_a8_avx+0x2f>
-  .byte  15,31,0                             // nopl          (%rax)
-  .byte  244                                 // hlt
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  236                                 // in            (%dx),%al
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,228                             // jmpq          *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  220,255                             // fdivr         %st,%st(7)
-  .byte  255                                 // (bad)
-  .byte  255,212                             // callq         *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,204                             // dec           %esp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,196                             // inc           %esp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_565_avx
-_sk_load_565_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,16                           // mov           (%rax),%r10
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,106                             // jne           b2c <_sk_load_565_avx+0x74>
-  .byte  196,193,122,111,4,122               // vmovdqu       (%r10,%rdi,2),%xmm0
-  .byte  197,241,239,201                     // vpxor         %xmm1,%xmm1,%xmm1
-  .byte  197,249,105,201                     // vpunpckhwd    %xmm1,%xmm0,%xmm1
-  .byte  196,226,121,51,192                  // vpmovzxwd     %xmm0,%xmm0
-  .byte  196,227,125,24,209,1                // vinsertf128   $0x1,%xmm1,%ymm0,%ymm2
-  .byte  196,226,125,24,66,104               // vbroadcastss  0x68(%rdx),%ymm0
-  .byte  197,252,84,194                      // vandps        %ymm2,%ymm0,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,226,125,24,74,116               // vbroadcastss  0x74(%rdx),%ymm1
-  .byte  197,244,89,192                      // vmulps        %ymm0,%ymm1,%ymm0
-  .byte  196,226,125,24,74,108               // vbroadcastss  0x6c(%rdx),%ymm1
-  .byte  197,244,84,202                      // vandps        %ymm2,%ymm1,%ymm1
-  .byte  197,252,91,201                      // vcvtdq2ps     %ymm1,%ymm1
-  .byte  196,226,125,24,90,120               // vbroadcastss  0x78(%rdx),%ymm3
-  .byte  197,228,89,201                      // vmulps        %ymm1,%ymm3,%ymm1
-  .byte  196,226,125,24,90,112               // vbroadcastss  0x70(%rdx),%ymm3
-  .byte  197,228,84,210                      // vandps        %ymm2,%ymm3,%ymm2
-  .byte  197,252,91,210                      // vcvtdq2ps     %ymm2,%ymm2
-  .byte  196,226,125,24,90,124               // vbroadcastss  0x7c(%rdx),%ymm3
-  .byte  197,228,89,210                      // vmulps        %ymm2,%ymm3,%ymm2
-  .byte  196,226,125,24,26                   // vbroadcastss  (%rdx),%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  65,137,200                          // mov           %ecx,%r8d
-  .byte  65,128,224,7                        // and           $0x7,%r8b
-  .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
-  .byte  65,254,200                          // dec           %r8b
-  .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,132                             // ja            ac8 <_sk_load_565_avx+0x10>
-  .byte  76,141,13,73,0,0,0                  // lea           0x49(%rip),%r9        # b94 <_sk_load_565_avx+0xdc>
-  .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
-  .byte  76,1,200                            // add           %r9,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,249,239,192                     // vpxor         %xmm0,%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,12,6         // vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,10,5         // vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,8,4          // vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,6,3          // vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,4,2          // vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,68,122,2,1          // vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  196,193,121,196,4,122,0             // vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  .byte  233,52,255,255,255                  // jmpq          ac8 <_sk_load_565_avx+0x10>
-  .byte  244                                 // hlt
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  236                                 // in            (%dx),%al
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,228                             // jmpq          *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  220,255                             // fdivr         %st,%st(7)
-  .byte  255                                 // (bad)
-  .byte  255,212                             // callq         *%rsp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,204                             // dec           %esp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,192                             // inc           %eax
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_store_565_avx
-_sk_store_565_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,8                            // mov           (%rax),%r9
-  .byte  196,98,125,24,130,128,0,0,0         // vbroadcastss  0x80(%rdx),%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  196,65,125,91,201                   // vcvtps2dq     %ymm9,%ymm9
-  .byte  196,193,41,114,241,11               // vpslld        $0xb,%xmm9,%xmm10
-  .byte  196,67,125,25,201,1                 // vextractf128  $0x1,%ymm9,%xmm9
-  .byte  196,193,49,114,241,11               // vpslld        $0xb,%xmm9,%xmm9
-  .byte  196,67,45,24,201,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm9
-  .byte  196,98,125,24,146,132,0,0,0         // vbroadcastss  0x84(%rdx),%ymm10
-  .byte  197,44,89,209                       // vmulps        %ymm1,%ymm10,%ymm10
-  .byte  196,65,125,91,210                   // vcvtps2dq     %ymm10,%ymm10
-  .byte  196,193,33,114,242,5                // vpslld        $0x5,%xmm10,%xmm11
-  .byte  196,67,125,25,210,1                 // vextractf128  $0x1,%ymm10,%xmm10
-  .byte  196,193,41,114,242,5                // vpslld        $0x5,%xmm10,%xmm10
-  .byte  196,67,37,24,210,1                  // vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
-  .byte  196,65,45,86,201                    // vorpd         %ymm9,%ymm10,%ymm9
-  .byte  197,60,89,194                       // vmulps        %ymm2,%ymm8,%ymm8
-  .byte  196,65,125,91,192                   // vcvtps2dq     %ymm8,%ymm8
-  .byte  196,65,53,86,192                    // vorpd         %ymm8,%ymm9,%ymm8
-  .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
-  .byte  196,66,57,43,193                    // vpackusdw     %xmm9,%xmm8,%xmm8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           c36 <_sk_store_565_avx+0x86>
-  .byte  196,65,122,127,4,121                // vmovdqu       %xmm8,(%r9,%rdi,2)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  137,200                             // mov           %ecx,%eax
-  .byte  36,7                                // and           $0x7,%al
-  .byte  254,200                             // dec           %al
-  .byte  68,15,182,192                       // movzbl        %al,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            c32 <_sk_store_565_avx+0x82>
-  .byte  76,141,21,71,0,0,0                  // lea           0x47(%rip),%r10        # c94 <_sk_store_565_avx+0xe4>
-  .byte  75,99,4,130                         // movslq        (%r10,%r8,4),%rax
-  .byte  76,1,208                            // add           %r10,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,67,121,21,68,121,12,6           // vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,10,5           // vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,8,4            // vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,6,3            // vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,4,2            // vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
-  .byte  196,67,121,21,68,121,2,1            // vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
-  .byte  197,121,126,192                     // vmovd         %xmm8,%eax
-  .byte  102,65,137,4,121                    // mov           %ax,(%r9,%rdi,2)
-  .byte  235,161                             // jmp           c32 <_sk_store_565_avx+0x82>
-  .byte  15,31,0                             // nopl          (%rax)
-  .byte  242,255                             // repnz         (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  234                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,226                             // jmpq          *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  218,255                             // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,210                             // callq         *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,202                             // dec           %edx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,194                             // inc           %edx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_8888_avx
-_sk_load_8888_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,16                           // mov           (%rax),%r10
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,125                             // jne           d37 <_sk_load_8888_avx+0x87>
-  .byte  196,65,124,16,12,186                // vmovups       (%r10,%rdi,4),%ymm9
-  .byte  196,98,125,24,90,16                 // vbroadcastss  0x10(%rdx),%ymm11
-  .byte  196,193,36,84,193                   // vandps        %ymm9,%ymm11,%ymm0
-  .byte  197,252,91,192                      // vcvtdq2ps     %ymm0,%ymm0
-  .byte  196,98,125,24,66,12                 // vbroadcastss  0xc(%rdx),%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  196,193,41,114,209,8                // vpsrld        $0x8,%xmm9,%xmm10
-  .byte  196,99,125,25,203,1                 // vextractf128  $0x1,%ymm9,%xmm3
-  .byte  197,241,114,211,8                   // vpsrld        $0x8,%xmm3,%xmm1
-  .byte  196,227,45,24,201,1                 // vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
-  .byte  197,164,84,201                      // vandps        %ymm1,%ymm11,%ymm1
-  .byte  197,252,91,201                      // vcvtdq2ps     %ymm1,%ymm1
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  196,193,41,114,209,16               // vpsrld        $0x10,%xmm9,%xmm10
-  .byte  197,233,114,211,16                  // vpsrld        $0x10,%xmm3,%xmm2
-  .byte  196,227,45,24,210,1                 // vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
-  .byte  197,164,84,210                      // vandps        %ymm2,%ymm11,%ymm2
-  .byte  197,252,91,210                      // vcvtdq2ps     %ymm2,%ymm2
-  .byte  197,188,89,210                      // vmulps        %ymm2,%ymm8,%ymm2
-  .byte  196,193,49,114,209,24               // vpsrld        $0x18,%xmm9,%xmm9
-  .byte  197,225,114,211,24                  // vpsrld        $0x18,%xmm3,%xmm3
-  .byte  196,227,53,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
-  .byte  197,252,91,219                      // vcvtdq2ps     %ymm3,%ymm3
-  .byte  196,193,100,89,216                  // vmulps        %ymm8,%ymm3,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  65,137,200                          // mov           %ecx,%r8d
-  .byte  65,128,224,7                        // and           $0x7,%r8b
-  .byte  196,65,52,87,201                    // vxorps        %ymm9,%ymm9,%ymm9
-  .byte  65,254,200                          // dec           %r8b
-  .byte  69,15,182,192                       // movzbl        %r8b,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  15,135,108,255,255,255              // ja            cc0 <_sk_load_8888_avx+0x10>
-  .byte  76,141,13,137,0,0,0                 // lea           0x89(%rip),%r9        # de4 <_sk_load_8888_avx+0x134>
-  .byte  75,99,4,129                         // movslq        (%r9,%r8,4),%rax
-  .byte  76,1,200                            // add           %r9,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,193,121,110,68,186,24           // vmovd         0x18(%r10,%rdi,4),%xmm0
-  .byte  197,249,112,192,68                  // vpshufd       $0x44,%xmm0,%xmm0
-  .byte  196,227,125,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  .byte  197,244,87,201                      // vxorps        %ymm1,%ymm1,%ymm1
-  .byte  196,99,117,12,200,64                // vblendps      $0x40,%ymm0,%ymm1,%ymm9
-  .byte  196,99,125,25,200,1                 // vextractf128  $0x1,%ymm9,%xmm0
-  .byte  196,195,121,34,68,186,20,1          // vpinsrd       $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
-  .byte  196,99,53,24,200,1                  // vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
-  .byte  196,99,125,25,200,1                 // vextractf128  $0x1,%ymm9,%xmm0
-  .byte  196,195,121,34,68,186,16,0          // vpinsrd       $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
-  .byte  196,99,53,24,200,1                  // vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
-  .byte  196,195,49,34,68,186,12,3           // vpinsrd       $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
-  .byte  196,99,53,12,200,15                 // vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  .byte  196,195,49,34,68,186,8,2            // vpinsrd       $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
-  .byte  196,99,53,12,200,15                 // vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  .byte  196,195,49,34,68,186,4,1            // vpinsrd       $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
-  .byte  196,99,53,12,200,15                 // vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  .byte  196,195,49,34,4,186,0               // vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
-  .byte  196,99,53,12,200,15                 // vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  .byte  233,220,254,255,255                 // jmpq          cc0 <_sk_load_8888_avx+0x10>
-  .byte  238                                 // out           %al,(%dx)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,224                             // jmpq          *%rax
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,210                             // callq         *%rdx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,196                             // inc           %esp
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,176,255,255,255,156             // pushq         -0x63000001(%rax)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-  .byte  128,255,255                         // cmp           $0xff,%bh
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_store_8888_avx
-_sk_store_8888_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,8                            // mov           (%rax),%r9
-  .byte  196,98,125,24,66,8                  // vbroadcastss  0x8(%rdx),%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  196,65,125,91,201                   // vcvtps2dq     %ymm9,%ymm9
-  .byte  197,60,89,209                       // vmulps        %ymm1,%ymm8,%ymm10
-  .byte  196,65,125,91,210                   // vcvtps2dq     %ymm10,%ymm10
-  .byte  196,193,33,114,242,8                // vpslld        $0x8,%xmm10,%xmm11
-  .byte  196,67,125,25,210,1                 // vextractf128  $0x1,%ymm10,%xmm10
-  .byte  196,193,41,114,242,8                // vpslld        $0x8,%xmm10,%xmm10
-  .byte  196,67,37,24,210,1                  // vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
-  .byte  196,65,45,86,201                    // vorpd         %ymm9,%ymm10,%ymm9
-  .byte  197,60,89,210                       // vmulps        %ymm2,%ymm8,%ymm10
-  .byte  196,65,125,91,210                   // vcvtps2dq     %ymm10,%ymm10
-  .byte  196,193,33,114,242,16               // vpslld        $0x10,%xmm10,%xmm11
-  .byte  196,67,125,25,210,1                 // vextractf128  $0x1,%ymm10,%xmm10
-  .byte  196,193,41,114,242,16               // vpslld        $0x10,%xmm10,%xmm10
-  .byte  196,67,37,24,210,1                  // vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
-  .byte  197,60,89,195                       // vmulps        %ymm3,%ymm8,%ymm8
-  .byte  196,65,125,91,192                   // vcvtps2dq     %ymm8,%ymm8
-  .byte  196,193,33,114,240,24               // vpslld        $0x18,%xmm8,%xmm11
-  .byte  196,67,125,25,192,1                 // vextractf128  $0x1,%ymm8,%xmm8
-  .byte  196,193,57,114,240,24               // vpslld        $0x18,%xmm8,%xmm8
-  .byte  196,67,37,24,192,1                  // vinsertf128   $0x1,%xmm8,%ymm11,%ymm8
-  .byte  196,65,45,86,192                    // vorpd         %ymm8,%ymm10,%ymm8
-  .byte  196,65,53,86,192                    // vorpd         %ymm8,%ymm9,%ymm8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,10                              // jne           e95 <_sk_store_8888_avx+0x95>
-  .byte  196,65,124,17,4,185                 // vmovups       %ymm8,(%r9,%rdi,4)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  137,200                             // mov           %ecx,%eax
-  .byte  36,7                                // and           $0x7,%al
-  .byte  254,200                             // dec           %al
-  .byte  68,15,182,192                       // movzbl        %al,%r8d
-  .byte  65,128,248,6                        // cmp           $0x6,%r8b
-  .byte  119,236                             // ja            e91 <_sk_store_8888_avx+0x91>
-  .byte  76,141,21,84,0,0,0                  // lea           0x54(%rip),%r10        # f00 <_sk_store_8888_avx+0x100>
-  .byte  75,99,4,130                         // movslq        (%r10,%r8,4),%rax
-  .byte  76,1,208                            // add           %r10,%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
-  .byte  196,67,121,22,76,185,24,2           // vpextrd       $0x2,%xmm9,0x18(%r9,%rdi,4)
-  .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
-  .byte  196,67,121,22,76,185,20,1           // vpextrd       $0x1,%xmm9,0x14(%r9,%rdi,4)
-  .byte  196,67,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm9
-  .byte  196,65,121,126,76,185,16            // vmovd         %xmm9,0x10(%r9,%rdi,4)
-  .byte  196,67,121,22,68,185,12,3           // vpextrd       $0x3,%xmm8,0xc(%r9,%rdi,4)
-  .byte  196,67,121,22,68,185,8,2            // vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
-  .byte  196,67,121,22,68,185,4,1            // vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
-  .byte  196,65,121,126,4,185                // vmovd         %xmm8,(%r9,%rdi,4)
-  .byte  235,147                             // jmp           e91 <_sk_store_8888_avx+0x91>
-  .byte  102,144                             // xchg          %ax,%ax
-  .byte  246,255                             // idiv          %bh
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  238                                 // out           %al,(%dx)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,230                             // jmpq          *%rsi
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  222,255                             // fdivrp        %st,%st(7)
-  .byte  255                                 // (bad)
-  .byte  255,209                             // callq         *%rcx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255,195                             // inc           %ebx
-  .byte  255                                 // (bad)
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-  .byte  181,255                             // mov           $0xff,%ch
-  .byte  255                                 // (bad)
-  .byte  255                                 // .byte         0xff
-
-.globl _sk_load_f16_avx
-_sk_load_f16_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  15,133,240,0,0,0                    // jne           101a <_sk_load_f16_avx+0xfe>
-  .byte  197,249,16,12,248                   // vmovupd       (%rax,%rdi,8),%xmm1
-  .byte  197,249,16,84,248,16                // vmovupd       0x10(%rax,%rdi,8),%xmm2
-  .byte  197,249,16,92,248,32                // vmovupd       0x20(%rax,%rdi,8),%xmm3
-  .byte  197,121,16,68,248,48                // vmovupd       0x30(%rax,%rdi,8),%xmm8
-  .byte  197,241,97,194                      // vpunpcklwd    %xmm2,%xmm1,%xmm0
-  .byte  197,241,105,202                     // vpunpckhwd    %xmm2,%xmm1,%xmm1
-  .byte  196,193,97,97,208                   // vpunpcklwd    %xmm8,%xmm3,%xmm2
-  .byte  196,193,97,105,216                  // vpunpckhwd    %xmm8,%xmm3,%xmm3
-  .byte  197,121,97,193                      // vpunpcklwd    %xmm1,%xmm0,%xmm8
-  .byte  197,249,105,193                     // vpunpckhwd    %xmm1,%xmm0,%xmm0
-  .byte  197,233,97,203                      // vpunpcklwd    %xmm3,%xmm2,%xmm1
-  .byte  197,105,105,203                     // vpunpckhwd    %xmm3,%xmm2,%xmm9
-  .byte  197,249,110,90,100                  // vmovd         0x64(%rdx),%xmm3
-  .byte  197,249,112,219,0                   // vpshufd       $0x0,%xmm3,%xmm3
-  .byte  196,193,97,101,208                  // vpcmpgtw      %xmm8,%xmm3,%xmm2
-  .byte  196,65,105,223,192                  // vpandn        %xmm8,%xmm2,%xmm8
-  .byte  197,225,101,208                     // vpcmpgtw      %xmm0,%xmm3,%xmm2
-  .byte  197,233,223,192                     // vpandn        %xmm0,%xmm2,%xmm0
-  .byte  197,225,101,209                     // vpcmpgtw      %xmm1,%xmm3,%xmm2
-  .byte  197,233,223,201                     // vpandn        %xmm1,%xmm2,%xmm1
-  .byte  196,193,97,101,209                  // vpcmpgtw      %xmm9,%xmm3,%xmm2
-  .byte  196,193,105,223,209                 // vpandn        %xmm9,%xmm2,%xmm2
-  .byte  196,66,121,51,208                   // vpmovzxwd     %xmm8,%xmm10
-  .byte  196,98,121,51,201                   // vpmovzxwd     %xmm1,%xmm9
-  .byte  197,225,239,219                     // vpxor         %xmm3,%xmm3,%xmm3
-  .byte  197,57,105,195                      // vpunpckhwd    %xmm3,%xmm8,%xmm8
-  .byte  197,241,105,203                     // vpunpckhwd    %xmm3,%xmm1,%xmm1
-  .byte  196,98,121,51,216                   // vpmovzxwd     %xmm0,%xmm11
-  .byte  196,98,121,51,226                   // vpmovzxwd     %xmm2,%xmm12
-  .byte  197,121,105,235                     // vpunpckhwd    %xmm3,%xmm0,%xmm13
-  .byte  197,105,105,243                     // vpunpckhwd    %xmm3,%xmm2,%xmm14
-  .byte  196,193,121,114,242,13              // vpslld        $0xd,%xmm10,%xmm0
-  .byte  196,193,105,114,241,13              // vpslld        $0xd,%xmm9,%xmm2
-  .byte  196,227,125,24,194,1                // vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
-  .byte  196,98,125,24,74,92                 // vbroadcastss  0x5c(%rdx),%ymm9
-  .byte  197,180,89,192                      // vmulps        %ymm0,%ymm9,%ymm0
-  .byte  196,193,105,114,240,13              // vpslld        $0xd,%xmm8,%xmm2
-  .byte  197,241,114,241,13                  // vpslld        $0xd,%xmm1,%xmm1
-  .byte  196,227,109,24,201,1                // vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  .byte  197,180,89,201                      // vmulps        %ymm1,%ymm9,%ymm1
-  .byte  196,193,105,114,243,13              // vpslld        $0xd,%xmm11,%xmm2
-  .byte  196,193,97,114,244,13               // vpslld        $0xd,%xmm12,%xmm3
-  .byte  196,227,109,24,211,1                // vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
-  .byte  197,180,89,210                      // vmulps        %ymm2,%ymm9,%ymm2
-  .byte  196,193,57,114,245,13               // vpslld        $0xd,%xmm13,%xmm8
-  .byte  196,193,97,114,246,13               // vpslld        $0xd,%xmm14,%xmm3
-  .byte  196,227,61,24,219,1                 // vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  .byte  197,180,89,219                      // vmulps        %ymm3,%ymm9,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,251,16,12,248                   // vmovsd        (%rax,%rdi,8),%xmm1
-  .byte  196,65,57,87,192                    // vxorpd        %xmm8,%xmm8,%xmm8
-  .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  117,6                               // jne           1030 <_sk_load_f16_avx+0x114>
-  .byte  197,250,126,201                     // vmovq         %xmm1,%xmm1
-  .byte  235,30                              // jmp           104e <_sk_load_f16_avx+0x132>
-  .byte  197,241,22,76,248,8                 // vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,18                              // jb            104e <_sk_load_f16_avx+0x132>
-  .byte  197,251,16,84,248,16                // vmovsd        0x10(%rax,%rdi,8),%xmm2
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  117,19                              // jne           105b <_sk_load_f16_avx+0x13f>
-  .byte  197,250,126,210                     // vmovq         %xmm2,%xmm2
-  .byte  235,46                              // jmp           107c <_sk_load_f16_avx+0x160>
-  .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  197,233,87,210                      // vxorpd        %xmm2,%xmm2,%xmm2
-  .byte  233,230,254,255,255                 // jmpq          f41 <_sk_load_f16_avx+0x25>
-  .byte  197,233,22,84,248,24                // vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,21                              // jb            107c <_sk_load_f16_avx+0x160>
-  .byte  197,251,16,92,248,32                // vmovsd        0x20(%rax,%rdi,8),%xmm3
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  117,18                              // jne           1085 <_sk_load_f16_avx+0x169>
-  .byte  197,250,126,219                     // vmovq         %xmm3,%xmm3
-  .byte  233,197,254,255,255                 // jmpq          f41 <_sk_load_f16_avx+0x25>
-  .byte  197,225,87,219                      // vxorpd        %xmm3,%xmm3,%xmm3
-  .byte  233,188,254,255,255                 // jmpq          f41 <_sk_load_f16_avx+0x25>
-  .byte  197,225,22,92,248,40                // vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
-  .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  15,130,172,254,255,255              // jb            f41 <_sk_load_f16_avx+0x25>
-  .byte  197,123,16,68,248,48                // vmovsd        0x30(%rax,%rdi,8),%xmm8
-  .byte  233,161,254,255,255                 // jmpq          f41 <_sk_load_f16_avx+0x25>
-
-.globl _sk_store_f16_avx
-_sk_store_f16_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  196,98,125,24,66,96                 // vbroadcastss  0x60(%rdx),%ymm8
-  .byte  197,60,89,200                       // vmulps        %ymm0,%ymm8,%ymm9
-  .byte  196,67,125,25,202,1                 // vextractf128  $0x1,%ymm9,%xmm10
-  .byte  196,193,41,114,210,13               // vpsrld        $0xd,%xmm10,%xmm10
-  .byte  196,193,49,114,209,13               // vpsrld        $0xd,%xmm9,%xmm9
-  .byte  197,60,89,217                       // vmulps        %ymm1,%ymm8,%ymm11
-  .byte  196,67,125,25,220,1                 // vextractf128  $0x1,%ymm11,%xmm12
-  .byte  196,193,25,114,212,13               // vpsrld        $0xd,%xmm12,%xmm12
-  .byte  196,193,33,114,211,13               // vpsrld        $0xd,%xmm11,%xmm11
-  .byte  197,60,89,234                       // vmulps        %ymm2,%ymm8,%ymm13
-  .byte  196,67,125,25,238,1                 // vextractf128  $0x1,%ymm13,%xmm14
-  .byte  196,193,9,114,214,13                // vpsrld        $0xd,%xmm14,%xmm14
-  .byte  196,193,17,114,213,13               // vpsrld        $0xd,%xmm13,%xmm13
-  .byte  197,60,89,195                       // vmulps        %ymm3,%ymm8,%ymm8
-  .byte  196,67,125,25,199,1                 // vextractf128  $0x1,%ymm8,%xmm15
-  .byte  196,193,1,114,215,13                // vpsrld        $0xd,%xmm15,%xmm15
-  .byte  196,193,57,114,208,13               // vpsrld        $0xd,%xmm8,%xmm8
-  .byte  196,193,33,115,251,2                // vpslldq       $0x2,%xmm11,%xmm11
-  .byte  196,65,33,235,201                   // vpor          %xmm9,%xmm11,%xmm9
-  .byte  196,193,33,115,252,2                // vpslldq       $0x2,%xmm12,%xmm11
-  .byte  196,65,33,235,226                   // vpor          %xmm10,%xmm11,%xmm12
-  .byte  196,193,57,115,248,2                // vpslldq       $0x2,%xmm8,%xmm8
-  .byte  196,65,57,235,197                   // vpor          %xmm13,%xmm8,%xmm8
-  .byte  196,193,41,115,255,2                // vpslldq       $0x2,%xmm15,%xmm10
-  .byte  196,65,41,235,238                   // vpor          %xmm14,%xmm10,%xmm13
-  .byte  196,65,49,98,216                    // vpunpckldq    %xmm8,%xmm9,%xmm11
-  .byte  196,65,49,106,208                   // vpunpckhdq    %xmm8,%xmm9,%xmm10
-  .byte  196,65,25,98,205                    // vpunpckldq    %xmm13,%xmm12,%xmm9
-  .byte  196,65,25,106,197                   // vpunpckhdq    %xmm13,%xmm12,%xmm8
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,27                              // jne           1163 <_sk_store_f16_avx+0xc3>
-  .byte  197,120,17,28,248                   // vmovups       %xmm11,(%rax,%rdi,8)
-  .byte  197,120,17,84,248,16                // vmovups       %xmm10,0x10(%rax,%rdi,8)
-  .byte  197,120,17,76,248,32                // vmovups       %xmm9,0x20(%rax,%rdi,8)
-  .byte  197,122,127,68,248,48               // vmovdqu       %xmm8,0x30(%rax,%rdi,8)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  197,121,214,28,248                  // vmovq         %xmm11,(%rax,%rdi,8)
-  .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,241                             // je            115f <_sk_store_f16_avx+0xbf>
-  .byte  197,121,23,92,248,8                 // vmovhpd       %xmm11,0x8(%rax,%rdi,8)
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,229                             // jb            115f <_sk_store_f16_avx+0xbf>
-  .byte  197,121,214,84,248,16               // vmovq         %xmm10,0x10(%rax,%rdi,8)
-  .byte  116,221                             // je            115f <_sk_store_f16_avx+0xbf>
-  .byte  197,121,23,84,248,24                // vmovhpd       %xmm10,0x18(%rax,%rdi,8)
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,209                             // jb            115f <_sk_store_f16_avx+0xbf>
-  .byte  197,121,214,76,248,32               // vmovq         %xmm9,0x20(%rax,%rdi,8)
-  .byte  116,201                             // je            115f <_sk_store_f16_avx+0xbf>
-  .byte  197,121,23,76,248,40                // vmovhpd       %xmm9,0x28(%rax,%rdi,8)
-  .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,189                             // jb            115f <_sk_store_f16_avx+0xbf>
-  .byte  197,121,214,68,248,48               // vmovq         %xmm8,0x30(%rax,%rdi,8)
-  .byte  235,181                             // jmp           115f <_sk_store_f16_avx+0xbf>
-
-.globl _sk_store_f32_avx
-_sk_store_f32_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  76,139,0                            // mov           (%rax),%r8
-  .byte  72,141,4,189,0,0,0,0                // lea           0x0(,%rdi,4),%rax
-  .byte  197,124,20,193                      // vunpcklps     %ymm1,%ymm0,%ymm8
-  .byte  197,124,21,217                      // vunpckhps     %ymm1,%ymm0,%ymm11
-  .byte  197,108,20,203                      // vunpcklps     %ymm3,%ymm2,%ymm9
-  .byte  197,108,21,227                      // vunpckhps     %ymm3,%ymm2,%ymm12
-  .byte  196,65,61,20,209                    // vunpcklpd     %ymm9,%ymm8,%ymm10
-  .byte  196,65,61,21,201                    // vunpckhpd     %ymm9,%ymm8,%ymm9
-  .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
-  .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
-  .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           1217 <_sk_store_f32_avx+0x6d>
-  .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
-  .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
-  .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
-  .byte  196,67,61,6,195,49                  // vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
-  .byte  196,65,125,17,36,128                // vmovupd       %ymm12,(%r8,%rax,4)
-  .byte  196,65,125,17,108,128,32            // vmovupd       %ymm13,0x20(%r8,%rax,4)
-  .byte  196,65,125,17,76,128,64             // vmovupd       %ymm9,0x40(%r8,%rax,4)
-  .byte  196,65,125,17,68,128,96             // vmovupd       %ymm8,0x60(%r8,%rax,4)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-  .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
-  .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            1213 <_sk_store_f32_avx+0x69>
-  .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
-  .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            1213 <_sk_store_f32_avx+0x69>
-  .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            1213 <_sk_store_f32_avx+0x69>
-  .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
-  .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            1213 <_sk_store_f32_avx+0x69>
-  .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            1213 <_sk_store_f32_avx+0x69>
-  .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
-  .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            1213 <_sk_store_f32_avx+0x69>
-  .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           1213 <_sk_store_f32_avx+0x69>
-
-.globl _sk_clamp_x_avx
-_sk_clamp_x_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,60,95,200                       // vmaxps        %ymm0,%ymm8,%ymm9
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,99,125,25,192,1                 // vextractf128  $0x1,%ymm8,%xmm0
-  .byte  196,65,41,118,210                   // vpcmpeqd      %xmm10,%xmm10,%xmm10
-  .byte  196,193,121,254,194                 // vpaddd        %xmm10,%xmm0,%xmm0
-  .byte  196,65,57,254,194                   // vpaddd        %xmm10,%xmm8,%xmm8
-  .byte  196,227,61,24,192,1                 // vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
-  .byte  197,180,93,192                      // vminps        %ymm0,%ymm9,%ymm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_y_avx
-_sk_clamp_y_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,60,95,201                       // vmaxps        %ymm1,%ymm8,%ymm9
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,99,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm1
-  .byte  196,65,41,118,210                   // vpcmpeqd      %xmm10,%xmm10,%xmm10
-  .byte  196,193,113,254,202                 // vpaddd        %xmm10,%xmm1,%xmm1
-  .byte  196,65,57,254,194                   // vpaddd        %xmm10,%xmm8,%xmm8
-  .byte  196,227,61,24,201,1                 // vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
-  .byte  197,180,93,201                      // vminps        %ymm1,%ymm9,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_x_avx
-_sk_repeat_x_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,65,124,94,200                   // vdivps        %ymm8,%ymm0,%ymm9
-  .byte  196,67,125,8,201,1                  // vroundps      $0x1,%ymm9,%ymm9
-  .byte  196,65,52,89,200                    // vmulps        %ymm8,%ymm9,%ymm9
-  .byte  196,65,124,92,201                   // vsubps        %ymm9,%ymm0,%ymm9
-  .byte  196,99,125,25,192,1                 // vextractf128  $0x1,%ymm8,%xmm0
-  .byte  196,65,41,118,210                   // vpcmpeqd      %xmm10,%xmm10,%xmm10
-  .byte  196,193,121,254,194                 // vpaddd        %xmm10,%xmm0,%xmm0
-  .byte  196,65,57,254,194                   // vpaddd        %xmm10,%xmm8,%xmm8
-  .byte  196,227,61,24,192,1                 // vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
-  .byte  197,180,93,192                      // vminps        %ymm0,%ymm9,%ymm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_y_avx
-_sk_repeat_y_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,65,116,94,200                   // vdivps        %ymm8,%ymm1,%ymm9
-  .byte  196,67,125,8,201,1                  // vroundps      $0x1,%ymm9,%ymm9
-  .byte  196,65,52,89,200                    // vmulps        %ymm8,%ymm9,%ymm9
-  .byte  196,65,116,92,201                   // vsubps        %ymm9,%ymm1,%ymm9
-  .byte  196,99,125,25,193,1                 // vextractf128  $0x1,%ymm8,%xmm1
-  .byte  196,65,41,118,210                   // vpcmpeqd      %xmm10,%xmm10,%xmm10
-  .byte  196,193,113,254,202                 // vpaddd        %xmm10,%xmm1,%xmm1
-  .byte  196,65,57,254,194                   // vpaddd        %xmm10,%xmm8,%xmm8
-  .byte  196,227,61,24,201,1                 // vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
-  .byte  197,180,93,201                      // vminps        %ymm1,%ymm9,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_x_avx
-_sk_mirror_x_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,122,16,0                        // vmovss        (%rax),%xmm8
-  .byte  196,65,121,112,200,0                // vpshufd       $0x0,%xmm8,%xmm9
-  .byte  196,67,53,24,201,1                  // vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
-  .byte  196,65,124,92,209                   // vsubps        %ymm9,%ymm0,%ymm10
-  .byte  196,193,58,88,192                   // vaddss        %xmm8,%xmm8,%xmm0
-  .byte  196,227,121,4,192,0                 // vpermilps     $0x0,%xmm0,%xmm0
-  .byte  196,227,125,24,192,1                // vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  .byte  197,44,94,192                       // vdivps        %ymm0,%ymm10,%ymm8
-  .byte  196,67,125,8,192,1                  // vroundps      $0x1,%ymm8,%ymm8
-  .byte  197,188,89,192                      // vmulps        %ymm0,%ymm8,%ymm0
-  .byte  197,172,92,192                      // vsubps        %ymm0,%ymm10,%ymm0
-  .byte  196,193,124,92,193                  // vsubps        %ymm9,%ymm0,%ymm0
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,60,92,192                       // vsubps        %ymm0,%ymm8,%ymm8
-  .byte  197,60,84,192                       // vandps        %ymm0,%ymm8,%ymm8
-  .byte  196,99,125,25,200,1                 // vextractf128  $0x1,%ymm9,%xmm0
-  .byte  196,65,41,118,210                   // vpcmpeqd      %xmm10,%xmm10,%xmm10
-  .byte  196,193,121,254,194                 // vpaddd        %xmm10,%xmm0,%xmm0
-  .byte  196,65,49,254,202                   // vpaddd        %xmm10,%xmm9,%xmm9
-  .byte  196,227,53,24,192,1                 // vinsertf128   $0x1,%xmm0,%ymm9,%ymm0
-  .byte  197,188,93,192                      // vminps        %ymm0,%ymm8,%ymm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_y_avx
-_sk_mirror_y_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,122,16,0                        // vmovss        (%rax),%xmm8
-  .byte  196,65,121,112,200,0                // vpshufd       $0x0,%xmm8,%xmm9
-  .byte  196,67,53,24,201,1                  // vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
-  .byte  196,65,116,92,209                   // vsubps        %ymm9,%ymm1,%ymm10
-  .byte  196,193,58,88,200                   // vaddss        %xmm8,%xmm8,%xmm1
-  .byte  196,227,121,4,201,0                 // vpermilps     $0x0,%xmm1,%xmm1
-  .byte  196,227,117,24,201,1                // vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
-  .byte  197,44,94,193                       // vdivps        %ymm1,%ymm10,%ymm8
-  .byte  196,67,125,8,192,1                  // vroundps      $0x1,%ymm8,%ymm8
-  .byte  197,188,89,201                      // vmulps        %ymm1,%ymm8,%ymm1
-  .byte  197,172,92,201                      // vsubps        %ymm1,%ymm10,%ymm1
-  .byte  196,193,116,92,201                  // vsubps        %ymm9,%ymm1,%ymm1
-  .byte  196,65,60,87,192                    // vxorps        %ymm8,%ymm8,%ymm8
-  .byte  197,60,92,193                       // vsubps        %ymm1,%ymm8,%ymm8
-  .byte  197,60,84,193                       // vandps        %ymm1,%ymm8,%ymm8
-  .byte  196,99,125,25,201,1                 // vextractf128  $0x1,%ymm9,%xmm1
-  .byte  196,65,41,118,210                   // vpcmpeqd      %xmm10,%xmm10,%xmm10
-  .byte  196,193,113,254,202                 // vpaddd        %xmm10,%xmm1,%xmm1
-  .byte  196,65,49,254,202                   // vpaddd        %xmm10,%xmm9,%xmm9
-  .byte  196,227,53,24,201,1                 // vinsertf128   $0x1,%xmm1,%ymm9,%ymm1
-  .byte  197,188,93,201                      // vminps        %ymm1,%ymm8,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_2x3_avx
-_sk_matrix_2x3_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,98,125,24,72,8                  // vbroadcastss  0x8(%rax),%ymm9
-  .byte  196,98,125,24,80,16                 // vbroadcastss  0x10(%rax),%ymm10
-  .byte  197,52,89,201                       // vmulps        %ymm1,%ymm9,%ymm9
-  .byte  196,65,52,88,202                    // vaddps        %ymm10,%ymm9,%ymm9
-  .byte  197,60,89,192                       // vmulps        %ymm0,%ymm8,%ymm8
-  .byte  196,65,60,88,193                    // vaddps        %ymm9,%ymm8,%ymm8
-  .byte  196,98,125,24,72,4                  // vbroadcastss  0x4(%rax),%ymm9
-  .byte  196,98,125,24,80,12                 // vbroadcastss  0xc(%rax),%ymm10
-  .byte  196,98,125,24,88,20                 // vbroadcastss  0x14(%rax),%ymm11
-  .byte  197,172,89,201                      // vmulps        %ymm1,%ymm10,%ymm1
-  .byte  196,193,116,88,203                  // vaddps        %ymm11,%ymm1,%ymm1
-  .byte  197,180,89,192                      // vmulps        %ymm0,%ymm9,%ymm0
-  .byte  197,252,88,201                      // vaddps        %ymm1,%ymm0,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,124,41,192                      // vmovaps       %ymm8,%ymm0
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_3x4_avx
-_sk_matrix_3x4_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,98,125,24,72,12                 // vbroadcastss  0xc(%rax),%ymm9
-  .byte  196,98,125,24,80,24                 // vbroadcastss  0x18(%rax),%ymm10
-  .byte  196,98,125,24,88,36                 // vbroadcastss  0x24(%rax),%ymm11
-  .byte  197,44,89,210                       // vmulps        %ymm2,%ymm10,%ymm10
-  .byte  196,65,44,88,211                    // vaddps        %ymm11,%ymm10,%ymm10
-  .byte  197,52,89,201                       // vmulps        %ymm1,%ymm9,%ymm9
-  .byte  196,65,52,88,202                    // vaddps        %ymm10,%ymm9,%ymm9
-  .byte  197,60,89,192                       // vmulps        %ymm0,%ymm8,%ymm8
-  .byte  196,65,60,88,193                    // vaddps        %ymm9,%ymm8,%ymm8
-  .byte  196,98,125,24,72,4                  // vbroadcastss  0x4(%rax),%ymm9
-  .byte  196,98,125,24,80,16                 // vbroadcastss  0x10(%rax),%ymm10
-  .byte  196,98,125,24,88,28                 // vbroadcastss  0x1c(%rax),%ymm11
-  .byte  196,98,125,24,96,40                 // vbroadcastss  0x28(%rax),%ymm12
-  .byte  197,36,89,218                       // vmulps        %ymm2,%ymm11,%ymm11
-  .byte  196,65,36,88,220                    // vaddps        %ymm12,%ymm11,%ymm11
-  .byte  197,44,89,209                       // vmulps        %ymm1,%ymm10,%ymm10
-  .byte  196,65,44,88,211                    // vaddps        %ymm11,%ymm10,%ymm10
-  .byte  197,52,89,200                       // vmulps        %ymm0,%ymm9,%ymm9
-  .byte  196,65,52,88,202                    // vaddps        %ymm10,%ymm9,%ymm9
-  .byte  196,98,125,24,80,8                  // vbroadcastss  0x8(%rax),%ymm10
-  .byte  196,98,125,24,88,20                 // vbroadcastss  0x14(%rax),%ymm11
-  .byte  196,98,125,24,96,32                 // vbroadcastss  0x20(%rax),%ymm12
-  .byte  196,98,125,24,104,44                // vbroadcastss  0x2c(%rax),%ymm13
-  .byte  197,156,89,210                      // vmulps        %ymm2,%ymm12,%ymm2
-  .byte  196,193,108,88,213                  // vaddps        %ymm13,%ymm2,%ymm2
-  .byte  197,164,89,201                      // vmulps        %ymm1,%ymm11,%ymm1
-  .byte  197,244,88,202                      // vaddps        %ymm2,%ymm1,%ymm1
-  .byte  197,172,89,192                      // vmulps        %ymm0,%ymm10,%ymm0
-  .byte  197,252,88,209                      // vaddps        %ymm1,%ymm0,%ymm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,124,41,192                      // vmovaps       %ymm8,%ymm0
-  .byte  197,124,41,201                      // vmovaps       %ymm9,%ymm1
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_perspective_avx
-_sk_matrix_perspective_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,98,125,24,0                     // vbroadcastss  (%rax),%ymm8
-  .byte  196,98,125,24,72,4                  // vbroadcastss  0x4(%rax),%ymm9
-  .byte  196,98,125,24,80,8                  // vbroadcastss  0x8(%rax),%ymm10
-  .byte  197,52,89,201                       // vmulps        %ymm1,%ymm9,%ymm9
-  .byte  196,65,52,88,202                    // vaddps        %ymm10,%ymm9,%ymm9
-  .byte  197,60,89,192                       // vmulps        %ymm0,%ymm8,%ymm8
-  .byte  196,65,60,88,193                    // vaddps        %ymm9,%ymm8,%ymm8
-  .byte  196,98,125,24,72,12                 // vbroadcastss  0xc(%rax),%ymm9
-  .byte  196,98,125,24,80,16                 // vbroadcastss  0x10(%rax),%ymm10
-  .byte  196,98,125,24,88,20                 // vbroadcastss  0x14(%rax),%ymm11
-  .byte  197,44,89,209                       // vmulps        %ymm1,%ymm10,%ymm10
-  .byte  196,65,44,88,211                    // vaddps        %ymm11,%ymm10,%ymm10
-  .byte  197,52,89,200                       // vmulps        %ymm0,%ymm9,%ymm9
-  .byte  196,65,52,88,202                    // vaddps        %ymm10,%ymm9,%ymm9
-  .byte  196,98,125,24,80,24                 // vbroadcastss  0x18(%rax),%ymm10
-  .byte  196,98,125,24,88,28                 // vbroadcastss  0x1c(%rax),%ymm11
-  .byte  196,98,125,24,96,32                 // vbroadcastss  0x20(%rax),%ymm12
-  .byte  197,164,89,201                      // vmulps        %ymm1,%ymm11,%ymm1
-  .byte  196,193,116,88,204                  // vaddps        %ymm12,%ymm1,%ymm1
-  .byte  197,172,89,192                      // vmulps        %ymm0,%ymm10,%ymm0
-  .byte  197,252,88,193                      // vaddps        %ymm1,%ymm0,%ymm0
-  .byte  197,252,83,200                      // vrcpps        %ymm0,%ymm1
-  .byte  197,188,89,193                      // vmulps        %ymm1,%ymm8,%ymm0
-  .byte  197,180,89,201                      // vmulps        %ymm1,%ymm9,%ymm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_linear_gradient_2stops_avx
-_sk_linear_gradient_2stops_avx:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  196,226,125,24,72,16                // vbroadcastss  0x10(%rax),%ymm1
-  .byte  196,226,125,24,16                   // vbroadcastss  (%rax),%ymm2
-  .byte  197,244,89,200                      // vmulps        %ymm0,%ymm1,%ymm1
-  .byte  197,108,88,193                      // vaddps        %ymm1,%ymm2,%ymm8
-  .byte  196,226,125,24,72,20                // vbroadcastss  0x14(%rax),%ymm1
-  .byte  196,226,125,24,80,4                 // vbroadcastss  0x4(%rax),%ymm2
-  .byte  197,244,89,200                      // vmulps        %ymm0,%ymm1,%ymm1
-  .byte  197,236,88,201                      // vaddps        %ymm1,%ymm2,%ymm1
-  .byte  196,226,125,24,80,24                // vbroadcastss  0x18(%rax),%ymm2
-  .byte  196,226,125,24,88,8                 // vbroadcastss  0x8(%rax),%ymm3
-  .byte  197,236,89,208                      // vmulps        %ymm0,%ymm2,%ymm2
-  .byte  197,228,88,210                      // vaddps        %ymm2,%ymm3,%ymm2
-  .byte  196,226,125,24,88,28                // vbroadcastss  0x1c(%rax),%ymm3
-  .byte  196,98,125,24,72,12                 // vbroadcastss  0xc(%rax),%ymm9
-  .byte  197,228,89,192                      // vmulps        %ymm0,%ymm3,%ymm0
-  .byte  197,180,88,216                      // vaddps        %ymm0,%ymm9,%ymm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  197,124,41,192                      // vmovaps       %ymm8,%ymm0
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_start_pipeline_sse41
-_sk_start_pipeline_sse41:
-  .byte  65,87                               // push          %r15
-  .byte  65,86                               // push          %r14
-  .byte  65,85                               // push          %r13
-  .byte  65,84                               // push          %r12
-  .byte  83                                  // push          %rbx
-  .byte  73,137,207                          // mov           %rcx,%r15
-  .byte  73,137,214                          // mov           %rdx,%r14
-  .byte  72,137,251                          // mov           %rdi,%rbx
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  73,137,196                          // mov           %rax,%r12
-  .byte  73,137,245                          // mov           %rsi,%r13
-  .byte  72,141,67,4                         // lea           0x4(%rbx),%rax
-  .byte  76,57,248                           // cmp           %r15,%rax
-  .byte  118,5                               // jbe           28 <_sk_start_pipeline_sse41+0x28>
-  .byte  72,137,216                          // mov           %rbx,%rax
-  .byte  235,52                              // jmp           5c <_sk_start_pipeline_sse41+0x5c>
-  .byte  15,87,192                           // xorps         %xmm0,%xmm0
-  .byte  15,87,201                           // xorps         %xmm1,%xmm1
-  .byte  15,87,210                           // xorps         %xmm2,%xmm2
-  .byte  15,87,219                           // xorps         %xmm3,%xmm3
-  .byte  15,87,228                           // xorps         %xmm4,%xmm4
-  .byte  15,87,237                           // xorps         %xmm5,%xmm5
-  .byte  15,87,246                           // xorps         %xmm6,%xmm6
-  .byte  15,87,255                           // xorps         %xmm7,%xmm7
-  .byte  72,137,223                          // mov           %rbx,%rdi
-  .byte  76,137,238                          // mov           %r13,%rsi
-  .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  65,255,212                          // callq         *%r12
-  .byte  72,141,67,4                         // lea           0x4(%rbx),%rax
-  .byte  72,131,195,8                        // add           $0x8,%rbx
-  .byte  76,57,251                           // cmp           %r15,%rbx
-  .byte  72,137,195                          // mov           %rax,%rbx
-  .byte  118,204                             // jbe           28 <_sk_start_pipeline_sse41+0x28>
-  .byte  91                                  // pop           %rbx
-  .byte  65,92                               // pop           %r12
-  .byte  65,93                               // pop           %r13
-  .byte  65,94                               // pop           %r14
-  .byte  65,95                               // pop           %r15
-  .byte  195                                 // retq
-
-.globl _sk_just_return_sse41
-_sk_just_return_sse41:
-  .byte  195                                 // retq
-
-.globl _sk_seed_shader_sse41
-_sk_seed_shader_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  102,15,110,199                      // movd          %edi,%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  15,91,200                           // cvtdq2ps      %xmm0,%xmm1
-  .byte  243,15,16,18                        // movss         (%rdx),%xmm2
-  .byte  243,15,16,90,4                      // movss         0x4(%rdx),%xmm3
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  15,88,203                           // addps         %xmm3,%xmm1
-  .byte  15,16,66,20                         // movups        0x14(%rdx),%xmm0
-  .byte  15,88,193                           // addps         %xmm1,%xmm0
-  .byte  102,15,110,8                        // movd          (%rax),%xmm1
-  .byte  102,15,112,201,0                    // pshufd        $0x0,%xmm1,%xmm1
-  .byte  15,91,201                           // cvtdq2ps      %xmm1,%xmm1
-  .byte  15,88,203                           // addps         %xmm3,%xmm1
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,87,219                           // xorps         %xmm3,%xmm3
-  .byte  15,87,228                           // xorps         %xmm4,%xmm4
-  .byte  15,87,237                           // xorps         %xmm5,%xmm5
-  .byte  15,87,246                           // xorps         %xmm6,%xmm6
-  .byte  15,87,255                           // xorps         %xmm7,%xmm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_constant_color_sse41
-_sk_constant_color_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,16,24                            // movups        (%rax),%xmm3
-  .byte  15,40,195                           // movaps        %xmm3,%xmm0
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  15,40,203                           // movaps        %xmm3,%xmm1
-  .byte  15,198,201,85                       // shufps        $0x55,%xmm1,%xmm1
-  .byte  15,40,211                           // movaps        %xmm3,%xmm2
-  .byte  15,198,210,170                      // shufps        $0xaa,%xmm2,%xmm2
-  .byte  15,198,219,255                      // shufps        $0xff,%xmm3,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clear_sse41
-_sk_clear_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,87,192                           // xorps         %xmm0,%xmm0
-  .byte  15,87,201                           // xorps         %xmm1,%xmm1
-  .byte  15,87,210                           // xorps         %xmm2,%xmm2
-  .byte  15,87,219                           // xorps         %xmm3,%xmm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_plus__sse41
-_sk_plus__sse41:
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_srcover_sse41
-_sk_srcover_sse41:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,92,195                        // subps         %xmm3,%xmm8
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,204                        // mulps         %xmm4,%xmm9
-  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,205                        // mulps         %xmm5,%xmm9
-  .byte  65,15,88,201                        // addps         %xmm9,%xmm1
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,206                        // mulps         %xmm6,%xmm9
-  .byte  65,15,88,209                        // addps         %xmm9,%xmm2
-  .byte  68,15,89,199                        // mulps         %xmm7,%xmm8
-  .byte  65,15,88,216                        // addps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_dstover_sse41
-_sk_dstover_sse41:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,92,199                        // subps         %xmm7,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_0_sse41
-_sk_clamp_0_sse41:
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  65,15,95,192                        // maxps         %xmm8,%xmm0
-  .byte  65,15,95,200                        // maxps         %xmm8,%xmm1
-  .byte  65,15,95,208                        // maxps         %xmm8,%xmm2
-  .byte  65,15,95,216                        // maxps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_1_sse41
-_sk_clamp_1_sse41:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,93,192                        // minps         %xmm8,%xmm0
-  .byte  65,15,93,200                        // minps         %xmm8,%xmm1
-  .byte  65,15,93,208                        // minps         %xmm8,%xmm2
-  .byte  65,15,93,216                        // minps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_a_sse41
-_sk_clamp_a_sse41:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,93,216                        // minps         %xmm8,%xmm3
-  .byte  15,93,195                           // minps         %xmm3,%xmm0
-  .byte  15,93,203                           // minps         %xmm3,%xmm1
-  .byte  15,93,211                           // minps         %xmm3,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_set_rgb_sse41
-_sk_set_rgb_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,15,16,72,4                      // movss         0x4(%rax),%xmm1
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  243,15,16,80,8                      // movss         0x8(%rax),%xmm2
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_rb_sse41
-_sk_swap_rb_sse41:
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,194                           // movaps        %xmm2,%xmm0
-  .byte  65,15,40,208                        // movaps        %xmm8,%xmm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_sse41
-_sk_swap_sse41:
-  .byte  68,15,40,195                        // movaps        %xmm3,%xmm8
-  .byte  68,15,40,202                        // movaps        %xmm2,%xmm9
-  .byte  68,15,40,209                        // movaps        %xmm1,%xmm10
-  .byte  68,15,40,216                        // movaps        %xmm0,%xmm11
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,196                           // movaps        %xmm4,%xmm0
-  .byte  15,40,205                           // movaps        %xmm5,%xmm1
-  .byte  15,40,214                           // movaps        %xmm6,%xmm2
-  .byte  15,40,223                           // movaps        %xmm7,%xmm3
-  .byte  65,15,40,227                        // movaps        %xmm11,%xmm4
-  .byte  65,15,40,234                        // movaps        %xmm10,%xmm5
-  .byte  65,15,40,241                        // movaps        %xmm9,%xmm6
-  .byte  65,15,40,248                        // movaps        %xmm8,%xmm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_src_dst_sse41
-_sk_move_src_dst_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,224                           // movaps        %xmm0,%xmm4
-  .byte  15,40,233                           // movaps        %xmm1,%xmm5
-  .byte  15,40,242                           // movaps        %xmm2,%xmm6
-  .byte  15,40,251                           // movaps        %xmm3,%xmm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_dst_src_sse41
-_sk_move_dst_src_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,196                           // movaps        %xmm4,%xmm0
-  .byte  15,40,205                           // movaps        %xmm5,%xmm1
-  .byte  15,40,214                           // movaps        %xmm6,%xmm2
-  .byte  15,40,223                           // movaps        %xmm7,%xmm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_premul_sse41
-_sk_premul_sse41:
-  .byte  15,89,195                           // mulps         %xmm3,%xmm0
-  .byte  15,89,203                           // mulps         %xmm3,%xmm1
-  .byte  15,89,211                           // mulps         %xmm3,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_unpremul_sse41
-_sk_unpremul_sse41:
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  69,15,87,201                        // xorps         %xmm9,%xmm9
-  .byte  243,68,15,16,18                     // movss         (%rdx),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  68,15,94,211                        // divps         %xmm3,%xmm10
-  .byte  15,40,195                           // movaps        %xmm3,%xmm0
-  .byte  65,15,194,193,0                     // cmpeqps       %xmm9,%xmm0
-  .byte  102,69,15,56,20,209                 // blendvps      %xmm0,%xmm9,%xmm10
-  .byte  69,15,89,194                        // mulps         %xmm10,%xmm8
-  .byte  65,15,89,202                        // mulps         %xmm10,%xmm1
-  .byte  65,15,89,210                        // mulps         %xmm10,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_from_srgb_sse41
-_sk_from_srgb_sse41:
-  .byte  68,15,40,194                        // movaps        %xmm2,%xmm8
-  .byte  243,68,15,16,90,64                  // movss         0x40(%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,40,211                        // movaps        %xmm11,%xmm10
-  .byte  68,15,89,208                        // mulps         %xmm0,%xmm10
-  .byte  68,15,40,240                        // movaps        %xmm0,%xmm14
-  .byte  69,15,89,246                        // mulps         %xmm14,%xmm14
-  .byte  243,15,16,82,60                     // movss         0x3c(%rdx),%xmm2
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  243,68,15,16,98,52                  // movss         0x34(%rdx),%xmm12
-  .byte  243,68,15,16,106,56                 // movss         0x38(%rdx),%xmm13
-  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
-  .byte  68,15,40,202                        // movaps        %xmm2,%xmm9
-  .byte  68,15,89,200                        // mulps         %xmm0,%xmm9
-  .byte  69,15,88,205                        // addps         %xmm13,%xmm9
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  69,15,89,206                        // mulps         %xmm14,%xmm9
-  .byte  69,15,88,204                        // addps         %xmm12,%xmm9
-  .byte  243,68,15,16,114,68                 // movss         0x44(%rdx),%xmm14
-  .byte  69,15,198,246,0                     // shufps        $0x0,%xmm14,%xmm14
-  .byte  65,15,194,198,1                     // cmpltps       %xmm14,%xmm0
-  .byte  102,69,15,56,20,202                 // blendvps      %xmm0,%xmm10,%xmm9
-  .byte  69,15,40,251                        // movaps        %xmm11,%xmm15
-  .byte  68,15,89,249                        // mulps         %xmm1,%xmm15
-  .byte  15,40,193                           // movaps        %xmm1,%xmm0
-  .byte  15,89,192                           // mulps         %xmm0,%xmm0
-  .byte  68,15,40,210                        // movaps        %xmm2,%xmm10
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  69,15,88,213                        // addps         %xmm13,%xmm10
-  .byte  68,15,89,208                        // mulps         %xmm0,%xmm10
-  .byte  69,15,88,212                        // addps         %xmm12,%xmm10
-  .byte  65,15,194,206,1                     // cmpltps       %xmm14,%xmm1
-  .byte  15,40,193                           // movaps        %xmm1,%xmm0
-  .byte  102,69,15,56,20,215                 // blendvps      %xmm0,%xmm15,%xmm10
-  .byte  69,15,89,216                        // mulps         %xmm8,%xmm11
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  15,89,192                           // mulps         %xmm0,%xmm0
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  65,15,88,213                        // addps         %xmm13,%xmm2
-  .byte  15,89,208                           // mulps         %xmm0,%xmm2
-  .byte  65,15,88,212                        // addps         %xmm12,%xmm2
-  .byte  69,15,194,198,1                     // cmpltps       %xmm14,%xmm8
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  102,65,15,56,20,211                 // blendvps      %xmm0,%xmm11,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,193                        // movaps        %xmm9,%xmm0
-  .byte  65,15,40,202                        // movaps        %xmm10,%xmm1
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_to_srgb_sse41
-_sk_to_srgb_sse41:
-  .byte  72,131,236,24                       // sub           $0x18,%rsp
-  .byte  15,41,60,36                         // movaps        %xmm7,(%rsp)
-  .byte  15,40,254                           // movaps        %xmm6,%xmm7
-  .byte  15,40,245                           // movaps        %xmm5,%xmm6
-  .byte  15,40,236                           // movaps        %xmm4,%xmm5
-  .byte  15,40,227                           // movaps        %xmm3,%xmm4
-  .byte  68,15,40,194                        // movaps        %xmm2,%xmm8
-  .byte  15,40,217                           // movaps        %xmm1,%xmm3
-  .byte  15,82,208                           // rsqrtps       %xmm0,%xmm2
-  .byte  68,15,83,202                        // rcpps         %xmm2,%xmm9
-  .byte  68,15,82,210                        // rsqrtps       %xmm2,%xmm10
-  .byte  243,15,16,18                        // movss         (%rdx),%xmm2
-  .byte  243,68,15,16,90,72                  // movss         0x48(%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  65,15,40,203                        // movaps        %xmm11,%xmm1
-  .byte  15,89,200                           // mulps         %xmm0,%xmm1
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  243,68,15,16,98,76                  // movss         0x4c(%rdx),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  243,68,15,16,106,80                 // movss         0x50(%rdx),%xmm13
-  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
-  .byte  243,68,15,16,114,84                 // movss         0x54(%rdx),%xmm14
-  .byte  69,15,198,246,0                     // shufps        $0x0,%xmm14,%xmm14
-  .byte  69,15,89,205                        // mulps         %xmm13,%xmm9
-  .byte  69,15,88,206                        // addps         %xmm14,%xmm9
-  .byte  69,15,89,212                        // mulps         %xmm12,%xmm10
-  .byte  69,15,88,209                        // addps         %xmm9,%xmm10
-  .byte  68,15,40,202                        // movaps        %xmm2,%xmm9
-  .byte  69,15,93,202                        // minps         %xmm10,%xmm9
-  .byte  243,68,15,16,122,88                 // movss         0x58(%rdx),%xmm15
-  .byte  69,15,198,255,0                     // shufps        $0x0,%xmm15,%xmm15
-  .byte  65,15,194,199,1                     // cmpltps       %xmm15,%xmm0
-  .byte  102,68,15,56,20,201                 // blendvps      %xmm0,%xmm1,%xmm9
-  .byte  15,82,195                           // rsqrtps       %xmm3,%xmm0
-  .byte  15,83,200                           // rcpps         %xmm0,%xmm1
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  65,15,89,205                        // mulps         %xmm13,%xmm1
-  .byte  65,15,88,206                        // addps         %xmm14,%xmm1
-  .byte  65,15,89,196                        // mulps         %xmm12,%xmm0
-  .byte  15,88,193                           // addps         %xmm1,%xmm0
-  .byte  68,15,40,210                        // movaps        %xmm2,%xmm10
-  .byte  68,15,93,208                        // minps         %xmm0,%xmm10
-  .byte  65,15,40,203                        // movaps        %xmm11,%xmm1
-  .byte  15,89,203                           // mulps         %xmm3,%xmm1
-  .byte  65,15,194,223,1                     // cmpltps       %xmm15,%xmm3
-  .byte  15,40,195                           // movaps        %xmm3,%xmm0
-  .byte  102,68,15,56,20,209                 // blendvps      %xmm0,%xmm1,%xmm10
-  .byte  65,15,82,192                        // rsqrtps       %xmm8,%xmm0
-  .byte  15,83,200                           // rcpps         %xmm0,%xmm1
-  .byte  65,15,89,205                        // mulps         %xmm13,%xmm1
-  .byte  65,15,88,206                        // addps         %xmm14,%xmm1
-  .byte  15,82,192                           // rsqrtps       %xmm0,%xmm0
-  .byte  65,15,89,196                        // mulps         %xmm12,%xmm0
-  .byte  15,88,193                           // addps         %xmm1,%xmm0
-  .byte  15,93,208                           // minps         %xmm0,%xmm2
-  .byte  69,15,89,216                        // mulps         %xmm8,%xmm11
-  .byte  69,15,194,199,1                     // cmpltps       %xmm15,%xmm8
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  102,65,15,56,20,211                 // blendvps      %xmm0,%xmm11,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,193                        // movaps        %xmm9,%xmm0
-  .byte  65,15,40,202                        // movaps        %xmm10,%xmm1
-  .byte  15,40,220                           // movaps        %xmm4,%xmm3
-  .byte  15,40,229                           // movaps        %xmm5,%xmm4
-  .byte  15,40,238                           // movaps        %xmm6,%xmm5
-  .byte  15,40,247                           // movaps        %xmm7,%xmm6
-  .byte  15,40,60,36                         // movaps        (%rsp),%xmm7
-  .byte  72,131,196,24                       // add           $0x18,%rsp
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_1_float_sse41
-_sk_scale_1_float_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_u8_sse41
-_sk_scale_u8_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,56,49,4,56                // pmovzxbd      (%rax,%rdi,1),%xmm8
-  .byte  69,15,91,192                        // cvtdq2ps      %xmm8,%xmm8
-  .byte  243,68,15,16,74,12                  // movss         0xc(%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  65,15,89,217                        // mulps         %xmm9,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_1_float_sse41
-_sk_lerp_1_float_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  15,92,196                           // subps         %xmm4,%xmm0
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,92,205                           // subps         %xmm5,%xmm1
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,92,214                           // subps         %xmm6,%xmm2
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,92,223                           // subps         %xmm7,%xmm3
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_u8_sse41
-_sk_lerp_u8_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,56,49,4,56                // pmovzxbd      (%rax,%rdi,1),%xmm8
-  .byte  69,15,91,192                        // cvtdq2ps      %xmm8,%xmm8
-  .byte  243,68,15,16,74,12                  // movss         0xc(%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  15,92,196                           // subps         %xmm4,%xmm0
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,92,205                           // subps         %xmm5,%xmm1
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,92,214                           // subps         %xmm6,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,92,223                           // subps         %xmm7,%xmm3
-  .byte  65,15,89,217                        // mulps         %xmm9,%xmm3
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_565_sse41
-_sk_lerp_565_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,56,51,4,120               // pmovzxwd      (%rax,%rdi,2),%xmm8
-  .byte  102,15,110,90,104                   // movd          0x68(%rdx),%xmm3
-  .byte  102,15,112,219,0                    // pshufd        $0x0,%xmm3,%xmm3
-  .byte  102,65,15,219,216                   // pand          %xmm8,%xmm3
-  .byte  68,15,91,203                        // cvtdq2ps      %xmm3,%xmm9
-  .byte  243,15,16,26                        // movss         (%rdx),%xmm3
-  .byte  243,68,15,16,82,116                 // movss         0x74(%rdx),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  102,68,15,110,74,108                // movd          0x6c(%rdx),%xmm9
-  .byte  102,69,15,112,201,0                 // pshufd        $0x0,%xmm9,%xmm9
-  .byte  102,69,15,219,200                   // pand          %xmm8,%xmm9
-  .byte  69,15,91,201                        // cvtdq2ps      %xmm9,%xmm9
-  .byte  243,68,15,16,90,120                 // movss         0x78(%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,89,217                        // mulps         %xmm9,%xmm11
-  .byte  102,68,15,110,74,112                // movd          0x70(%rdx),%xmm9
-  .byte  102,69,15,112,201,0                 // pshufd        $0x0,%xmm9,%xmm9
-  .byte  102,69,15,219,200                   // pand          %xmm8,%xmm9
-  .byte  69,15,91,193                        // cvtdq2ps      %xmm9,%xmm8
-  .byte  243,68,15,16,74,124                 // movss         0x7c(%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  15,92,196                           // subps         %xmm4,%xmm0
-  .byte  65,15,89,194                        // mulps         %xmm10,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,92,205                           // subps         %xmm5,%xmm1
-  .byte  65,15,89,203                        // mulps         %xmm11,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,92,214                           // subps         %xmm6,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_tables_sse41
-_sk_load_tables_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,8                            // mov           (%rax),%rcx
-  .byte  76,139,64,8                         // mov           0x8(%rax),%r8
-  .byte  243,68,15,111,4,185                 // movdqu        (%rcx,%rdi,4),%xmm8
-  .byte  102,15,110,66,16                    // movd          0x10(%rdx),%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  102,65,15,111,200                   // movdqa        %xmm8,%xmm1
-  .byte  102,15,114,209,8                    // psrld         $0x8,%xmm1
-  .byte  102,15,219,200                      // pand          %xmm0,%xmm1
-  .byte  102,65,15,111,208                   // movdqa        %xmm8,%xmm2
-  .byte  102,15,114,210,16                   // psrld         $0x10,%xmm2
-  .byte  102,15,219,208                      // pand          %xmm0,%xmm2
-  .byte  102,65,15,219,192                   // pand          %xmm8,%xmm0
-  .byte  102,72,15,58,22,193,1               // pextrq        $0x1,%xmm0,%rcx
-  .byte  65,137,201                          // mov           %ecx,%r9d
-  .byte  72,193,233,32                       // shr           $0x20,%rcx
-  .byte  102,73,15,126,194                   // movq          %xmm0,%r10
-  .byte  69,137,211                          // mov           %r10d,%r11d
-  .byte  73,193,234,32                       // shr           $0x20,%r10
-  .byte  243,67,15,16,4,152                  // movss         (%r8,%r11,4),%xmm0
-  .byte  102,67,15,58,33,4,144,16            // insertps      $0x10,(%r8,%r10,4),%xmm0
-  .byte  102,67,15,58,33,4,136,32            // insertps      $0x20,(%r8,%r9,4),%xmm0
-  .byte  102,65,15,58,33,4,136,48            // insertps      $0x30,(%r8,%rcx,4),%xmm0
-  .byte  72,139,72,16                        // mov           0x10(%rax),%rcx
-  .byte  102,73,15,58,22,200,1               // pextrq        $0x1,%xmm1,%r8
-  .byte  69,137,193                          // mov           %r8d,%r9d
-  .byte  73,193,232,32                       // shr           $0x20,%r8
-  .byte  102,73,15,126,202                   // movq          %xmm1,%r10
-  .byte  69,137,211                          // mov           %r10d,%r11d
-  .byte  73,193,234,32                       // shr           $0x20,%r10
-  .byte  243,66,15,16,12,153                 // movss         (%rcx,%r11,4),%xmm1
-  .byte  102,66,15,58,33,12,145,16           // insertps      $0x10,(%rcx,%r10,4),%xmm1
-  .byte  243,66,15,16,28,137                 // movss         (%rcx,%r9,4),%xmm3
-  .byte  102,15,58,33,203,32                 // insertps      $0x20,%xmm3,%xmm1
-  .byte  243,66,15,16,28,129                 // movss         (%rcx,%r8,4),%xmm3
-  .byte  102,15,58,33,203,48                 // insertps      $0x30,%xmm3,%xmm1
-  .byte  72,139,64,24                        // mov           0x18(%rax),%rax
-  .byte  102,72,15,58,22,209,1               // pextrq        $0x1,%xmm2,%rcx
-  .byte  65,137,200                          // mov           %ecx,%r8d
-  .byte  72,193,233,32                       // shr           $0x20,%rcx
-  .byte  102,73,15,126,209                   // movq          %xmm2,%r9
-  .byte  69,137,202                          // mov           %r9d,%r10d
-  .byte  73,193,233,32                       // shr           $0x20,%r9
-  .byte  243,66,15,16,20,144                 // movss         (%rax,%r10,4),%xmm2
-  .byte  102,66,15,58,33,20,136,16           // insertps      $0x10,(%rax,%r9,4),%xmm2
-  .byte  243,66,15,16,28,128                 // movss         (%rax,%r8,4),%xmm3
-  .byte  102,15,58,33,211,32                 // insertps      $0x20,%xmm3,%xmm2
-  .byte  243,15,16,28,136                    // movss         (%rax,%rcx,4),%xmm3
-  .byte  102,15,58,33,211,48                 // insertps      $0x30,%xmm3,%xmm2
-  .byte  102,65,15,114,208,24                // psrld         $0x18,%xmm8
-  .byte  69,15,91,192                        // cvtdq2ps      %xmm8,%xmm8
-  .byte  243,15,16,90,12                     // movss         0xc(%rdx),%xmm3
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_a8_sse41
-_sk_load_a8_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,15,56,49,4,56                   // pmovzxbd      (%rax,%rdi,1),%xmm0
-  .byte  15,91,192                           // cvtdq2ps      %xmm0,%xmm0
-  .byte  243,15,16,90,12                     // movss         0xc(%rdx),%xmm3
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  15,89,216                           // mulps         %xmm0,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,87,192                           // xorps         %xmm0,%xmm0
-  .byte  15,87,201                           // xorps         %xmm1,%xmm1
-  .byte  15,87,210                           // xorps         %xmm2,%xmm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_a8_sse41
-_sk_store_a8_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,16,66,8                   // movss         0x8(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,69,15,91,192                    // cvtps2dq      %xmm8,%xmm8
-  .byte  102,69,15,56,43,192                 // packusdw      %xmm8,%xmm8
-  .byte  102,69,15,103,192                   // packuswb      %xmm8,%xmm8
-  .byte  102,68,15,126,4,56                  // movd          %xmm8,(%rax,%rdi,1)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_565_sse41
-_sk_load_565_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,56,51,12,120              // pmovzxwd      (%rax,%rdi,2),%xmm9
-  .byte  102,15,110,66,104                   // movd          0x68(%rdx),%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  102,65,15,219,193                   // pand          %xmm9,%xmm0
-  .byte  15,91,200                           // cvtdq2ps      %xmm0,%xmm1
-  .byte  243,15,16,26                        // movss         (%rdx),%xmm3
-  .byte  243,15,16,66,116                    // movss         0x74(%rdx),%xmm0
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  15,89,193                           // mulps         %xmm1,%xmm0
-  .byte  102,15,110,74,108                   // movd          0x6c(%rdx),%xmm1
-  .byte  102,15,112,201,0                    // pshufd        $0x0,%xmm1,%xmm1
-  .byte  102,65,15,219,201                   // pand          %xmm9,%xmm1
-  .byte  68,15,91,193                        // cvtdq2ps      %xmm1,%xmm8
-  .byte  243,15,16,74,120                    // movss         0x78(%rdx),%xmm1
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  102,15,110,82,112                   // movd          0x70(%rdx),%xmm2
-  .byte  102,15,112,210,0                    // pshufd        $0x0,%xmm2,%xmm2
-  .byte  102,65,15,219,209                   // pand          %xmm9,%xmm2
-  .byte  68,15,91,194                        // cvtdq2ps      %xmm2,%xmm8
-  .byte  243,15,16,82,124                    // movss         0x7c(%rdx),%xmm2
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_565_sse41
-_sk_store_565_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,16,130,128,0,0,0          // movss         0x80(%rdx),%xmm8
-  .byte  243,68,15,16,138,132,0,0,0          // movss         0x84(%rdx),%xmm9
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  69,15,40,208                        // movaps        %xmm8,%xmm10
-  .byte  68,15,89,208                        // mulps         %xmm0,%xmm10
-  .byte  102,69,15,91,210                    // cvtps2dq      %xmm10,%xmm10
-  .byte  102,65,15,114,242,11                // pslld         $0xb,%xmm10
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  102,69,15,91,201                    // cvtps2dq      %xmm9,%xmm9
-  .byte  102,65,15,114,241,5                 // pslld         $0x5,%xmm9
-  .byte  102,69,15,235,202                   // por           %xmm10,%xmm9
-  .byte  68,15,89,194                        // mulps         %xmm2,%xmm8
-  .byte  102,69,15,91,192                    // cvtps2dq      %xmm8,%xmm8
-  .byte  102,69,15,86,193                    // orpd          %xmm9,%xmm8
-  .byte  102,69,15,56,43,192                 // packusdw      %xmm8,%xmm8
-  .byte  102,68,15,214,4,120                 // movq          %xmm8,(%rax,%rdi,2)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_8888_sse41
-_sk_load_8888_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,15,111,28,184                   // movdqu        (%rax,%rdi,4),%xmm3
-  .byte  102,15,110,66,16                    // movd          0x10(%rdx),%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  102,15,111,203                      // movdqa        %xmm3,%xmm1
-  .byte  102,15,114,209,8                    // psrld         $0x8,%xmm1
-  .byte  102,15,219,200                      // pand          %xmm0,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,15,114,210,16                   // psrld         $0x10,%xmm2
-  .byte  102,15,219,208                      // pand          %xmm0,%xmm2
-  .byte  102,15,219,195                      // pand          %xmm3,%xmm0
-  .byte  15,91,192                           // cvtdq2ps      %xmm0,%xmm0
-  .byte  243,68,15,16,66,12                  // movss         0xc(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  15,91,201                           // cvtdq2ps      %xmm1,%xmm1
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  15,91,210                           // cvtdq2ps      %xmm2,%xmm2
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  102,15,114,211,24                   // psrld         $0x18,%xmm3
-  .byte  15,91,219                           // cvtdq2ps      %xmm3,%xmm3
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_8888_sse41
-_sk_store_8888_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,16,66,8                   // movss         0x8(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,200                        // mulps         %xmm0,%xmm9
-  .byte  102,69,15,91,201                    // cvtps2dq      %xmm9,%xmm9
-  .byte  69,15,40,208                        // movaps        %xmm8,%xmm10
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  102,69,15,91,210                    // cvtps2dq      %xmm10,%xmm10
-  .byte  102,65,15,114,242,8                 // pslld         $0x8,%xmm10
-  .byte  102,69,15,235,209                   // por           %xmm9,%xmm10
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,202                        // mulps         %xmm2,%xmm9
-  .byte  102,69,15,91,201                    // cvtps2dq      %xmm9,%xmm9
-  .byte  102,65,15,114,241,16                // pslld         $0x10,%xmm9
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,69,15,91,192                    // cvtps2dq      %xmm8,%xmm8
-  .byte  102,65,15,114,240,24                // pslld         $0x18,%xmm8
-  .byte  102,69,15,235,193                   // por           %xmm9,%xmm8
-  .byte  102,69,15,235,194                   // por           %xmm10,%xmm8
-  .byte  243,68,15,127,4,184                 // movdqu        %xmm8,(%rax,%rdi,4)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_f16_sse41
-_sk_load_f16_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,15,111,4,248                    // movdqu        (%rax,%rdi,8),%xmm0
-  .byte  243,15,111,76,248,16                // movdqu        0x10(%rax,%rdi,8),%xmm1
-  .byte  102,15,111,208                      // movdqa        %xmm0,%xmm2
-  .byte  102,15,97,209                       // punpcklwd     %xmm1,%xmm2
-  .byte  102,15,105,193                      // punpckhwd     %xmm1,%xmm0
-  .byte  102,68,15,111,194                   // movdqa        %xmm2,%xmm8
-  .byte  102,68,15,97,192                    // punpcklwd     %xmm0,%xmm8
-  .byte  102,15,105,208                      // punpckhwd     %xmm0,%xmm2
-  .byte  102,15,110,66,100                   // movd          0x64(%rdx),%xmm0
-  .byte  102,15,112,216,0                    // pshufd        $0x0,%xmm0,%xmm3
-  .byte  102,15,111,203                      // movdqa        %xmm3,%xmm1
-  .byte  102,65,15,101,200                   // pcmpgtw       %xmm8,%xmm1
-  .byte  102,65,15,223,200                   // pandn         %xmm8,%xmm1
-  .byte  102,15,101,218                      // pcmpgtw       %xmm2,%xmm3
-  .byte  102,15,223,218                      // pandn         %xmm2,%xmm3
-  .byte  102,15,56,51,193                    // pmovzxwd      %xmm1,%xmm0
-  .byte  102,15,114,240,13                   // pslld         $0xd,%xmm0
-  .byte  102,15,110,82,92                    // movd          0x5c(%rdx),%xmm2
-  .byte  102,68,15,112,194,0                 // pshufd        $0x0,%xmm2,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  102,69,15,239,201                   // pxor          %xmm9,%xmm9
-  .byte  102,65,15,105,201                   // punpckhwd     %xmm9,%xmm1
-  .byte  102,15,114,241,13                   // pslld         $0xd,%xmm1
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  102,15,56,51,211                    // pmovzxwd      %xmm3,%xmm2
-  .byte  102,15,114,242,13                   // pslld         $0xd,%xmm2
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  102,65,15,105,217                   // punpckhwd     %xmm9,%xmm3
-  .byte  102,15,114,243,13                   // pslld         $0xd,%xmm3
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_f16_sse41
-_sk_store_f16_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,110,66,96                 // movd          0x60(%rdx),%xmm8
-  .byte  102,69,15,112,192,0                 // pshufd        $0x0,%xmm8,%xmm8
-  .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
-  .byte  68,15,89,200                        // mulps         %xmm0,%xmm9
-  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
-  .byte  102,69,15,111,208                   // movdqa        %xmm8,%xmm10
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  102,65,15,114,210,13                // psrld         $0xd,%xmm10
-  .byte  102,69,15,111,216                   // movdqa        %xmm8,%xmm11
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  102,65,15,114,211,13                // psrld         $0xd,%xmm11
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
-  .byte  102,65,15,115,250,2                 // pslldq        $0x2,%xmm10
-  .byte  102,69,15,235,209                   // por           %xmm9,%xmm10
-  .byte  102,65,15,115,248,2                 // pslldq        $0x2,%xmm8
-  .byte  102,69,15,235,195                   // por           %xmm11,%xmm8
-  .byte  102,69,15,111,202                   // movdqa        %xmm10,%xmm9
-  .byte  102,69,15,98,200                    // punpckldq     %xmm8,%xmm9
-  .byte  243,68,15,127,12,248                // movdqu        %xmm9,(%rax,%rdi,8)
-  .byte  102,69,15,106,208                   // punpckhdq     %xmm8,%xmm10
-  .byte  243,68,15,127,84,248,16             // movdqu        %xmm10,0x10(%rax,%rdi,8)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_f32_sse41
-_sk_store_f32_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,137,249                          // mov           %rdi,%rcx
-  .byte  72,193,225,4                        // shl           $0x4,%rcx
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  68,15,40,200                        // movaps        %xmm0,%xmm9
-  .byte  68,15,20,201                        // unpcklps      %xmm1,%xmm9
-  .byte  68,15,40,210                        // movaps        %xmm2,%xmm10
-  .byte  68,15,40,218                        // movaps        %xmm2,%xmm11
-  .byte  68,15,20,219                        // unpcklps      %xmm3,%xmm11
-  .byte  68,15,21,193                        // unpckhps      %xmm1,%xmm8
-  .byte  68,15,21,211                        // unpckhps      %xmm3,%xmm10
-  .byte  69,15,40,225                        // movaps        %xmm9,%xmm12
-  .byte  102,69,15,20,227                    // unpcklpd      %xmm11,%xmm12
-  .byte  102,69,15,21,203                    // unpckhpd      %xmm11,%xmm9
-  .byte  69,15,40,216                        // movaps        %xmm8,%xmm11
-  .byte  102,69,15,20,218                    // unpcklpd      %xmm10,%xmm11
-  .byte  102,69,15,21,194                    // unpckhpd      %xmm10,%xmm8
-  .byte  102,68,15,17,36,8                   // movupd        %xmm12,(%rax,%rcx,1)
-  .byte  102,68,15,17,76,8,16                // movupd        %xmm9,0x10(%rax,%rcx,1)
-  .byte  102,68,15,17,92,8,32                // movupd        %xmm11,0x20(%rax,%rcx,1)
-  .byte  102,68,15,17,68,8,48                // movupd        %xmm8,0x30(%rax,%rcx,1)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_x_sse41
-_sk_clamp_x_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  68,15,95,192                        // maxps         %xmm0,%xmm8
-  .byte  243,68,15,16,8                      // movss         (%rax),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  102,15,118,192                      // pcmpeqd       %xmm0,%xmm0
-  .byte  102,65,15,254,193                   // paddd         %xmm9,%xmm0
-  .byte  68,15,93,192                        // minps         %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_y_sse41
-_sk_clamp_y_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  68,15,95,193                        // maxps         %xmm1,%xmm8
-  .byte  243,68,15,16,8                      // movss         (%rax),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  102,15,118,201                      // pcmpeqd       %xmm1,%xmm1
-  .byte  102,65,15,254,201                   // paddd         %xmm9,%xmm1
-  .byte  68,15,93,193                        // minps         %xmm1,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,200                        // movaps        %xmm8,%xmm1
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_x_sse41
-_sk_repeat_x_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,40,200                        // movaps        %xmm0,%xmm9
-  .byte  69,15,94,200                        // divps         %xmm8,%xmm9
-  .byte  102,69,15,58,8,201,1                // roundps       $0x1,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  65,15,92,193                        // subps         %xmm9,%xmm0
-  .byte  102,69,15,118,201                   // pcmpeqd       %xmm9,%xmm9
-  .byte  102,69,15,254,200                   // paddd         %xmm8,%xmm9
-  .byte  65,15,93,193                        // minps         %xmm9,%xmm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_y_sse41
-_sk_repeat_y_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,40,201                        // movaps        %xmm1,%xmm9
-  .byte  69,15,94,200                        // divps         %xmm8,%xmm9
-  .byte  102,69,15,58,8,201,1                // roundps       $0x1,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  65,15,92,201                        // subps         %xmm9,%xmm1
-  .byte  102,69,15,118,201                   // pcmpeqd       %xmm9,%xmm9
-  .byte  102,69,15,254,200                   // paddd         %xmm8,%xmm9
-  .byte  65,15,93,201                        // minps         %xmm9,%xmm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_x_sse41
-_sk_mirror_x_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  65,15,92,193                        // subps         %xmm9,%xmm0
-  .byte  243,69,15,88,192                    // addss         %xmm8,%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,40,208                        // movaps        %xmm0,%xmm10
-  .byte  69,15,94,208                        // divps         %xmm8,%xmm10
-  .byte  102,69,15,58,8,210,1                // roundps       $0x1,%xmm10,%xmm10
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  65,15,92,194                        // subps         %xmm10,%xmm0
-  .byte  65,15,92,193                        // subps         %xmm9,%xmm0
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  68,15,92,192                        // subps         %xmm0,%xmm8
-  .byte  65,15,84,192                        // andps         %xmm8,%xmm0
-  .byte  102,69,15,118,192                   // pcmpeqd       %xmm8,%xmm8
-  .byte  102,69,15,254,193                   // paddd         %xmm9,%xmm8
-  .byte  65,15,93,192                        // minps         %xmm8,%xmm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_y_sse41
-_sk_mirror_y_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  65,15,92,201                        // subps         %xmm9,%xmm1
-  .byte  243,69,15,88,192                    // addss         %xmm8,%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,40,209                        // movaps        %xmm1,%xmm10
-  .byte  69,15,94,208                        // divps         %xmm8,%xmm10
-  .byte  102,69,15,58,8,210,1                // roundps       $0x1,%xmm10,%xmm10
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  65,15,92,202                        // subps         %xmm10,%xmm1
-  .byte  65,15,92,201                        // subps         %xmm9,%xmm1
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  68,15,92,193                        // subps         %xmm1,%xmm8
-  .byte  65,15,84,200                        // andps         %xmm8,%xmm1
-  .byte  102,69,15,118,192                   // pcmpeqd       %xmm8,%xmm8
-  .byte  102,69,15,254,193                   // paddd         %xmm9,%xmm8
-  .byte  65,15,93,200                        // minps         %xmm8,%xmm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_2x3_sse41
-_sk_matrix_2x3_sse41:
-  .byte  68,15,40,201                        // movaps        %xmm1,%xmm9
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,15,16,72,4                      // movss         0x4(%rax),%xmm1
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,16                  // movss         0x10(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,88,194                        // addps         %xmm10,%xmm0
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  243,68,15,16,80,12                  // movss         0xc(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  65,15,88,202                        // addps         %xmm10,%xmm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_3x4_sse41
-_sk_matrix_3x4_sse41:
-  .byte  68,15,40,201                        // movaps        %xmm1,%xmm9
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,15,16,72,4                      // movss         0x4(%rax),%xmm1
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  243,68,15,16,80,12                  // movss         0xc(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,24                  // movss         0x18(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,36                  // movss         0x24(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,88,194                        // addps         %xmm10,%xmm0
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  243,68,15,16,80,16                  // movss         0x10(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,28                  // movss         0x1c(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,40                  // movss         0x28(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  65,15,88,202                        // addps         %xmm10,%xmm1
-  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,32                  // movss         0x20(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  243,68,15,16,104,44                 // movss         0x2c(%rax),%xmm13
-  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
-  .byte  68,15,89,226                        // mulps         %xmm2,%xmm12
-  .byte  69,15,88,229                        // addps         %xmm13,%xmm12
-  .byte  69,15,89,217                        // mulps         %xmm9,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,210                        // movaps        %xmm10,%xmm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_perspective_sse41
-_sk_matrix_perspective_sse41:
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,68,15,16,72,4                   // movss         0x4(%rax),%xmm9
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  69,15,88,202                        // addps         %xmm10,%xmm9
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
-  .byte  243,68,15,16,72,12                  // movss         0xc(%rax),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  243,68,15,16,80,16                  // movss         0x10(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  69,15,88,202                        // addps         %xmm10,%xmm9
-  .byte  243,68,15,16,80,24                  // movss         0x18(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,28                  // movss         0x1c(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,32                  // movss         0x20(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  68,15,89,217                        // mulps         %xmm1,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,83,202                        // rcpps         %xmm10,%xmm1
-  .byte  15,89,193                           // mulps         %xmm1,%xmm0
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,201                        // movaps        %xmm9,%xmm1
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_linear_gradient_2stops_sse41
-_sk_linear_gradient_2stops_sse41:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  68,15,16,8                          // movups        (%rax),%xmm9
-  .byte  15,16,88,16                         // movups        0x10(%rax),%xmm3
-  .byte  68,15,40,195                        // movaps        %xmm3,%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,40,201                        // movaps        %xmm9,%xmm1
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
-  .byte  68,15,88,193                        // addps         %xmm1,%xmm8
-  .byte  15,40,203                           // movaps        %xmm3,%xmm1
-  .byte  15,198,201,85                       // shufps        $0x55,%xmm1,%xmm1
-  .byte  65,15,40,209                        // movaps        %xmm9,%xmm2
-  .byte  15,198,210,85                       // shufps        $0x55,%xmm2,%xmm2
-  .byte  15,89,200                           // mulps         %xmm0,%xmm1
-  .byte  15,88,202                           // addps         %xmm2,%xmm1
-  .byte  15,40,211                           // movaps        %xmm3,%xmm2
-  .byte  15,198,210,170                      // shufps        $0xaa,%xmm2,%xmm2
-  .byte  69,15,40,209                        // movaps        %xmm9,%xmm10
-  .byte  69,15,198,210,170                   // shufps        $0xaa,%xmm10,%xmm10
-  .byte  15,89,208                           // mulps         %xmm0,%xmm2
-  .byte  65,15,88,210                        // addps         %xmm10,%xmm2
-  .byte  15,198,219,255                      // shufps        $0xff,%xmm3,%xmm3
-  .byte  69,15,198,201,255                   // shufps        $0xff,%xmm9,%xmm9
-  .byte  15,89,216                           // mulps         %xmm0,%xmm3
-  .byte  65,15,88,217                        // addps         %xmm9,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_start_pipeline_sse2
-_sk_start_pipeline_sse2:
-  .byte  65,87                               // push          %r15
-  .byte  65,86                               // push          %r14
-  .byte  65,85                               // push          %r13
-  .byte  65,84                               // push          %r12
-  .byte  83                                  // push          %rbx
-  .byte  73,137,207                          // mov           %rcx,%r15
-  .byte  73,137,214                          // mov           %rdx,%r14
-  .byte  72,137,251                          // mov           %rdi,%rbx
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  73,137,196                          // mov           %rax,%r12
-  .byte  73,137,245                          // mov           %rsi,%r13
-  .byte  72,141,67,4                         // lea           0x4(%rbx),%rax
-  .byte  76,57,248                           // cmp           %r15,%rax
-  .byte  118,5                               // jbe           28 <_sk_start_pipeline_sse2+0x28>
-  .byte  72,137,216                          // mov           %rbx,%rax
-  .byte  235,52                              // jmp           5c <_sk_start_pipeline_sse2+0x5c>
-  .byte  15,87,192                           // xorps         %xmm0,%xmm0
-  .byte  15,87,201                           // xorps         %xmm1,%xmm1
-  .byte  15,87,210                           // xorps         %xmm2,%xmm2
-  .byte  15,87,219                           // xorps         %xmm3,%xmm3
-  .byte  15,87,228                           // xorps         %xmm4,%xmm4
-  .byte  15,87,237                           // xorps         %xmm5,%xmm5
-  .byte  15,87,246                           // xorps         %xmm6,%xmm6
-  .byte  15,87,255                           // xorps         %xmm7,%xmm7
-  .byte  72,137,223                          // mov           %rbx,%rdi
-  .byte  76,137,238                          // mov           %r13,%rsi
-  .byte  76,137,242                          // mov           %r14,%rdx
-  .byte  65,255,212                          // callq         *%r12
-  .byte  72,141,67,4                         // lea           0x4(%rbx),%rax
-  .byte  72,131,195,8                        // add           $0x8,%rbx
-  .byte  76,57,251                           // cmp           %r15,%rbx
-  .byte  72,137,195                          // mov           %rax,%rbx
-  .byte  118,204                             // jbe           28 <_sk_start_pipeline_sse2+0x28>
-  .byte  91                                  // pop           %rbx
-  .byte  65,92                               // pop           %r12
-  .byte  65,93                               // pop           %r13
-  .byte  65,94                               // pop           %r14
-  .byte  65,95                               // pop           %r15
-  .byte  195                                 // retq
-
-.globl _sk_just_return_sse2
-_sk_just_return_sse2:
-  .byte  195                                 // retq
-
-.globl _sk_seed_shader_sse2
-_sk_seed_shader_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  102,15,110,199                      // movd          %edi,%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  15,91,200                           // cvtdq2ps      %xmm0,%xmm1
-  .byte  243,15,16,18                        // movss         (%rdx),%xmm2
-  .byte  243,15,16,90,4                      // movss         0x4(%rdx),%xmm3
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  15,88,203                           // addps         %xmm3,%xmm1
-  .byte  15,16,66,20                         // movups        0x14(%rdx),%xmm0
-  .byte  15,88,193                           // addps         %xmm1,%xmm0
-  .byte  102,15,110,8                        // movd          (%rax),%xmm1
-  .byte  102,15,112,201,0                    // pshufd        $0x0,%xmm1,%xmm1
-  .byte  15,91,201                           // cvtdq2ps      %xmm1,%xmm1
-  .byte  15,88,203                           // addps         %xmm3,%xmm1
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,87,219                           // xorps         %xmm3,%xmm3
-  .byte  15,87,228                           // xorps         %xmm4,%xmm4
-  .byte  15,87,237                           // xorps         %xmm5,%xmm5
-  .byte  15,87,246                           // xorps         %xmm6,%xmm6
-  .byte  15,87,255                           // xorps         %xmm7,%xmm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_constant_color_sse2
-_sk_constant_color_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,16,24                            // movups        (%rax),%xmm3
-  .byte  15,40,195                           // movaps        %xmm3,%xmm0
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  15,40,203                           // movaps        %xmm3,%xmm1
-  .byte  15,198,201,85                       // shufps        $0x55,%xmm1,%xmm1
-  .byte  15,40,211                           // movaps        %xmm3,%xmm2
-  .byte  15,198,210,170                      // shufps        $0xaa,%xmm2,%xmm2
-  .byte  15,198,219,255                      // shufps        $0xff,%xmm3,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clear_sse2
-_sk_clear_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,87,192                           // xorps         %xmm0,%xmm0
-  .byte  15,87,201                           // xorps         %xmm1,%xmm1
-  .byte  15,87,210                           // xorps         %xmm2,%xmm2
-  .byte  15,87,219                           // xorps         %xmm3,%xmm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_plus__sse2
-_sk_plus__sse2:
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_srcover_sse2
-_sk_srcover_sse2:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,92,195                        // subps         %xmm3,%xmm8
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,204                        // mulps         %xmm4,%xmm9
-  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,205                        // mulps         %xmm5,%xmm9
-  .byte  65,15,88,201                        // addps         %xmm9,%xmm1
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,206                        // mulps         %xmm6,%xmm9
-  .byte  65,15,88,209                        // addps         %xmm9,%xmm2
-  .byte  68,15,89,199                        // mulps         %xmm7,%xmm8
-  .byte  65,15,88,216                        // addps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_dstover_sse2
-_sk_dstover_sse2:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,92,199                        // subps         %xmm7,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_0_sse2
-_sk_clamp_0_sse2:
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  65,15,95,192                        // maxps         %xmm8,%xmm0
-  .byte  65,15,95,200                        // maxps         %xmm8,%xmm1
-  .byte  65,15,95,208                        // maxps         %xmm8,%xmm2
-  .byte  65,15,95,216                        // maxps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_1_sse2
-_sk_clamp_1_sse2:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,93,192                        // minps         %xmm8,%xmm0
-  .byte  65,15,93,200                        // minps         %xmm8,%xmm1
-  .byte  65,15,93,208                        // minps         %xmm8,%xmm2
-  .byte  65,15,93,216                        // minps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_a_sse2
-_sk_clamp_a_sse2:
-  .byte  243,68,15,16,2                      // movss         (%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,93,216                        // minps         %xmm8,%xmm3
-  .byte  15,93,195                           // minps         %xmm3,%xmm0
-  .byte  15,93,203                           // minps         %xmm3,%xmm1
-  .byte  15,93,211                           // minps         %xmm3,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_set_rgb_sse2
-_sk_set_rgb_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,15,16,72,4                      // movss         0x4(%rax),%xmm1
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  243,15,16,80,8                      // movss         0x8(%rax),%xmm2
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_rb_sse2
-_sk_swap_rb_sse2:
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,194                           // movaps        %xmm2,%xmm0
-  .byte  65,15,40,208                        // movaps        %xmm8,%xmm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_swap_sse2
-_sk_swap_sse2:
-  .byte  68,15,40,195                        // movaps        %xmm3,%xmm8
-  .byte  68,15,40,202                        // movaps        %xmm2,%xmm9
-  .byte  68,15,40,209                        // movaps        %xmm1,%xmm10
-  .byte  68,15,40,216                        // movaps        %xmm0,%xmm11
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,196                           // movaps        %xmm4,%xmm0
-  .byte  15,40,205                           // movaps        %xmm5,%xmm1
-  .byte  15,40,214                           // movaps        %xmm6,%xmm2
-  .byte  15,40,223                           // movaps        %xmm7,%xmm3
-  .byte  65,15,40,227                        // movaps        %xmm11,%xmm4
-  .byte  65,15,40,234                        // movaps        %xmm10,%xmm5
-  .byte  65,15,40,241                        // movaps        %xmm9,%xmm6
-  .byte  65,15,40,248                        // movaps        %xmm8,%xmm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_src_dst_sse2
-_sk_move_src_dst_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,224                           // movaps        %xmm0,%xmm4
-  .byte  15,40,233                           // movaps        %xmm1,%xmm5
-  .byte  15,40,242                           // movaps        %xmm2,%xmm6
-  .byte  15,40,251                           // movaps        %xmm3,%xmm7
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_move_dst_src_sse2
-_sk_move_dst_src_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,196                           // movaps        %xmm4,%xmm0
-  .byte  15,40,205                           // movaps        %xmm5,%xmm1
-  .byte  15,40,214                           // movaps        %xmm6,%xmm2
-  .byte  15,40,223                           // movaps        %xmm7,%xmm3
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_premul_sse2
-_sk_premul_sse2:
-  .byte  15,89,195                           // mulps         %xmm3,%xmm0
-  .byte  15,89,203                           // mulps         %xmm3,%xmm1
-  .byte  15,89,211                           // mulps         %xmm3,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_unpremul_sse2
-_sk_unpremul_sse2:
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  68,15,194,195,0                     // cmpeqps       %xmm3,%xmm8
-  .byte  243,68,15,16,10                     // movss         (%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  68,15,94,203                        // divps         %xmm3,%xmm9
-  .byte  69,15,85,193                        // andnps        %xmm9,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_from_srgb_sse2
-_sk_from_srgb_sse2:
-  .byte  243,68,15,16,66,64                  // movss         0x40(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  69,15,40,232                        // movaps        %xmm8,%xmm13
-  .byte  68,15,89,232                        // mulps         %xmm0,%xmm13
-  .byte  68,15,40,224                        // movaps        %xmm0,%xmm12
-  .byte  69,15,89,228                        // mulps         %xmm12,%xmm12
-  .byte  243,68,15,16,74,60                  // movss         0x3c(%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  243,68,15,16,82,52                  // movss         0x34(%rdx),%xmm10
-  .byte  243,68,15,16,90,56                  // movss         0x38(%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,40,241                        // movaps        %xmm9,%xmm14
-  .byte  68,15,89,240                        // mulps         %xmm0,%xmm14
-  .byte  69,15,88,243                        // addps         %xmm11,%xmm14
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  69,15,89,244                        // mulps         %xmm12,%xmm14
-  .byte  69,15,88,242                        // addps         %xmm10,%xmm14
-  .byte  243,68,15,16,98,68                  // movss         0x44(%rdx),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  65,15,194,196,1                     // cmpltps       %xmm12,%xmm0
-  .byte  68,15,84,232                        // andps         %xmm0,%xmm13
-  .byte  65,15,85,198                        // andnps        %xmm14,%xmm0
-  .byte  65,15,86,197                        // orps          %xmm13,%xmm0
-  .byte  69,15,40,232                        // movaps        %xmm8,%xmm13
-  .byte  68,15,89,233                        // mulps         %xmm1,%xmm13
-  .byte  68,15,40,241                        // movaps        %xmm1,%xmm14
-  .byte  69,15,89,246                        // mulps         %xmm14,%xmm14
-  .byte  69,15,40,249                        // movaps        %xmm9,%xmm15
-  .byte  68,15,89,249                        // mulps         %xmm1,%xmm15
-  .byte  69,15,88,251                        // addps         %xmm11,%xmm15
-  .byte  69,15,89,254                        // mulps         %xmm14,%xmm15
-  .byte  69,15,88,250                        // addps         %xmm10,%xmm15
-  .byte  65,15,194,204,1                     // cmpltps       %xmm12,%xmm1
-  .byte  68,15,84,233                        // andps         %xmm1,%xmm13
-  .byte  65,15,85,207                        // andnps        %xmm15,%xmm1
-  .byte  65,15,86,205                        // orps          %xmm13,%xmm1
-  .byte  68,15,89,194                        // mulps         %xmm2,%xmm8
-  .byte  68,15,40,234                        // movaps        %xmm2,%xmm13
-  .byte  69,15,89,237                        // mulps         %xmm13,%xmm13
-  .byte  68,15,89,202                        // mulps         %xmm2,%xmm9
-  .byte  69,15,88,203                        // addps         %xmm11,%xmm9
-  .byte  69,15,89,205                        // mulps         %xmm13,%xmm9
-  .byte  69,15,88,202                        // addps         %xmm10,%xmm9
-  .byte  65,15,194,212,1                     // cmpltps       %xmm12,%xmm2
-  .byte  68,15,84,194                        // andps         %xmm2,%xmm8
-  .byte  65,15,85,209                        // andnps        %xmm9,%xmm2
-  .byte  65,15,86,208                        // orps          %xmm8,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_to_srgb_sse2
-_sk_to_srgb_sse2:
-  .byte  72,131,236,40                       // sub           $0x28,%rsp
-  .byte  15,41,124,36,16                     // movaps        %xmm7,0x10(%rsp)
-  .byte  15,41,52,36                         // movaps        %xmm6,(%rsp)
-  .byte  15,40,245                           // movaps        %xmm5,%xmm6
-  .byte  15,40,236                           // movaps        %xmm4,%xmm5
-  .byte  15,40,227                           // movaps        %xmm3,%xmm4
-  .byte  68,15,82,192                        // rsqrtps       %xmm0,%xmm8
-  .byte  69,15,83,232                        // rcpps         %xmm8,%xmm13
-  .byte  69,15,82,248                        // rsqrtps       %xmm8,%xmm15
-  .byte  243,15,16,26                        // movss         (%rdx),%xmm3
-  .byte  243,68,15,16,66,72                  // movss         0x48(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  69,15,40,240                        // movaps        %xmm8,%xmm14
-  .byte  68,15,89,240                        // mulps         %xmm0,%xmm14
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  243,68,15,16,82,76                  // movss         0x4c(%rdx),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,90,80                  // movss         0x50(%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,98,84                  // movss         0x54(%rdx),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  69,15,89,235                        // mulps         %xmm11,%xmm13
-  .byte  69,15,88,236                        // addps         %xmm12,%xmm13
-  .byte  69,15,89,250                        // mulps         %xmm10,%xmm15
-  .byte  69,15,88,253                        // addps         %xmm13,%xmm15
-  .byte  68,15,40,203                        // movaps        %xmm3,%xmm9
-  .byte  69,15,93,207                        // minps         %xmm15,%xmm9
-  .byte  243,68,15,16,106,88                 // movss         0x58(%rdx),%xmm13
-  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
-  .byte  65,15,194,197,1                     // cmpltps       %xmm13,%xmm0
-  .byte  68,15,84,240                        // andps         %xmm0,%xmm14
-  .byte  65,15,85,193                        // andnps        %xmm9,%xmm0
-  .byte  65,15,86,198                        // orps          %xmm14,%xmm0
-  .byte  68,15,82,201                        // rsqrtps       %xmm1,%xmm9
-  .byte  69,15,83,241                        // rcpps         %xmm9,%xmm14
-  .byte  69,15,82,201                        // rsqrtps       %xmm9,%xmm9
-  .byte  69,15,89,243                        // mulps         %xmm11,%xmm14
-  .byte  69,15,88,244                        // addps         %xmm12,%xmm14
-  .byte  69,15,89,202                        // mulps         %xmm10,%xmm9
-  .byte  69,15,88,206                        // addps         %xmm14,%xmm9
-  .byte  68,15,40,243                        // movaps        %xmm3,%xmm14
-  .byte  69,15,93,241                        // minps         %xmm9,%xmm14
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  65,15,194,205,1                     // cmpltps       %xmm13,%xmm1
-  .byte  68,15,84,201                        // andps         %xmm1,%xmm9
-  .byte  65,15,85,206                        // andnps        %xmm14,%xmm1
-  .byte  65,15,86,201                        // orps          %xmm9,%xmm1
-  .byte  68,15,82,202                        // rsqrtps       %xmm2,%xmm9
-  .byte  69,15,83,241                        // rcpps         %xmm9,%xmm14
-  .byte  69,15,89,243                        // mulps         %xmm11,%xmm14
-  .byte  69,15,88,244                        // addps         %xmm12,%xmm14
-  .byte  65,15,82,249                        // rsqrtps       %xmm9,%xmm7
-  .byte  65,15,89,250                        // mulps         %xmm10,%xmm7
-  .byte  65,15,88,254                        // addps         %xmm14,%xmm7
-  .byte  15,93,223                           // minps         %xmm7,%xmm3
-  .byte  68,15,89,194                        // mulps         %xmm2,%xmm8
-  .byte  65,15,194,213,1                     // cmpltps       %xmm13,%xmm2
-  .byte  68,15,84,194                        // andps         %xmm2,%xmm8
-  .byte  15,85,211                           // andnps        %xmm3,%xmm2
-  .byte  65,15,86,208                        // orps          %xmm8,%xmm2
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,40,220                           // movaps        %xmm4,%xmm3
-  .byte  15,40,229                           // movaps        %xmm5,%xmm4
-  .byte  15,40,238                           // movaps        %xmm6,%xmm5
-  .byte  15,40,52,36                         // movaps        (%rsp),%xmm6
-  .byte  15,40,124,36,16                     // movaps        0x10(%rsp),%xmm7
-  .byte  72,131,196,40                       // add           $0x28,%rsp
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_1_float_sse2
-_sk_scale_1_float_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_scale_u8_sse2
-_sk_scale_u8_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,110,4,56                  // movd          (%rax,%rdi,1),%xmm8
-  .byte  102,69,15,239,201                   // pxor          %xmm9,%xmm9
-  .byte  102,69,15,96,193                    // punpcklbw     %xmm9,%xmm8
-  .byte  102,69,15,97,193                    // punpcklwd     %xmm9,%xmm8
-  .byte  69,15,91,192                        // cvtdq2ps      %xmm8,%xmm8
-  .byte  243,68,15,16,74,12                  // movss         0xc(%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  65,15,89,217                        // mulps         %xmm9,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_1_float_sse2
-_sk_lerp_1_float_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  15,92,196                           // subps         %xmm4,%xmm0
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,92,205                           // subps         %xmm5,%xmm1
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,92,214                           // subps         %xmm6,%xmm2
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,92,223                           // subps         %xmm7,%xmm3
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_u8_sse2
-_sk_lerp_u8_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,110,4,56                  // movd          (%rax,%rdi,1),%xmm8
-  .byte  102,69,15,239,201                   // pxor          %xmm9,%xmm9
-  .byte  102,69,15,96,193                    // punpcklbw     %xmm9,%xmm8
-  .byte  102,69,15,97,193                    // punpcklwd     %xmm9,%xmm8
-  .byte  69,15,91,192                        // cvtdq2ps      %xmm8,%xmm8
-  .byte  243,68,15,16,74,12                  // movss         0xc(%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  15,92,196                           // subps         %xmm4,%xmm0
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,92,205                           // subps         %xmm5,%xmm1
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,92,214                           // subps         %xmm6,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,92,223                           // subps         %xmm7,%xmm3
-  .byte  65,15,89,217                        // mulps         %xmm9,%xmm3
-  .byte  15,88,223                           // addps         %xmm7,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_lerp_565_sse2
-_sk_lerp_565_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,126,4,120                 // movq          (%rax,%rdi,2),%xmm8
-  .byte  102,15,239,219                      // pxor          %xmm3,%xmm3
-  .byte  102,68,15,97,195                    // punpcklwd     %xmm3,%xmm8
-  .byte  102,15,110,90,104                   // movd          0x68(%rdx),%xmm3
-  .byte  102,15,112,219,0                    // pshufd        $0x0,%xmm3,%xmm3
-  .byte  102,65,15,219,216                   // pand          %xmm8,%xmm3
-  .byte  68,15,91,203                        // cvtdq2ps      %xmm3,%xmm9
-  .byte  243,15,16,26                        // movss         (%rdx),%xmm3
-  .byte  243,68,15,16,82,116                 // movss         0x74(%rdx),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  102,68,15,110,74,108                // movd          0x6c(%rdx),%xmm9
-  .byte  102,69,15,112,201,0                 // pshufd        $0x0,%xmm9,%xmm9
-  .byte  102,69,15,219,200                   // pand          %xmm8,%xmm9
-  .byte  69,15,91,201                        // cvtdq2ps      %xmm9,%xmm9
-  .byte  243,68,15,16,90,120                 // movss         0x78(%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,89,217                        // mulps         %xmm9,%xmm11
-  .byte  102,68,15,110,74,112                // movd          0x70(%rdx),%xmm9
-  .byte  102,69,15,112,201,0                 // pshufd        $0x0,%xmm9,%xmm9
-  .byte  102,69,15,219,200                   // pand          %xmm8,%xmm9
-  .byte  69,15,91,193                        // cvtdq2ps      %xmm9,%xmm8
-  .byte  243,68,15,16,74,124                 // movss         0x7c(%rdx),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  15,92,196                           // subps         %xmm4,%xmm0
-  .byte  65,15,89,194                        // mulps         %xmm10,%xmm0
-  .byte  15,88,196                           // addps         %xmm4,%xmm0
-  .byte  15,92,205                           // subps         %xmm5,%xmm1
-  .byte  65,15,89,203                        // mulps         %xmm11,%xmm1
-  .byte  15,88,205                           // addps         %xmm5,%xmm1
-  .byte  15,92,214                           // subps         %xmm6,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  15,88,214                           // addps         %xmm6,%xmm2
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_tables_sse2
-_sk_load_tables_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,8                            // mov           (%rax),%rcx
-  .byte  76,139,64,8                         // mov           0x8(%rax),%r8
-  .byte  243,68,15,111,4,185                 // movdqu        (%rcx,%rdi,4),%xmm8
-  .byte  102,15,110,66,16                    // movd          0x10(%rdx),%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
-  .byte  102,65,15,114,209,8                 // psrld         $0x8,%xmm9
-  .byte  102,68,15,219,200                   // pand          %xmm0,%xmm9
-  .byte  102,69,15,111,208                   // movdqa        %xmm8,%xmm10
-  .byte  102,65,15,114,210,16                // psrld         $0x10,%xmm10
-  .byte  102,68,15,219,208                   // pand          %xmm0,%xmm10
-  .byte  102,65,15,219,192                   // pand          %xmm8,%xmm0
-  .byte  102,15,112,216,78                   // pshufd        $0x4e,%xmm0,%xmm3
-  .byte  102,72,15,126,217                   // movq          %xmm3,%rcx
-  .byte  65,137,201                          // mov           %ecx,%r9d
-  .byte  72,193,233,32                       // shr           $0x20,%rcx
-  .byte  102,73,15,126,194                   // movq          %xmm0,%r10
-  .byte  69,137,211                          // mov           %r10d,%r11d
-  .byte  73,193,234,32                       // shr           $0x20,%r10
-  .byte  243,67,15,16,28,144                 // movss         (%r8,%r10,4),%xmm3
-  .byte  243,65,15,16,4,136                  // movss         (%r8,%rcx,4),%xmm0
-  .byte  15,20,216                           // unpcklps      %xmm0,%xmm3
-  .byte  243,67,15,16,4,152                  // movss         (%r8,%r11,4),%xmm0
-  .byte  243,67,15,16,12,136                 // movss         (%r8,%r9,4),%xmm1
-  .byte  15,20,193                           // unpcklps      %xmm1,%xmm0
-  .byte  15,20,195                           // unpcklps      %xmm3,%xmm0
-  .byte  72,139,72,16                        // mov           0x10(%rax),%rcx
-  .byte  102,65,15,112,201,78                // pshufd        $0x4e,%xmm9,%xmm1
-  .byte  102,73,15,126,200                   // movq          %xmm1,%r8
-  .byte  69,137,193                          // mov           %r8d,%r9d
-  .byte  73,193,232,32                       // shr           $0x20,%r8
-  .byte  102,77,15,126,202                   // movq          %xmm9,%r10
-  .byte  69,137,211                          // mov           %r10d,%r11d
-  .byte  73,193,234,32                       // shr           $0x20,%r10
-  .byte  243,66,15,16,28,145                 // movss         (%rcx,%r10,4),%xmm3
-  .byte  243,66,15,16,12,129                 // movss         (%rcx,%r8,4),%xmm1
-  .byte  15,20,217                           // unpcklps      %xmm1,%xmm3
-  .byte  243,66,15,16,12,153                 // movss         (%rcx,%r11,4),%xmm1
-  .byte  243,66,15,16,20,137                 // movss         (%rcx,%r9,4),%xmm2
-  .byte  15,20,202                           // unpcklps      %xmm2,%xmm1
-  .byte  15,20,203                           // unpcklps      %xmm3,%xmm1
-  .byte  72,139,64,24                        // mov           0x18(%rax),%rax
-  .byte  102,65,15,112,210,78                // pshufd        $0x4e,%xmm10,%xmm2
-  .byte  102,72,15,126,209                   // movq          %xmm2,%rcx
-  .byte  65,137,200                          // mov           %ecx,%r8d
-  .byte  72,193,233,32                       // shr           $0x20,%rcx
-  .byte  102,77,15,126,209                   // movq          %xmm10,%r9
-  .byte  69,137,202                          // mov           %r9d,%r10d
-  .byte  73,193,233,32                       // shr           $0x20,%r9
-  .byte  243,70,15,16,12,136                 // movss         (%rax,%r9,4),%xmm9
-  .byte  243,15,16,20,136                    // movss         (%rax,%rcx,4),%xmm2
-  .byte  68,15,20,202                        // unpcklps      %xmm2,%xmm9
-  .byte  243,66,15,16,20,144                 // movss         (%rax,%r10,4),%xmm2
-  .byte  243,66,15,16,28,128                 // movss         (%rax,%r8,4),%xmm3
-  .byte  15,20,211                           // unpcklps      %xmm3,%xmm2
-  .byte  65,15,20,209                        // unpcklps      %xmm9,%xmm2
-  .byte  102,65,15,114,208,24                // psrld         $0x18,%xmm8
-  .byte  69,15,91,192                        // cvtdq2ps      %xmm8,%xmm8
-  .byte  243,15,16,90,12                     // movss         0xc(%rdx),%xmm3
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_a8_sse2
-_sk_load_a8_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,15,110,4,56                     // movd          (%rax,%rdi,1),%xmm0
-  .byte  102,15,239,201                      // pxor          %xmm1,%xmm1
-  .byte  102,15,96,193                       // punpcklbw     %xmm1,%xmm0
-  .byte  102,15,97,193                       // punpcklwd     %xmm1,%xmm0
-  .byte  15,91,192                           // cvtdq2ps      %xmm0,%xmm0
-  .byte  243,15,16,90,12                     // movss         0xc(%rdx),%xmm3
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  15,89,216                           // mulps         %xmm0,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  15,87,192                           // xorps         %xmm0,%xmm0
-  .byte  102,15,239,201                      // pxor          %xmm1,%xmm1
-  .byte  15,87,210                           // xorps         %xmm2,%xmm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_a8_sse2
-_sk_store_a8_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,16,66,8                   // movss         0x8(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,69,15,91,192                    // cvtps2dq      %xmm8,%xmm8
-  .byte  102,65,15,114,240,16                // pslld         $0x10,%xmm8
-  .byte  102,65,15,114,224,16                // psrad         $0x10,%xmm8
-  .byte  102,69,15,107,192                   // packssdw      %xmm8,%xmm8
-  .byte  102,69,15,103,192                   // packuswb      %xmm8,%xmm8
-  .byte  102,68,15,126,4,56                  // movd          %xmm8,(%rax,%rdi,1)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_565_sse2
-_sk_load_565_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,126,12,120                // movq          (%rax,%rdi,2),%xmm9
-  .byte  102,15,239,192                      // pxor          %xmm0,%xmm0
-  .byte  102,68,15,97,200                    // punpcklwd     %xmm0,%xmm9
-  .byte  102,15,110,66,104                   // movd          0x68(%rdx),%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  102,65,15,219,193                   // pand          %xmm9,%xmm0
-  .byte  15,91,200                           // cvtdq2ps      %xmm0,%xmm1
-  .byte  243,15,16,26                        // movss         (%rdx),%xmm3
-  .byte  243,15,16,66,116                    // movss         0x74(%rdx),%xmm0
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  15,89,193                           // mulps         %xmm1,%xmm0
-  .byte  102,15,110,74,108                   // movd          0x6c(%rdx),%xmm1
-  .byte  102,15,112,201,0                    // pshufd        $0x0,%xmm1,%xmm1
-  .byte  102,65,15,219,201                   // pand          %xmm9,%xmm1
-  .byte  68,15,91,193                        // cvtdq2ps      %xmm1,%xmm8
-  .byte  243,15,16,74,120                    // movss         0x78(%rdx),%xmm1
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  102,15,110,82,112                   // movd          0x70(%rdx),%xmm2
-  .byte  102,15,112,210,0                    // pshufd        $0x0,%xmm2,%xmm2
-  .byte  102,65,15,219,209                   // pand          %xmm9,%xmm2
-  .byte  68,15,91,194                        // cvtdq2ps      %xmm2,%xmm8
-  .byte  243,15,16,82,124                    // movss         0x7c(%rdx),%xmm2
-  .byte  15,198,210,0                        // shufps        $0x0,%xmm2,%xmm2
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  15,198,219,0                        // shufps        $0x0,%xmm3,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_565_sse2
-_sk_store_565_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,16,130,128,0,0,0          // movss         0x80(%rdx),%xmm8
-  .byte  243,68,15,16,138,132,0,0,0          // movss         0x84(%rdx),%xmm9
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  69,15,40,208                        // movaps        %xmm8,%xmm10
-  .byte  68,15,89,208                        // mulps         %xmm0,%xmm10
-  .byte  102,69,15,91,210                    // cvtps2dq      %xmm10,%xmm10
-  .byte  102,65,15,114,242,11                // pslld         $0xb,%xmm10
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  102,69,15,91,201                    // cvtps2dq      %xmm9,%xmm9
-  .byte  102,65,15,114,241,5                 // pslld         $0x5,%xmm9
-  .byte  102,69,15,235,202                   // por           %xmm10,%xmm9
-  .byte  68,15,89,194                        // mulps         %xmm2,%xmm8
-  .byte  102,69,15,91,192                    // cvtps2dq      %xmm8,%xmm8
-  .byte  102,69,15,86,193                    // orpd          %xmm9,%xmm8
-  .byte  102,65,15,114,240,16                // pslld         $0x10,%xmm8
-  .byte  102,65,15,114,224,16                // psrad         $0x10,%xmm8
-  .byte  102,69,15,107,192                   // packssdw      %xmm8,%xmm8
-  .byte  102,68,15,214,4,120                 // movq          %xmm8,(%rax,%rdi,2)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_8888_sse2
-_sk_load_8888_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,15,111,28,184                   // movdqu        (%rax,%rdi,4),%xmm3
-  .byte  102,15,110,66,16                    // movd          0x10(%rdx),%xmm0
-  .byte  102,15,112,192,0                    // pshufd        $0x0,%xmm0,%xmm0
-  .byte  102,15,111,203                      // movdqa        %xmm3,%xmm1
-  .byte  102,15,114,209,8                    // psrld         $0x8,%xmm1
-  .byte  102,15,219,200                      // pand          %xmm0,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,15,114,210,16                   // psrld         $0x10,%xmm2
-  .byte  102,15,219,208                      // pand          %xmm0,%xmm2
-  .byte  102,15,219,195                      // pand          %xmm3,%xmm0
-  .byte  15,91,192                           // cvtdq2ps      %xmm0,%xmm0
-  .byte  243,68,15,16,66,12                  // movss         0xc(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  15,91,201                           // cvtdq2ps      %xmm1,%xmm1
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  15,91,210                           // cvtdq2ps      %xmm2,%xmm2
-  .byte  65,15,89,208                        // mulps         %xmm8,%xmm2
-  .byte  102,15,114,211,24                   // psrld         $0x18,%xmm3
-  .byte  15,91,219                           // cvtdq2ps      %xmm3,%xmm3
-  .byte  65,15,89,216                        // mulps         %xmm8,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_8888_sse2
-_sk_store_8888_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,68,15,16,66,8                   // movss         0x8(%rdx),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,200                        // mulps         %xmm0,%xmm9
-  .byte  102,69,15,91,201                    // cvtps2dq      %xmm9,%xmm9
-  .byte  69,15,40,208                        // movaps        %xmm8,%xmm10
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  102,69,15,91,210                    // cvtps2dq      %xmm10,%xmm10
-  .byte  102,65,15,114,242,8                 // pslld         $0x8,%xmm10
-  .byte  102,69,15,235,209                   // por           %xmm9,%xmm10
-  .byte  69,15,40,200                        // movaps        %xmm8,%xmm9
-  .byte  68,15,89,202                        // mulps         %xmm2,%xmm9
-  .byte  102,69,15,91,201                    // cvtps2dq      %xmm9,%xmm9
-  .byte  102,65,15,114,241,16                // pslld         $0x10,%xmm9
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,69,15,91,192                    // cvtps2dq      %xmm8,%xmm8
-  .byte  102,65,15,114,240,24                // pslld         $0x18,%xmm8
-  .byte  102,69,15,235,193                   // por           %xmm9,%xmm8
-  .byte  102,69,15,235,194                   // por           %xmm10,%xmm8
-  .byte  243,68,15,127,4,184                 // movdqu        %xmm8,(%rax,%rdi,4)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_load_f16_sse2
-_sk_load_f16_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  243,15,111,4,248                    // movdqu        (%rax,%rdi,8),%xmm0
-  .byte  243,15,111,76,248,16                // movdqu        0x10(%rax,%rdi,8),%xmm1
-  .byte  102,15,111,208                      // movdqa        %xmm0,%xmm2
-  .byte  102,15,97,209                       // punpcklwd     %xmm1,%xmm2
-  .byte  102,15,105,193                      // punpckhwd     %xmm1,%xmm0
-  .byte  102,68,15,111,194                   // movdqa        %xmm2,%xmm8
-  .byte  102,68,15,97,192                    // punpcklwd     %xmm0,%xmm8
-  .byte  102,15,105,208                      // punpckhwd     %xmm0,%xmm2
-  .byte  102,15,110,66,100                   // movd          0x64(%rdx),%xmm0
-  .byte  102,15,112,216,0                    // pshufd        $0x0,%xmm0,%xmm3
-  .byte  102,15,111,203                      // movdqa        %xmm3,%xmm1
-  .byte  102,65,15,101,200                   // pcmpgtw       %xmm8,%xmm1
-  .byte  102,65,15,223,200                   // pandn         %xmm8,%xmm1
-  .byte  102,15,101,218                      // pcmpgtw       %xmm2,%xmm3
-  .byte  102,15,223,218                      // pandn         %xmm2,%xmm3
-  .byte  102,69,15,239,192                   // pxor          %xmm8,%xmm8
-  .byte  102,15,111,193                      // movdqa        %xmm1,%xmm0
-  .byte  102,65,15,97,192                    // punpcklwd     %xmm8,%xmm0
-  .byte  102,15,114,240,13                   // pslld         $0xd,%xmm0
-  .byte  102,15,110,82,92                    // movd          0x5c(%rdx),%xmm2
-  .byte  102,68,15,112,202,0                 // pshufd        $0x0,%xmm2,%xmm9
-  .byte  65,15,89,193                        // mulps         %xmm9,%xmm0
-  .byte  102,65,15,105,200                   // punpckhwd     %xmm8,%xmm1
-  .byte  102,15,114,241,13                   // pslld         $0xd,%xmm1
-  .byte  65,15,89,201                        // mulps         %xmm9,%xmm1
-  .byte  102,15,111,211                      // movdqa        %xmm3,%xmm2
-  .byte  102,65,15,97,208                    // punpcklwd     %xmm8,%xmm2
-  .byte  102,15,114,242,13                   // pslld         $0xd,%xmm2
-  .byte  65,15,89,209                        // mulps         %xmm9,%xmm2
-  .byte  102,65,15,105,216                   // punpckhwd     %xmm8,%xmm3
-  .byte  102,15,114,243,13                   // pslld         $0xd,%xmm3
-  .byte  65,15,89,217                        // mulps         %xmm9,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_f16_sse2
-_sk_store_f16_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  102,68,15,110,66,96                 // movd          0x60(%rdx),%xmm8
-  .byte  102,69,15,112,192,0                 // pshufd        $0x0,%xmm8,%xmm8
-  .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
-  .byte  68,15,89,200                        // mulps         %xmm0,%xmm9
-  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
-  .byte  102,69,15,111,208                   // movdqa        %xmm8,%xmm10
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  102,65,15,114,210,13                // psrld         $0xd,%xmm10
-  .byte  102,69,15,111,216                   // movdqa        %xmm8,%xmm11
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  102,65,15,114,211,13                // psrld         $0xd,%xmm11
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
-  .byte  102,65,15,115,250,2                 // pslldq        $0x2,%xmm10
-  .byte  102,69,15,235,209                   // por           %xmm9,%xmm10
-  .byte  102,65,15,115,248,2                 // pslldq        $0x2,%xmm8
-  .byte  102,69,15,235,195                   // por           %xmm11,%xmm8
-  .byte  102,69,15,111,202                   // movdqa        %xmm10,%xmm9
-  .byte  102,69,15,98,200                    // punpckldq     %xmm8,%xmm9
-  .byte  243,68,15,127,12,248                // movdqu        %xmm9,(%rax,%rdi,8)
-  .byte  102,69,15,106,208                   // punpckhdq     %xmm8,%xmm10
-  .byte  243,68,15,127,84,248,16             // movdqu        %xmm10,0x10(%rax,%rdi,8)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_store_f32_sse2
-_sk_store_f32_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  72,139,0                            // mov           (%rax),%rax
-  .byte  72,137,249                          // mov           %rdi,%rcx
-  .byte  72,193,225,4                        // shl           $0x4,%rcx
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  68,15,40,200                        // movaps        %xmm0,%xmm9
-  .byte  68,15,20,201                        // unpcklps      %xmm1,%xmm9
-  .byte  68,15,40,210                        // movaps        %xmm2,%xmm10
-  .byte  68,15,40,218                        // movaps        %xmm2,%xmm11
-  .byte  68,15,20,219                        // unpcklps      %xmm3,%xmm11
-  .byte  68,15,21,193                        // unpckhps      %xmm1,%xmm8
-  .byte  68,15,21,211                        // unpckhps      %xmm3,%xmm10
-  .byte  69,15,40,225                        // movaps        %xmm9,%xmm12
-  .byte  102,69,15,20,227                    // unpcklpd      %xmm11,%xmm12
-  .byte  102,69,15,21,203                    // unpckhpd      %xmm11,%xmm9
-  .byte  69,15,40,216                        // movaps        %xmm8,%xmm11
-  .byte  102,69,15,20,218                    // unpcklpd      %xmm10,%xmm11
-  .byte  102,69,15,21,194                    // unpckhpd      %xmm10,%xmm8
-  .byte  102,68,15,17,36,8                   // movupd        %xmm12,(%rax,%rcx,1)
-  .byte  102,68,15,17,76,8,16                // movupd        %xmm9,0x10(%rax,%rcx,1)
-  .byte  102,68,15,17,92,8,32                // movupd        %xmm11,0x20(%rax,%rcx,1)
-  .byte  102,68,15,17,68,8,48                // movupd        %xmm8,0x30(%rax,%rcx,1)
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_x_sse2
-_sk_clamp_x_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  68,15,95,192                        // maxps         %xmm0,%xmm8
-  .byte  243,68,15,16,8                      // movss         (%rax),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  102,15,118,192                      // pcmpeqd       %xmm0,%xmm0
-  .byte  102,65,15,254,193                   // paddd         %xmm9,%xmm0
-  .byte  68,15,93,192                        // minps         %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_clamp_y_sse2
-_sk_clamp_y_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  69,15,87,192                        // xorps         %xmm8,%xmm8
-  .byte  68,15,95,193                        // maxps         %xmm1,%xmm8
-  .byte  243,68,15,16,8                      // movss         (%rax),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  102,15,118,201                      // pcmpeqd       %xmm1,%xmm1
-  .byte  102,65,15,254,201                   // paddd         %xmm9,%xmm1
-  .byte  68,15,93,193                        // minps         %xmm1,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,200                        // movaps        %xmm8,%xmm1
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_x_sse2
-_sk_repeat_x_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,40,200                        // movaps        %xmm0,%xmm9
-  .byte  69,15,94,200                        // divps         %xmm8,%xmm9
-  .byte  243,69,15,91,209                    // cvttps2dq     %xmm9,%xmm10
-  .byte  69,15,91,210                        // cvtdq2ps      %xmm10,%xmm10
-  .byte  69,15,194,202,1                     // cmpltps       %xmm10,%xmm9
-  .byte  243,68,15,16,26                     // movss         (%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,84,217                        // andps         %xmm9,%xmm11
-  .byte  69,15,92,211                        // subps         %xmm11,%xmm10
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  65,15,92,194                        // subps         %xmm10,%xmm0
-  .byte  102,69,15,118,201                   // pcmpeqd       %xmm9,%xmm9
-  .byte  102,69,15,254,200                   // paddd         %xmm8,%xmm9
-  .byte  65,15,93,193                        // minps         %xmm9,%xmm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_repeat_y_sse2
-_sk_repeat_y_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,0                      // movss         (%rax),%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  68,15,40,201                        // movaps        %xmm1,%xmm9
-  .byte  69,15,94,200                        // divps         %xmm8,%xmm9
-  .byte  243,69,15,91,209                    // cvttps2dq     %xmm9,%xmm10
-  .byte  69,15,91,210                        // cvtdq2ps      %xmm10,%xmm10
-  .byte  69,15,194,202,1                     // cmpltps       %xmm10,%xmm9
-  .byte  243,68,15,16,26                     // movss         (%rdx),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,84,217                        // andps         %xmm9,%xmm11
-  .byte  69,15,92,211                        // subps         %xmm11,%xmm10
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  65,15,92,202                        // subps         %xmm10,%xmm1
-  .byte  102,69,15,118,201                   // pcmpeqd       %xmm9,%xmm9
-  .byte  102,69,15,254,200                   // paddd         %xmm8,%xmm9
-  .byte  65,15,93,201                        // minps         %xmm9,%xmm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_x_sse2
-_sk_mirror_x_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,8                      // movss         (%rax),%xmm9
-  .byte  69,15,40,193                        // movaps        %xmm9,%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,92,192                        // subps         %xmm8,%xmm0
-  .byte  243,69,15,88,201                    // addss         %xmm9,%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  68,15,40,208                        // movaps        %xmm0,%xmm10
-  .byte  69,15,94,209                        // divps         %xmm9,%xmm10
-  .byte  243,69,15,91,218                    // cvttps2dq     %xmm10,%xmm11
-  .byte  69,15,91,219                        // cvtdq2ps      %xmm11,%xmm11
-  .byte  69,15,194,211,1                     // cmpltps       %xmm11,%xmm10
-  .byte  243,68,15,16,34                     // movss         (%rdx),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  69,15,84,226                        // andps         %xmm10,%xmm12
-  .byte  69,15,87,210                        // xorps         %xmm10,%xmm10
-  .byte  69,15,92,220                        // subps         %xmm12,%xmm11
-  .byte  69,15,89,217                        // mulps         %xmm9,%xmm11
-  .byte  65,15,92,195                        // subps         %xmm11,%xmm0
-  .byte  65,15,92,192                        // subps         %xmm8,%xmm0
-  .byte  68,15,92,208                        // subps         %xmm0,%xmm10
-  .byte  65,15,84,194                        // andps         %xmm10,%xmm0
-  .byte  102,69,15,118,201                   // pcmpeqd       %xmm9,%xmm9
-  .byte  102,69,15,254,200                   // paddd         %xmm8,%xmm9
-  .byte  65,15,93,193                        // minps         %xmm9,%xmm0
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_mirror_y_sse2
-_sk_mirror_y_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,68,15,16,8                      // movss         (%rax),%xmm9
-  .byte  69,15,40,193                        // movaps        %xmm9,%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,92,200                        // subps         %xmm8,%xmm1
-  .byte  243,69,15,88,201                    // addss         %xmm9,%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  68,15,40,209                        // movaps        %xmm1,%xmm10
-  .byte  69,15,94,209                        // divps         %xmm9,%xmm10
-  .byte  243,69,15,91,218                    // cvttps2dq     %xmm10,%xmm11
-  .byte  69,15,91,219                        // cvtdq2ps      %xmm11,%xmm11
-  .byte  69,15,194,211,1                     // cmpltps       %xmm11,%xmm10
-  .byte  243,68,15,16,34                     // movss         (%rdx),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  69,15,84,226                        // andps         %xmm10,%xmm12
-  .byte  69,15,87,210                        // xorps         %xmm10,%xmm10
-  .byte  69,15,92,220                        // subps         %xmm12,%xmm11
-  .byte  69,15,89,217                        // mulps         %xmm9,%xmm11
-  .byte  65,15,92,203                        // subps         %xmm11,%xmm1
-  .byte  65,15,92,200                        // subps         %xmm8,%xmm1
-  .byte  68,15,92,209                        // subps         %xmm1,%xmm10
-  .byte  65,15,84,202                        // andps         %xmm10,%xmm1
-  .byte  102,69,15,118,201                   // pcmpeqd       %xmm9,%xmm9
-  .byte  102,69,15,254,200                   // paddd         %xmm8,%xmm9
-  .byte  65,15,93,201                        // minps         %xmm9,%xmm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_2x3_sse2
-_sk_matrix_2x3_sse2:
-  .byte  68,15,40,201                        // movaps        %xmm1,%xmm9
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,15,16,72,4                      // movss         0x4(%rax),%xmm1
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,16                  // movss         0x10(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,88,194                        // addps         %xmm10,%xmm0
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  243,68,15,16,80,12                  // movss         0xc(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  65,15,88,202                        // addps         %xmm10,%xmm1
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_3x4_sse2
-_sk_matrix_3x4_sse2:
-  .byte  68,15,40,201                        // movaps        %xmm1,%xmm9
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,15,16,72,4                      // movss         0x4(%rax),%xmm1
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  243,68,15,16,80,12                  // movss         0xc(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,24                  // movss         0x18(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,36                  // movss         0x24(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,88,194                        // addps         %xmm10,%xmm0
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  243,68,15,16,80,16                  // movss         0x10(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,28                  // movss         0x1c(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,40                  // movss         0x28(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,209                        // mulps         %xmm9,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,89,200                        // mulps         %xmm8,%xmm1
-  .byte  65,15,88,202                        // addps         %xmm10,%xmm1
-  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,32                  // movss         0x20(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  243,68,15,16,104,44                 // movss         0x2c(%rax),%xmm13
-  .byte  69,15,198,237,0                     // shufps        $0x0,%xmm13,%xmm13
-  .byte  68,15,89,226                        // mulps         %xmm2,%xmm12
-  .byte  69,15,88,229                        // addps         %xmm13,%xmm12
-  .byte  69,15,89,217                        // mulps         %xmm9,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,210                        // movaps        %xmm10,%xmm2
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_matrix_perspective_sse2
-_sk_matrix_perspective_sse2:
-  .byte  68,15,40,192                        // movaps        %xmm0,%xmm8
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  243,15,16,0                         // movss         (%rax),%xmm0
-  .byte  243,68,15,16,72,4                   // movss         0x4(%rax),%xmm9
-  .byte  15,198,192,0                        // shufps        $0x0,%xmm0,%xmm0
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  243,68,15,16,80,8                   // movss         0x8(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  69,15,88,202                        // addps         %xmm10,%xmm9
-  .byte  65,15,89,192                        // mulps         %xmm8,%xmm0
-  .byte  65,15,88,193                        // addps         %xmm9,%xmm0
-  .byte  243,68,15,16,72,12                  // movss         0xc(%rax),%xmm9
-  .byte  69,15,198,201,0                     // shufps        $0x0,%xmm9,%xmm9
-  .byte  243,68,15,16,80,16                  // movss         0x10(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,20                  // movss         0x14(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  69,15,89,200                        // mulps         %xmm8,%xmm9
-  .byte  69,15,88,202                        // addps         %xmm10,%xmm9
-  .byte  243,68,15,16,80,24                  // movss         0x18(%rax),%xmm10
-  .byte  69,15,198,210,0                     // shufps        $0x0,%xmm10,%xmm10
-  .byte  243,68,15,16,88,28                  // movss         0x1c(%rax),%xmm11
-  .byte  69,15,198,219,0                     // shufps        $0x0,%xmm11,%xmm11
-  .byte  243,68,15,16,96,32                  // movss         0x20(%rax),%xmm12
-  .byte  69,15,198,228,0                     // shufps        $0x0,%xmm12,%xmm12
-  .byte  68,15,89,217                        // mulps         %xmm1,%xmm11
-  .byte  69,15,88,220                        // addps         %xmm12,%xmm11
-  .byte  69,15,89,208                        // mulps         %xmm8,%xmm10
-  .byte  69,15,88,211                        // addps         %xmm11,%xmm10
-  .byte  65,15,83,202                        // rcpps         %xmm10,%xmm1
-  .byte  15,89,193                           // mulps         %xmm1,%xmm0
-  .byte  68,15,89,201                        // mulps         %xmm1,%xmm9
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,201                        // movaps        %xmm9,%xmm1
-  .byte  255,224                             // jmpq          *%rax
-
-.globl _sk_linear_gradient_2stops_sse2
-_sk_linear_gradient_2stops_sse2:
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  68,15,16,8                          // movups        (%rax),%xmm9
-  .byte  15,16,88,16                         // movups        0x10(%rax),%xmm3
-  .byte  68,15,40,195                        // movaps        %xmm3,%xmm8
-  .byte  69,15,198,192,0                     // shufps        $0x0,%xmm8,%xmm8
-  .byte  65,15,40,201                        // movaps        %xmm9,%xmm1
-  .byte  15,198,201,0                        // shufps        $0x0,%xmm1,%xmm1
-  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
-  .byte  68,15,88,193                        // addps         %xmm1,%xmm8
-  .byte  15,40,203                           // movaps        %xmm3,%xmm1
-  .byte  15,198,201,85                       // shufps        $0x55,%xmm1,%xmm1
-  .byte  65,15,40,209                        // movaps        %xmm9,%xmm2
-  .byte  15,198,210,85                       // shufps        $0x55,%xmm2,%xmm2
-  .byte  15,89,200                           // mulps         %xmm0,%xmm1
-  .byte  15,88,202                           // addps         %xmm2,%xmm1
-  .byte  15,40,211                           // movaps        %xmm3,%xmm2
-  .byte  15,198,210,170                      // shufps        $0xaa,%xmm2,%xmm2
-  .byte  69,15,40,209                        // movaps        %xmm9,%xmm10
-  .byte  69,15,198,210,170                   // shufps        $0xaa,%xmm10,%xmm10
-  .byte  15,89,208                           // mulps         %xmm0,%xmm2
-  .byte  65,15,88,210                        // addps         %xmm10,%xmm2
-  .byte  15,198,219,255                      // shufps        $0xff,%xmm3,%xmm3
-  .byte  69,15,198,201,255                   // shufps        $0xff,%xmm9,%xmm9
-  .byte  15,89,216                           // mulps         %xmm0,%xmm3
-  .byte  65,15,88,217                        // addps         %xmm9,%xmm3
-  .byte  72,173                              // lods          %ds:(%rsi),%rax
-  .byte  65,15,40,192                        // movaps        %xmm8,%xmm0
-  .byte  255,224                             // jmpq          *%rax
-#endif
diff --git a/src/jumper/SkJumper_generated.cpp b/src/jumper/SkJumper_generated.cpp
new file mode 100644 (file)
index 0000000..234cbc7
--- /dev/null
@@ -0,0 +1,11511 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+// This file is generated semi-automatically with this command:
+//   $ src/jumper/build_stages.py
+
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+    #pragma section("code", read,execute)
+    #define CODE extern "C" __declspec(allocate("code"))
+#elif defined(__MACH__)
+    #define CODE extern "C" __attribute__((section("__TEXT,__text")))
+#else
+    #define CODE extern "C" __attribute__((section(".text")))
+#endif
+
+#if defined(__aarch64__)
+
+CODE const uint32_t sk_start_pipeline_aarch64[] = {
+  0xa9bd5bf7,                             //stp           x23, x22, [sp, #-48]!
+  0xa90153f5,                             //stp           x21, x20, [sp, #16]
+  0xa9027bf3,                             //stp           x19, x30, [sp, #32]
+  0xaa0103f5,                             //mov           x21, x1
+  0xf84086b7,                             //ldr           x23, [x21], #8
+  0xaa0003f6,                             //mov           x22, x0
+  0xaa0303f3,                             //mov           x19, x3
+  0xaa0203f4,                             //mov           x20, x2
+  0x910012c8,                             //add           x8, x22, #0x4
+  0xeb13011f,                             //cmp           x8, x19
+  0x54000069,                             //b.ls          34 <sk_start_pipeline_aarch64+0x34>  // b.plast
+  0xaa1603e0,                             //mov           x0, x22
+  0x14000012,                             //b             78 <sk_start_pipeline_aarch64+0x78>
+  0x6f00e400,                             //movi          v0.2d, #0x0
+  0x6f00e401,                             //movi          v1.2d, #0x0
+  0x6f00e402,                             //movi          v2.2d, #0x0
+  0x6f00e403,                             //movi          v3.2d, #0x0
+  0x6f00e404,                             //movi          v4.2d, #0x0
+  0x6f00e405,                             //movi          v5.2d, #0x0
+  0x6f00e406,                             //movi          v6.2d, #0x0
+  0x6f00e407,                             //movi          v7.2d, #0x0
+  0xaa1603e0,                             //mov           x0, x22
+  0xaa1503e1,                             //mov           x1, x21
+  0xaa1403e2,                             //mov           x2, x20
+  0xd63f02e0,                             //blr           x23
+  0x910022c8,                             //add           x8, x22, #0x8
+  0x910012c0,                             //add           x0, x22, #0x4
+  0xeb13011f,                             //cmp           x8, x19
+  0xaa0003f6,                             //mov           x22, x0
+  0x54fffe09,                             //b.ls          34 <sk_start_pipeline_aarch64+0x34>  // b.plast
+  0xa9427bf3,                             //ldp           x19, x30, [sp, #32]
+  0xa94153f5,                             //ldp           x21, x20, [sp, #16]
+  0xa8c35bf7,                             //ldp           x23, x22, [sp], #48
+  0xd65f03c0,                             //ret
+};
+
+CODE const uint32_t sk_just_return_aarch64[] = {
+  0xd65f03c0,                             //ret
+};
+
+CODE const uint32_t sk_seed_shader_aarch64[] = {
+  0xaa0203e9,                             //mov           x9, x2
+  0xa9400c28,                             //ldp           x8, x3, [x1]
+  0x4ddfc922,                             //ld1r          {v2.4s}, [x9], #4
+  0x3cc14047,                             //ldur          q7, [x2, #20]
+  0x4e040c00,                             //dup           v0.4s, w0
+  0x4d40c901,                             //ld1r          {v1.4s}, [x8]
+  0x4d40c926,                             //ld1r          {v6.4s}, [x9]
+  0x4e21d800,                             //scvtf         v0.4s, v0.4s
+  0x91004028,                             //add           x8, x1, #0x10
+  0x4e21d821,                             //scvtf         v1.4s, v1.4s
+  0x4e26d400,                             //fadd          v0.4s, v0.4s, v6.4s
+  0x6f00e403,                             //movi          v3.2d, #0x0
+  0x6f00e404,                             //movi          v4.2d, #0x0
+  0x6f00e405,                             //movi          v5.2d, #0x0
+  0x4e26d421,                             //fadd          v1.4s, v1.4s, v6.4s
+  0x6f00e406,                             //movi          v6.2d, #0x0
+  0x4e20d4e0,                             //fadd          v0.4s, v7.4s, v0.4s
+  0x6f00e407,                             //movi          v7.2d, #0x0
+  0xaa0803e1,                             //mov           x1, x8
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_constant_color_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0x3dc00103,                             //ldr           q3, [x8]
+  0x4e040460,                             //dup           v0.4s, v3.s[0]
+  0x4e0c0461,                             //dup           v1.4s, v3.s[1]
+  0x4e140462,                             //dup           v2.4s, v3.s[2]
+  0x4e1c0463,                             //dup           v3.4s, v3.s[3]
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_clear_aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x6f00e400,                             //movi          v0.2d, #0x0
+  0x6f00e401,                             //movi          v1.2d, #0x0
+  0x6f00e402,                             //movi          v2.2d, #0x0
+  0x6f00e403,                             //movi          v3.2d, #0x0
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_plus__aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4e24d400,                             //fadd          v0.4s, v0.4s, v4.4s
+  0x4e25d421,                             //fadd          v1.4s, v1.4s, v5.4s
+  0x4e26d442,                             //fadd          v2.4s, v2.4s, v6.4s
+  0x4e27d463,                             //fadd          v3.4s, v3.4s, v7.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_srcover_aarch64[] = {
+  0x4d40c850,                             //ld1r          {v16.4s}, [x2]
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4ea3d610,                             //fsub          v16.4s, v16.4s, v3.4s
+  0x4e24ce00,                             //fmla          v0.4s, v16.4s, v4.4s
+  0x4e25ce01,                             //fmla          v1.4s, v16.4s, v5.4s
+  0x4e26ce02,                             //fmla          v2.4s, v16.4s, v6.4s
+  0x4e27ce03,                             //fmla          v3.4s, v16.4s, v7.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_dstover_aarch64[] = {
+  0x4d40c851,                             //ld1r          {v17.4s}, [x2]
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4ea41c90,                             //mov           v16.16b, v4.16b
+  0x4ea61cd2,                             //mov           v18.16b, v6.16b
+  0x4ea7d634,                             //fsub          v20.4s, v17.4s, v7.4s
+  0x4ea51cb1,                             //mov           v17.16b, v5.16b
+  0x4ea71cf3,                             //mov           v19.16b, v7.16b
+  0x4e20ce90,                             //fmla          v16.4s, v20.4s, v0.4s
+  0x4e21ce91,                             //fmla          v17.4s, v20.4s, v1.4s
+  0x4e22ce92,                             //fmla          v18.4s, v20.4s, v2.4s
+  0x4e23ce93,                             //fmla          v19.4s, v20.4s, v3.4s
+  0x4eb01e00,                             //mov           v0.16b, v16.16b
+  0x4eb11e21,                             //mov           v1.16b, v17.16b
+  0x4eb21e42,                             //mov           v2.16b, v18.16b
+  0x4eb31e63,                             //mov           v3.16b, v19.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_clamp_0_aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x6f00e410,                             //movi          v16.2d, #0x0
+  0x4e30f400,                             //fmax          v0.4s, v0.4s, v16.4s
+  0x4e30f421,                             //fmax          v1.4s, v1.4s, v16.4s
+  0x4e30f442,                             //fmax          v2.4s, v2.4s, v16.4s
+  0x4e30f463,                             //fmax          v3.4s, v3.4s, v16.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_clamp_1_aarch64[] = {
+  0x4d40c850,                             //ld1r          {v16.4s}, [x2]
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4eb0f400,                             //fmin          v0.4s, v0.4s, v16.4s
+  0x4eb0f421,                             //fmin          v1.4s, v1.4s, v16.4s
+  0x4eb0f442,                             //fmin          v2.4s, v2.4s, v16.4s
+  0x4eb0f463,                             //fmin          v3.4s, v3.4s, v16.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_clamp_a_aarch64[] = {
+  0x4d40c850,                             //ld1r          {v16.4s}, [x2]
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4eb0f463,                             //fmin          v3.4s, v3.4s, v16.4s
+  0x4ea3f400,                             //fmin          v0.4s, v0.4s, v3.4s
+  0x4ea3f421,                             //fmin          v1.4s, v1.4s, v3.4s
+  0x4ea3f442,                             //fmin          v2.4s, v2.4s, v3.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_set_rgb_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xaa0803e9,                             //mov           x9, x8
+  0x4ddfc920,                             //ld1r          {v0.4s}, [x9], #4
+  0x91002108,                             //add           x8, x8, #0x8
+  0x4d40c902,                             //ld1r          {v2.4s}, [x8]
+  0x4d40c921,                             //ld1r          {v1.4s}, [x9]
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_swap_rb_aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4ea01c10,                             //mov           v16.16b, v0.16b
+  0x4ea21c40,                             //mov           v0.16b, v2.16b
+  0x4eb01e02,                             //mov           v2.16b, v16.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_swap_aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4ea31c70,                             //mov           v16.16b, v3.16b
+  0x4ea21c51,                             //mov           v17.16b, v2.16b
+  0x4ea11c32,                             //mov           v18.16b, v1.16b
+  0x4ea01c13,                             //mov           v19.16b, v0.16b
+  0x4ea41c80,                             //mov           v0.16b, v4.16b
+  0x4ea51ca1,                             //mov           v1.16b, v5.16b
+  0x4ea61cc2,                             //mov           v2.16b, v6.16b
+  0x4ea71ce3,                             //mov           v3.16b, v7.16b
+  0x4eb31e64,                             //mov           v4.16b, v19.16b
+  0x4eb21e45,                             //mov           v5.16b, v18.16b
+  0x4eb11e26,                             //mov           v6.16b, v17.16b
+  0x4eb01e07,                             //mov           v7.16b, v16.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_move_src_dst_aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4ea01c04,                             //mov           v4.16b, v0.16b
+  0x4ea11c25,                             //mov           v5.16b, v1.16b
+  0x4ea21c46,                             //mov           v6.16b, v2.16b
+  0x4ea31c67,                             //mov           v7.16b, v3.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_move_dst_src_aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4ea41c80,                             //mov           v0.16b, v4.16b
+  0x4ea51ca1,                             //mov           v1.16b, v5.16b
+  0x4ea61cc2,                             //mov           v2.16b, v6.16b
+  0x4ea71ce3,                             //mov           v3.16b, v7.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_premul_aarch64[] = {
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x6e23dc00,                             //fmul          v0.4s, v0.4s, v3.4s
+  0x6e23dc21,                             //fmul          v1.4s, v1.4s, v3.4s
+  0x6e23dc42,                             //fmul          v2.4s, v2.4s, v3.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_unpremul_aarch64[] = {
+  0x4d40c850,                             //ld1r          {v16.4s}, [x2]
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4ea0d871,                             //fcmeq         v17.4s, v3.4s, #0.0
+  0x6e23fe10,                             //fdiv          v16.4s, v16.4s, v3.4s
+  0x4e711e10,                             //bic           v16.16b, v16.16b, v17.16b
+  0x6e20de00,                             //fmul          v0.4s, v16.4s, v0.4s
+  0x6e21de01,                             //fmul          v1.4s, v16.4s, v1.4s
+  0x6e22de02,                             //fmul          v2.4s, v16.4s, v2.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_from_srgb_aarch64[] = {
+  0x9100e048,                             //add           x8, x2, #0x38
+  0x4d40c910,                             //ld1r          {v16.4s}, [x8]
+  0x9100d048,                             //add           x8, x2, #0x34
+  0x2d47cc52,                             //ldp           s18, s19, [x2, #60]
+  0x4d40c911,                             //ld1r          {v17.4s}, [x8]
+  0x6e22dc54,                             //fmul          v20.4s, v2.4s, v2.4s
+  0x4eb01e15,                             //mov           v21.16b, v16.16b
+  0x4eb01e17,                             //mov           v23.16b, v16.16b
+  0x4f921050,                             //fmla          v16.4s, v2.4s, v18.s[0]
+  0x4eb11e36,                             //mov           v22.16b, v17.16b
+  0x4eb11e38,                             //mov           v24.16b, v17.16b
+  0x4e34ce11,                             //fmla          v17.4s, v16.4s, v20.4s
+  0x6e20dc10,                             //fmul          v16.4s, v0.4s, v0.4s
+  0x91011048,                             //add           x8, x2, #0x44
+  0x4f921015,                             //fmla          v21.4s, v0.4s, v18.s[0]
+  0x4e30ceb6,                             //fmla          v22.4s, v21.4s, v16.4s
+  0x4d40c910,                             //ld1r          {v16.4s}, [x8]
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x6e21dc34,                             //fmul          v20.4s, v1.4s, v1.4s
+  0x4f921037,                             //fmla          v23.4s, v1.4s, v18.s[0]
+  0x4f939015,                             //fmul          v21.4s, v0.4s, v19.s[0]
+  0x4f939032,                             //fmul          v18.4s, v1.4s, v19.s[0]
+  0x4f939053,                             //fmul          v19.4s, v2.4s, v19.s[0]
+  0x6ea0e600,                             //fcmgt         v0.4s, v16.4s, v0.4s
+  0x6ea1e601,                             //fcmgt         v1.4s, v16.4s, v1.4s
+  0x6ea2e602,                             //fcmgt         v2.4s, v16.4s, v2.4s
+  0x4e34cef8,                             //fmla          v24.4s, v23.4s, v20.4s
+  0x6e761ea0,                             //bsl           v0.16b, v21.16b, v22.16b
+  0x6e781e41,                             //bsl           v1.16b, v18.16b, v24.16b
+  0x6e711e62,                             //bsl           v2.16b, v19.16b, v17.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_to_srgb_aarch64[] = {
+  0x6ea1d811,                             //frsqrte       v17.4s, v0.4s
+  0x6ea1d835,                             //frsqrte       v21.4s, v1.4s
+  0x6e31de37,                             //fmul          v23.4s, v17.4s, v17.4s
+  0x6ea1d856,                             //frsqrte       v22.4s, v2.4s
+  0x6e35deb9,                             //fmul          v25.4s, v21.4s, v21.4s
+  0x4eb7fc17,                             //frsqrts       v23.4s, v0.4s, v23.4s
+  0x91015048,                             //add           x8, x2, #0x54
+  0x6e36deda,                             //fmul          v26.4s, v22.4s, v22.4s
+  0x4eb9fc39,                             //frsqrts       v25.4s, v1.4s, v25.4s
+  0x6e37de31,                             //fmul          v17.4s, v17.4s, v23.4s
+  0x4d40c914,                             //ld1r          {v20.4s}, [x8]
+  0x4ebafc5a,                             //frsqrts       v26.4s, v2.4s, v26.4s
+  0x6e39deb5,                             //fmul          v21.4s, v21.4s, v25.4s
+  0x4ea1da37,                             //frecpe        v23.4s, v17.4s
+  0xbd405053,                             //ldr           s19, [x2, #80]
+  0x91016048,                             //add           x8, x2, #0x58
+  0x6e3aded6,                             //fmul          v22.4s, v22.4s, v26.4s
+  0x4ea1dabb,                             //frecpe        v27.4s, v21.4s
+  0x4e37fe3d,                             //frecps        v29.4s, v17.4s, v23.4s
+  0x2d494052,                             //ldp           s18, s16, [x2, #72]
+  0x4d40c918,                             //ld1r          {v24.4s}, [x8]
+  0x4ea1dadc,                             //frecpe        v28.4s, v22.4s
+  0x6e3ddef7,                             //fmul          v23.4s, v23.4s, v29.4s
+  0x4e3bfebd,                             //frecps        v29.4s, v21.4s, v27.4s
+  0x6e3ddf7b,                             //fmul          v27.4s, v27.4s, v29.4s
+  0x4e3cfedd,                             //frecps        v29.4s, v22.4s, v28.4s
+  0x6e3ddf9c,                             //fmul          v28.4s, v28.4s, v29.4s
+  0x4eb41e9d,                             //mov           v29.16b, v20.16b
+  0x6ea1da39,                             //frsqrte       v25.4s, v17.4s
+  0x4f9312fd,                             //fmla          v29.4s, v23.4s, v19.s[0]
+  0x4eb41e97,                             //mov           v23.16b, v20.16b
+  0x4f92901a,                             //fmul          v26.4s, v0.4s, v18.s[0]
+  0x4f931377,                             //fmla          v23.4s, v27.4s, v19.s[0]
+  0x4f931394,                             //fmla          v20.4s, v28.4s, v19.s[0]
+  0x4f929033,                             //fmul          v19.4s, v1.4s, v18.s[0]
+  0x4f929052,                             //fmul          v18.4s, v2.4s, v18.s[0]
+  0x6ea0e700,                             //fcmgt         v0.4s, v24.4s, v0.4s
+  0x6ea1e701,                             //fcmgt         v1.4s, v24.4s, v1.4s
+  0x6ea2e702,                             //fcmgt         v2.4s, v24.4s, v2.4s
+  0x6e39df38,                             //fmul          v24.4s, v25.4s, v25.4s
+  0x6ea1dabb,                             //frsqrte       v27.4s, v21.4s
+  0x4eb8fe31,                             //frsqrts       v17.4s, v17.4s, v24.4s
+  0x6ea1dadc,                             //frsqrte       v28.4s, v22.4s
+  0x6e3bdf78,                             //fmul          v24.4s, v27.4s, v27.4s
+  0x6e31df31,                             //fmul          v17.4s, v25.4s, v17.4s
+  0x4eb8feb5,                             //frsqrts       v21.4s, v21.4s, v24.4s
+  0x6e3cdf98,                             //fmul          v24.4s, v28.4s, v28.4s
+  0x4f90123d,                             //fmla          v29.4s, v17.4s, v16.s[0]
+  0x4d40c851,                             //ld1r          {v17.4s}, [x2]
+  0x4eb8fed6,                             //frsqrts       v22.4s, v22.4s, v24.4s
+  0x6e35df75,                             //fmul          v21.4s, v27.4s, v21.4s
+  0x6e36df96,                             //fmul          v22.4s, v28.4s, v22.4s
+  0xf8408423,                             //ldr           x3, [x1], #8
+  0x4f9012b7,                             //fmla          v23.4s, v21.4s, v16.s[0]
+  0x4f9012d4,                             //fmla          v20.4s, v22.4s, v16.s[0]
+  0x4ebdf630,                             //fmin          v16.4s, v17.4s, v29.4s
+  0x4eb7f635,                             //fmin          v21.4s, v17.4s, v23.4s
+  0x4eb4f631,                             //fmin          v17.4s, v17.4s, v20.4s
+  0x6e701f40,                             //bsl           v0.16b, v26.16b, v16.16b
+  0x6e751e61,                             //bsl           v1.16b, v19.16b, v21.16b
+  0x6e711e42,                             //bsl           v2.16b, v18.16b, v17.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_scale_1_float_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xbd400110,                             //ldr           s16, [x8]
+  0x4f909000,                             //fmul          v0.4s, v0.4s, v16.s[0]
+  0x4f909021,                             //fmul          v1.4s, v1.4s, v16.s[0]
+  0x4f909042,                             //fmul          v2.4s, v2.4s, v16.s[0]
+  0x4f909063,                             //fmul          v3.4s, v3.4s, v16.s[0]
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_scale_u8_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xbd400c51,                             //ldr           s17, [x2, #12]
+  0xf9400108,                             //ldr           x8, [x8]
+  0x8b000108,                             //add           x8, x8, x0
+  0x39400109,                             //ldrb          w9, [x8]
+  0x3940050a,                             //ldrb          w10, [x8, #1]
+  0x3940090b,                             //ldrb          w11, [x8, #2]
+  0x39400d08,                             //ldrb          w8, [x8, #3]
+  0x4e021d30,                             //mov           v16.h[0], w9
+  0x4e061d50,                             //mov           v16.h[1], w10
+  0x4e0a1d70,                             //mov           v16.h[2], w11
+  0x4e0e1d10,                             //mov           v16.h[3], w8
+  0x2f07b7f0,                             //bic           v16.4h, #0xff, lsl #8
+  0x2f10a610,                             //uxtl          v16.4s, v16.4h
+  0x6e21da10,                             //ucvtf         v16.4s, v16.4s
+  0x4f919210,                             //fmul          v16.4s, v16.4s, v17.s[0]
+  0x6e20de00,                             //fmul          v0.4s, v16.4s, v0.4s
+  0x6e21de01,                             //fmul          v1.4s, v16.4s, v1.4s
+  0x6e22de02,                             //fmul          v2.4s, v16.4s, v2.4s
+  0x6e23de03,                             //fmul          v3.4s, v16.4s, v3.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_lerp_1_float_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0x4ea4d411,                             //fsub          v17.4s, v0.4s, v4.4s
+  0x4ea41c80,                             //mov           v0.16b, v4.16b
+  0x4ea5d432,                             //fsub          v18.4s, v1.4s, v5.4s
+  0xbd400110,                             //ldr           s16, [x8]
+  0x4ea51ca1,                             //mov           v1.16b, v5.16b
+  0x4f901220,                             //fmla          v0.4s, v17.4s, v16.s[0]
+  0x4ea6d451,                             //fsub          v17.4s, v2.4s, v6.4s
+  0x4f901241,                             //fmla          v1.4s, v18.4s, v16.s[0]
+  0x4ea61cc2,                             //mov           v2.16b, v6.16b
+  0x4ea7d472,                             //fsub          v18.4s, v3.4s, v7.4s
+  0x4ea71ce3,                             //mov           v3.16b, v7.16b
+  0x4f901222,                             //fmla          v2.4s, v17.4s, v16.s[0]
+  0x4f901243,                             //fmla          v3.4s, v18.4s, v16.s[0]
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_lerp_u8_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xbd400c51,                             //ldr           s17, [x2, #12]
+  0x4ea4d412,                             //fsub          v18.4s, v0.4s, v4.4s
+  0xf9400108,                             //ldr           x8, [x8]
+  0x8b000108,                             //add           x8, x8, x0
+  0x39400109,                             //ldrb          w9, [x8]
+  0x3940050a,                             //ldrb          w10, [x8, #1]
+  0x3940090b,                             //ldrb          w11, [x8, #2]
+  0x39400d08,                             //ldrb          w8, [x8, #3]
+  0x4e021d30,                             //mov           v16.h[0], w9
+  0x4e061d50,                             //mov           v16.h[1], w10
+  0x4e0a1d70,                             //mov           v16.h[2], w11
+  0x4e0e1d10,                             //mov           v16.h[3], w8
+  0x2f07b7f0,                             //bic           v16.4h, #0xff, lsl #8
+  0x2f10a600,                             //uxtl          v0.4s, v16.4h
+  0x6e21d800,                             //ucvtf         v0.4s, v0.4s
+  0x4f919010,                             //fmul          v16.4s, v0.4s, v17.s[0]
+  0x4ea41c80,                             //mov           v0.16b, v4.16b
+  0x4ea5d431,                             //fsub          v17.4s, v1.4s, v5.4s
+  0x4ea51ca1,                             //mov           v1.16b, v5.16b
+  0x4e32ce00,                             //fmla          v0.4s, v16.4s, v18.4s
+  0x4ea6d452,                             //fsub          v18.4s, v2.4s, v6.4s
+  0x4e31ce01,                             //fmla          v1.4s, v16.4s, v17.4s
+  0x4ea61cc2,                             //mov           v2.16b, v6.16b
+  0x4ea7d471,                             //fsub          v17.4s, v3.4s, v7.4s
+  0x4ea71ce3,                             //mov           v3.16b, v7.16b
+  0x4e32ce02,                             //fmla          v2.4s, v16.4s, v18.4s
+  0x4e31ce03,                             //fmla          v3.4s, v16.4s, v17.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_lerp_565_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xd37ff809,                             //lsl           x9, x0, #1
+  0x2d4ec851,                             //ldp           s17, s18, [x2, #116]
+  0x4ea4d413,                             //fsub          v19.4s, v0.4s, v4.4s
+  0xf9400108,                             //ldr           x8, [x8]
+  0x4ea41c80,                             //mov           v0.16b, v4.16b
+  0xfc696903,                             //ldr           d3, [x8, x9]
+  0x9101a048,                             //add           x8, x2, #0x68
+  0x4d40c910,                             //ld1r          {v16.4s}, [x8]
+  0x9101b048,                             //add           x8, x2, #0x6c
+  0x2f10a463,                             //uxtl          v3.4s, v3.4h
+  0x4e231e10,                             //and           v16.16b, v16.16b, v3.16b
+  0x4e21da10,                             //scvtf         v16.4s, v16.4s
+  0x4f919210,                             //fmul          v16.4s, v16.4s, v17.s[0]
+  0x4d40c911,                             //ld1r          {v17.4s}, [x8]
+  0x9101c048,                             //add           x8, x2, #0x70
+  0x4e33ce00,                             //fmla          v0.4s, v16.4s, v19.4s
+  0x4ea5d430,                             //fsub          v16.4s, v1.4s, v5.4s
+  0x4e231e31,                             //and           v17.16b, v17.16b, v3.16b
+  0x4e21da31,                             //scvtf         v17.4s, v17.4s
+  0x4f929231,                             //fmul          v17.4s, v17.4s, v18.s[0]
+  0x4d40c912,                             //ld1r          {v18.4s}, [x8]
+  0x4ea51ca1,                             //mov           v1.16b, v5.16b
+  0x4e30ce21,                             //fmla          v1.4s, v17.4s, v16.4s
+  0xbd407c50,                             //ldr           s16, [x2, #124]
+  0x4e231e52,                             //and           v18.16b, v18.16b, v3.16b
+  0x4d40c843,                             //ld1r          {v3.4s}, [x2]
+  0x4e21da52,                             //scvtf         v18.4s, v18.4s
+  0x4ea6d451,                             //fsub          v17.4s, v2.4s, v6.4s
+  0x4ea61cc2,                             //mov           v2.16b, v6.16b
+  0x4f909250,                             //fmul          v16.4s, v18.4s, v16.s[0]
+  0x4e31ce02,                             //fmla          v2.4s, v16.4s, v17.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_load_tables_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0x9100404b,                             //add           x11, x2, #0x10
+  0x4d40c960,                             //ld1r          {v0.4s}, [x11]
+  0xd37ef409,                             //lsl           x9, x0, #2
+  0xa9402d0a,                             //ldp           x10, x11, [x8]
+  0x3ce96942,                             //ldr           q2, [x10, x9]
+  0xa9412109,                             //ldp           x9, x8, [x8, #16]
+  0x4e221c01,                             //and           v1.16b, v0.16b, v2.16b
+  0x0e143c2c,                             //mov           w12, v1.s[2]
+  0xbc6c5971,                             //ldr           s17, [x11, w12, uxtw #2]
+  0x1e26002c,                             //fmov          w12, s1
+  0x6f380443,                             //ushr          v3.4s, v2.4s, #8
+  0x6f300450,                             //ushr          v16.4s, v2.4s, #16
+  0x8b2c496c,                             //add           x12, x11, w12, uxtw #2
+  0x0e0c3c2a,                             //mov           w10, v1.s[1]
+  0x0e1c3c2d,                             //mov           w13, v1.s[3]
+  0x4e231c01,                             //and           v1.16b, v0.16b, v3.16b
+  0x4e301c03,                             //and           v3.16b, v0.16b, v16.16b
+  0x0d408180,                             //ld1           {v0.s}[0], [x12]
+  0x0e143c2c,                             //mov           w12, v1.s[2]
+  0xbc6c5932,                             //ldr           s18, [x9, w12, uxtw #2]
+  0x1e26002c,                             //fmov          w12, s1
+  0x8b2a496a,                             //add           x10, x11, w10, uxtw #2
+  0xbc6d5970,                             //ldr           s16, [x11, w13, uxtw #2]
+  0x0e0c3c2b,                             //mov           w11, v1.s[1]
+  0x0e1c3c2d,                             //mov           w13, v1.s[3]
+  0x8b2c492c,                             //add           x12, x9, w12, uxtw #2
+  0xbc6d5933,                             //ldr           s19, [x9, w13, uxtw #2]
+  0x0e0c3c6d,                             //mov           w13, v3.s[1]
+  0x8b2b4929,                             //add           x9, x9, w11, uxtw #2
+  0x0e143c6b,                             //mov           w11, v3.s[2]
+  0x0d408181,                             //ld1           {v1.s}[0], [x12]
+  0x0e1c3c6c,                             //mov           w12, v3.s[3]
+  0x0d409140,                             //ld1           {v0.s}[1], [x10]
+  0x1e26006a,                             //fmov          w10, s3
+  0xbd400c43,                             //ldr           s3, [x2, #12]
+  0x6f280442,                             //ushr          v2.4s, v2.4s, #24
+  0x4e21d842,                             //scvtf         v2.4s, v2.4s
+  0x8b2a490a,                             //add           x10, x8, w10, uxtw #2
+  0x4f839043,                             //fmul          v3.4s, v2.4s, v3.s[0]
+  0x0d408142,                             //ld1           {v2.s}[0], [x10]
+  0x8b2d490a,                             //add           x10, x8, w13, uxtw #2
+  0x6e140620,                             //mov           v0.s[2], v17.s[0]
+  0xbc6b5911,                             //ldr           s17, [x8, w11, uxtw #2]
+  0x0d409121,                             //ld1           {v1.s}[1], [x9]
+  0x0d409142,                             //ld1           {v2.s}[1], [x10]
+  0x6e1c0600,                             //mov           v0.s[3], v16.s[0]
+  0xbc6c5910,                             //ldr           s16, [x8, w12, uxtw #2]
+  0x6e140641,                             //mov           v1.s[2], v18.s[0]
+  0x6e140622,                             //mov           v2.s[2], v17.s[0]
+  0x6e1c0661,                             //mov           v1.s[3], v19.s[0]
+  0x6e1c0602,                             //mov           v2.s[3], v16.s[0]
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_load_a8_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xbd400c43,                             //ldr           s3, [x2, #12]
+  0x6f00e400,                             //movi          v0.2d, #0x0
+  0x6f00e401,                             //movi          v1.2d, #0x0
+  0xf9400108,                             //ldr           x8, [x8]
+  0x8b000108,                             //add           x8, x8, x0
+  0x39400109,                             //ldrb          w9, [x8]
+  0x3940050a,                             //ldrb          w10, [x8, #1]
+  0x3940090b,                             //ldrb          w11, [x8, #2]
+  0x39400d08,                             //ldrb          w8, [x8, #3]
+  0x4e021d22,                             //mov           v2.h[0], w9
+  0x4e061d42,                             //mov           v2.h[1], w10
+  0x4e0a1d62,                             //mov           v2.h[2], w11
+  0x4e0e1d02,                             //mov           v2.h[3], w8
+  0x2f07b7e2,                             //bic           v2.4h, #0xff, lsl #8
+  0x2f10a442,                             //uxtl          v2.4s, v2.4h
+  0x6e21d842,                             //ucvtf         v2.4s, v2.4s
+  0x4f839043,                             //fmul          v3.4s, v2.4s, v3.s[0]
+  0x6f00e402,                             //movi          v2.2d, #0x0
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_store_a8_aarch64[] = {
+  0xf9400028,                             //ldr           x8, [x1]
+  0xbd400850,                             //ldr           s16, [x2, #8]
+  0xf9400108,                             //ldr           x8, [x8]
+  0x4f909070,                             //fmul          v16.4s, v3.4s, v16.s[0]
+  0x6e21aa10,                             //fcvtnu        v16.4s, v16.4s
+  0x0e612a10,                             //xtn           v16.4h, v16.4s
+  0x0e0e3e09,                             //umov          w9, v16.h[3]
+  0x8b000108,                             //add           x8, x8, x0
+  0x39000d09,                             //strb          w9, [x8, #3]
+  0x0e0a3e09,                             //umov          w9, v16.h[2]
+  0x39000909,                             //strb          w9, [x8, #2]
+  0x0e063e09,                             //umov          w9, v16.h[1]
+  0x39000509,                             //strb          w9, [x8, #1]
+  0x0e023e09,                             //umov          w9, v16.h[0]
+  0x39000109,                             //strb          w9, [x8]
+  0xf9400423,                             //ldr           x3, [x1, #8]
+  0x91004021,                             //add           x1, x1, #0x10
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_load_565_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xd37ff809,                             //lsl           x9, x0, #1
+  0xf9400108,                             //ldr           x8, [x8]
+  0xfc696900,                             //ldr           d0, [x8, x9]
+  0x9101a048,                             //add           x8, x2, #0x68
+  0x4d40c901,                             //ld1r          {v1.4s}, [x8]
+  0x9101b048,                             //add           x8, x2, #0x6c
+  0x4d40c902,                             //ld1r          {v2.4s}, [x8]
+  0x9101c048,                             //add           x8, x2, #0x70
+  0x4d40c903,                             //ld1r          {v3.4s}, [x8]
+  0x2f10a400,                             //uxtl          v0.4s, v0.4h
+  0x4e201c21,                             //and           v1.16b, v1.16b, v0.16b
+  0x4e201c42,                             //and           v2.16b, v2.16b, v0.16b
+  0x4e201c71,                             //and           v17.16b, v3.16b, v0.16b
+  0x2d4e8c50,                             //ldp           s16, s3, [x2, #116]
+  0x4e21d820,                             //scvtf         v0.4s, v1.4s
+  0x4e21d841,                             //scvtf         v1.4s, v2.4s
+  0x4e21da22,                             //scvtf         v2.4s, v17.4s
+  0x4f909000,                             //fmul          v0.4s, v0.4s, v16.s[0]
+  0xbd407c50,                             //ldr           s16, [x2, #124]
+  0x4f839021,                             //fmul          v1.4s, v1.4s, v3.s[0]
+  0x4d40c843,                             //ld1r          {v3.4s}, [x2]
+  0x4f909042,                             //fmul          v2.4s, v2.4s, v16.s[0]
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_store_565_aarch64[] = {
+  0x2d504450,                             //ldp           s16, s17, [x2, #128]
+  0xf9400028,                             //ldr           x8, [x1]
+  0xd37ff809,                             //lsl           x9, x0, #1
+  0x4f909012,                             //fmul          v18.4s, v0.4s, v16.s[0]
+  0x4f919031,                             //fmul          v17.4s, v1.4s, v17.s[0]
+  0x6e21aa52,                             //fcvtnu        v18.4s, v18.4s
+  0x6e21aa31,                             //fcvtnu        v17.4s, v17.4s
+  0xf9400108,                             //ldr           x8, [x8]
+  0x4f909050,                             //fmul          v16.4s, v2.4s, v16.s[0]
+  0x4f2b5652,                             //shl           v18.4s, v18.4s, #11
+  0x4f255631,                             //shl           v17.4s, v17.4s, #5
+  0x4eb21e31,                             //orr           v17.16b, v17.16b, v18.16b
+  0x6e21aa10,                             //fcvtnu        v16.4s, v16.4s
+  0x4eb01e30,                             //orr           v16.16b, v17.16b, v16.16b
+  0x0e612a10,                             //xtn           v16.4h, v16.4s
+  0xfc296910,                             //str           d16, [x8, x9]
+  0xf9400423,                             //ldr           x3, [x1, #8]
+  0x91004021,                             //add           x1, x1, #0x10
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_load_8888_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xd37ef409,                             //lsl           x9, x0, #2
+  0xbd400c42,                             //ldr           s2, [x2, #12]
+  0xf9400108,                             //ldr           x8, [x8]
+  0x3ce96900,                             //ldr           q0, [x8, x9]
+  0x91004048,                             //add           x8, x2, #0x10
+  0x4d40c901,                             //ld1r          {v1.4s}, [x8]
+  0x6f380410,                             //ushr          v16.4s, v0.4s, #8
+  0x6f300411,                             //ushr          v17.4s, v0.4s, #16
+  0x4e201c23,                             //and           v3.16b, v1.16b, v0.16b
+  0x6f280400,                             //ushr          v0.4s, v0.4s, #24
+  0x4e301c30,                             //and           v16.16b, v1.16b, v16.16b
+  0x4e311c21,                             //and           v1.16b, v1.16b, v17.16b
+  0x4e21d863,                             //scvtf         v3.4s, v3.4s
+  0x4e21d811,                             //scvtf         v17.4s, v0.4s
+  0x4e21da10,                             //scvtf         v16.4s, v16.4s
+  0x4e21d832,                             //scvtf         v18.4s, v1.4s
+  0x4f829060,                             //fmul          v0.4s, v3.4s, v2.s[0]
+  0x4f829223,                             //fmul          v3.4s, v17.4s, v2.s[0]
+  0x4f829201,                             //fmul          v1.4s, v16.4s, v2.s[0]
+  0x4f829242,                             //fmul          v2.4s, v18.4s, v2.s[0]
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_store_8888_aarch64[] = {
+  0xbd400850,                             //ldr           s16, [x2, #8]
+  0xf9400028,                             //ldr           x8, [x1]
+  0xd37ef409,                             //lsl           x9, x0, #2
+  0x4f909032,                             //fmul          v18.4s, v1.4s, v16.s[0]
+  0x4f909011,                             //fmul          v17.4s, v0.4s, v16.s[0]
+  0x6e21aa52,                             //fcvtnu        v18.4s, v18.4s
+  0x6e21aa31,                             //fcvtnu        v17.4s, v17.4s
+  0x4f285652,                             //shl           v18.4s, v18.4s, #8
+  0x4eb11e51,                             //orr           v17.16b, v18.16b, v17.16b
+  0x4f909052,                             //fmul          v18.4s, v2.4s, v16.s[0]
+  0xf9400108,                             //ldr           x8, [x8]
+  0x4f909070,                             //fmul          v16.4s, v3.4s, v16.s[0]
+  0x6e21aa52,                             //fcvtnu        v18.4s, v18.4s
+  0x6e21aa10,                             //fcvtnu        v16.4s, v16.4s
+  0x4f305652,                             //shl           v18.4s, v18.4s, #16
+  0x4eb21e31,                             //orr           v17.16b, v17.16b, v18.16b
+  0x4f385610,                             //shl           v16.4s, v16.4s, #24
+  0x4eb01e30,                             //orr           v16.16b, v17.16b, v16.16b
+  0x3ca96910,                             //str           q16, [x8, x9]
+  0xf9400423,                             //ldr           x3, [x1, #8]
+  0x91004021,                             //add           x1, x1, #0x10
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_load_f16_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xf9400108,                             //ldr           x8, [x8]
+  0x8b000d08,                             //add           x8, x8, x0, lsl #3
+  0x0c400510,                             //ld4           {v16.4h-v19.4h}, [x8]
+  0x0e217a00,                             //fcvtl         v0.4s, v16.4h
+  0x0e217a21,                             //fcvtl         v1.4s, v17.4h
+  0x0e217a42,                             //fcvtl         v2.4s, v18.4h
+  0x0e217a63,                             //fcvtl         v3.4s, v19.4h
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_store_f16_aarch64[] = {
+  0xf9400028,                             //ldr           x8, [x1]
+  0x0e216810,                             //fcvtn         v16.4h, v0.4s
+  0x0e216831,                             //fcvtn         v17.4h, v1.4s
+  0x0e216852,                             //fcvtn         v18.4h, v2.4s
+  0xf9400108,                             //ldr           x8, [x8]
+  0x0e216873,                             //fcvtn         v19.4h, v3.4s
+  0x8b000d08,                             //add           x8, x8, x0, lsl #3
+  0x0c000510,                             //st4           {v16.4h-v19.4h}, [x8]
+  0xf9400423,                             //ldr           x3, [x1, #8]
+  0x91004021,                             //add           x1, x1, #0x10
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_store_f32_aarch64[] = {
+  0xf9400028,                             //ldr           x8, [x1]
+  0xf9400108,                             //ldr           x8, [x8]
+  0x8b001108,                             //add           x8, x8, x0, lsl #4
+  0x4c000900,                             //st4           {v0.4s-v3.4s}, [x8]
+  0xf9400423,                             //ldr           x3, [x1, #8]
+  0x91004021,                             //add           x1, x1, #0x10
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_clamp_x_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0x6f00e411,                             //movi          v17.2d, #0x0
+  0x4e20f620,                             //fmax          v0.4s, v17.4s, v0.4s
+  0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff
+  0x4d40c910,                             //ld1r          {v16.4s}, [x8]
+  0x4eb18610,                             //add           v16.4s, v16.4s, v17.4s
+  0x4eb0f400,                             //fmin          v0.4s, v0.4s, v16.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_clamp_y_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0x6f00e411,                             //movi          v17.2d, #0x0
+  0x4e21f621,                             //fmax          v1.4s, v17.4s, v1.4s
+  0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff
+  0x4d40c910,                             //ld1r          {v16.4s}, [x8]
+  0x4eb18610,                             //add           v16.4s, v16.4s, v17.4s
+  0x4eb0f421,                             //fmin          v1.4s, v1.4s, v16.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_repeat_x_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff
+  0xbd400110,                             //ldr           s16, [x8]
+  0x4e040612,                             //dup           v18.4s, v16.s[0]
+  0x4eb18651,                             //add           v17.4s, v18.4s, v17.4s
+  0x6e32fc12,                             //fdiv          v18.4s, v0.4s, v18.4s
+  0x4e219a52,                             //frintm        v18.4s, v18.4s
+  0x4f905240,                             //fmls          v0.4s, v18.4s, v16.s[0]
+  0x4eb1f400,                             //fmin          v0.4s, v0.4s, v17.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_repeat_y_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0x6f07e7f1,                             //movi          v17.2d, #0xffffffffffffffff
+  0xbd400110,                             //ldr           s16, [x8]
+  0x4e040612,                             //dup           v18.4s, v16.s[0]
+  0x4eb18651,                             //add           v17.4s, v18.4s, v17.4s
+  0x6e32fc32,                             //fdiv          v18.4s, v1.4s, v18.4s
+  0x4e219a52,                             //frintm        v18.4s, v18.4s
+  0x4f905241,                             //fmls          v1.4s, v18.4s, v16.s[0]
+  0x4eb1f421,                             //fmin          v1.4s, v1.4s, v17.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_mirror_x_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xbd400110,                             //ldr           s16, [x8]
+  0x4e040611,                             //dup           v17.4s, v16.s[0]
+  0x1e302a10,                             //fadd          s16, s16, s16
+  0x4eb1d400,                             //fsub          v0.4s, v0.4s, v17.4s
+  0x4e040612,                             //dup           v18.4s, v16.s[0]
+  0x6e32fc12,                             //fdiv          v18.4s, v0.4s, v18.4s
+  0x4e219a52,                             //frintm        v18.4s, v18.4s
+  0x4f905240,                             //fmls          v0.4s, v18.4s, v16.s[0]
+  0x6f07e7f0,                             //movi          v16.2d, #0xffffffffffffffff
+  0x4eb1d400,                             //fsub          v0.4s, v0.4s, v17.4s
+  0x4eb08630,                             //add           v16.4s, v17.4s, v16.4s
+  0x4ea0f800,                             //fabs          v0.4s, v0.4s
+  0x4eb0f400,                             //fmin          v0.4s, v0.4s, v16.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_mirror_y_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xbd400110,                             //ldr           s16, [x8]
+  0x4e040611,                             //dup           v17.4s, v16.s[0]
+  0x1e302a10,                             //fadd          s16, s16, s16
+  0x4eb1d421,                             //fsub          v1.4s, v1.4s, v17.4s
+  0x4e040612,                             //dup           v18.4s, v16.s[0]
+  0x6e32fc32,                             //fdiv          v18.4s, v1.4s, v18.4s
+  0x4e219a52,                             //frintm        v18.4s, v18.4s
+  0x4f905241,                             //fmls          v1.4s, v18.4s, v16.s[0]
+  0x6f07e7f0,                             //movi          v16.2d, #0xffffffffffffffff
+  0x4eb1d421,                             //fsub          v1.4s, v1.4s, v17.4s
+  0x4eb08630,                             //add           v16.4s, v17.4s, v16.4s
+  0x4ea0f821,                             //fabs          v1.4s, v1.4s
+  0x4eb0f421,                             //fmin          v1.4s, v1.4s, v16.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_matrix_2x3_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xaa0803e9,                             //mov           x9, x8
+  0x9100410a,                             //add           x10, x8, #0x10
+  0x4ddfc932,                             //ld1r          {v18.4s}, [x9], #4
+  0x4d40c950,                             //ld1r          {v16.4s}, [x10]
+  0x2d415113,                             //ldp           s19, s20, [x8, #8]
+  0x9100510a,                             //add           x10, x8, #0x14
+  0x4d40c951,                             //ld1r          {v17.4s}, [x10]
+  0x4f931030,                             //fmla          v16.4s, v1.4s, v19.s[0]
+  0xbd400133,                             //ldr           s19, [x9]
+  0x4f941031,                             //fmla          v17.4s, v1.4s, v20.s[0]
+  0x4e20ce50,                             //fmla          v16.4s, v18.4s, v0.4s
+  0x4f931011,                             //fmla          v17.4s, v0.4s, v19.s[0]
+  0x4eb01e00,                             //mov           v0.16b, v16.16b
+  0x4eb11e21,                             //mov           v1.16b, v17.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_matrix_3x4_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xaa0803e9,                             //mov           x9, x8
+  0x9100910a,                             //add           x10, x8, #0x24
+  0x4ddfc933,                             //ld1r          {v19.4s}, [x9], #4
+  0x4d40c950,                             //ld1r          {v16.4s}, [x10]
+  0x9100a10a,                             //add           x10, x8, #0x28
+  0x4d40c951,                             //ld1r          {v17.4s}, [x10]
+  0x9100b10a,                             //add           x10, x8, #0x2c
+  0x2d435514,                             //ldp           s20, s21, [x8, #24]
+  0xbd402116,                             //ldr           s22, [x8, #32]
+  0x4d40c952,                             //ld1r          {v18.4s}, [x10]
+  0x4f941050,                             //fmla          v16.4s, v2.4s, v20.s[0]
+  0x4f951051,                             //fmla          v17.4s, v2.4s, v21.s[0]
+  0x4f961052,                             //fmla          v18.4s, v2.4s, v22.s[0]
+  0x2d425502,                             //ldp           s2, s21, [x8, #16]
+  0x2d415d14,                             //ldp           s20, s23, [x8, #8]
+  0x4f821031,                             //fmla          v17.4s, v1.4s, v2.s[0]
+  0xbd400122,                             //ldr           s2, [x9]
+  0x4f971030,                             //fmla          v16.4s, v1.4s, v23.s[0]
+  0x4f951032,                             //fmla          v18.4s, v1.4s, v21.s[0]
+  0x4e20ce70,                             //fmla          v16.4s, v19.4s, v0.4s
+  0x4f941012,                             //fmla          v18.4s, v0.4s, v20.s[0]
+  0x4f821011,                             //fmla          v17.4s, v0.4s, v2.s[0]
+  0x4eb01e00,                             //mov           v0.16b, v16.16b
+  0x4eb11e21,                             //mov           v1.16b, v17.16b
+  0x4eb21e42,                             //mov           v2.16b, v18.16b
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_matrix_perspective_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xaa0803e9,                             //mov           x9, x8
+  0x9100510a,                             //add           x10, x8, #0x14
+  0x4ddfc930,                             //ld1r          {v16.4s}, [x9], #4
+  0x4d40c951,                             //ld1r          {v17.4s}, [x10]
+  0x9100810a,                             //add           x10, x8, #0x20
+  0x4d40c952,                             //ld1r          {v18.4s}, [x10]
+  0x2d41d113,                             //ldp           s19, s20, [x8, #12]
+  0x2d435915,                             //ldp           s21, s22, [x8, #24]
+  0x91002108,                             //add           x8, x8, #0x8
+  0x4f941031,                             //fmla          v17.4s, v1.4s, v20.s[0]
+  0x4d40c914,                             //ld1r          {v20.4s}, [x8]
+  0x4f961032,                             //fmla          v18.4s, v1.4s, v22.s[0]
+  0xbd400136,                             //ldr           s22, [x9]
+  0x4f951012,                             //fmla          v18.4s, v0.4s, v21.s[0]
+  0x4f931011,                             //fmla          v17.4s, v0.4s, v19.s[0]
+  0x4f961034,                             //fmla          v20.4s, v1.4s, v22.s[0]
+  0x4ea1da41,                             //frecpe        v1.4s, v18.4s
+  0x4e21fe52,                             //frecps        v18.4s, v18.4s, v1.4s
+  0x6e32dc32,                             //fmul          v18.4s, v1.4s, v18.4s
+  0x4e20ce14,                             //fmla          v20.4s, v16.4s, v0.4s
+  0x6e32de21,                             //fmul          v1.4s, v17.4s, v18.4s
+  0x6e32de80,                             //fmul          v0.4s, v20.4s, v18.4s
+  0xd61f0060,                             //br            x3
+};
+
+CODE const uint32_t sk_linear_gradient_2stops_aarch64[] = {
+  0xa8c10c28,                             //ldp           x8, x3, [x1], #16
+  0xad404503,                             //ldp           q3, q17, [x8]
+  0x4e040470,                             //dup           v16.4s, v3.s[0]
+  0x4e0c0461,                             //dup           v1.4s, v3.s[1]
+  0x4e140462,                             //dup           v2.4s, v3.s[2]
+  0x4e1c0463,                             //dup           v3.4s, v3.s[3]
+  0x4f911010,                             //fmla          v16.4s, v0.4s, v17.s[0]
+  0x4fb11001,                             //fmla          v1.4s, v0.4s, v17.s[1]
+  0x4f911802,                             //fmla          v2.4s, v0.4s, v17.s[2]
+  0x4fb11803,                             //fmla          v3.4s, v0.4s, v17.s[3]
+  0x4eb01e00,                             //mov           v0.16b, v16.16b
+  0xd61f0060,                             //br            x3
+};
+#elif defined(__arm__)
+
+CODE const uint32_t sk_start_pipeline_vfp4[] = {
+  0xe92d41f0,                             //push          {r4, r5, r6, r7, r8, lr}
+  0xe1a07001,                             //mov           r7, r1
+  0xe1a04000,                             //mov           r4, r0
+  0xe1a05003,                             //mov           r5, r3
+  0xe1a08002,                             //mov           r8, r2
+  0xe4976004,                             //ldr           r6, [r7], #4
+  0xe2840002,                             //add           r0, r4, #2
+  0xea00000d,                             //b             58 <sk_start_pipeline_vfp4+0x58>
+  0xf2800010,                             //vmov.i32      d0, #0
+  0xe1a00004,                             //mov           r0, r4
+  0xf2801010,                             //vmov.i32      d1, #0
+  0xe1a01007,                             //mov           r1, r7
+  0xf2802010,                             //vmov.i32      d2, #0
+  0xe1a02008,                             //mov           r2, r8
+  0xf2803010,                             //vmov.i32      d3, #0
+  0xf2804010,                             //vmov.i32      d4, #0
+  0xf2805010,                             //vmov.i32      d5, #0
+  0xf2806010,                             //vmov.i32      d6, #0
+  0xf2807010,                             //vmov.i32      d7, #0
+  0xe12fff36,                             //blx           r6
+  0xe2840004,                             //add           r0, r4, #4
+  0xe2844002,                             //add           r4, r4, #2
+  0xe1500005,                             //cmp           r0, r5
+  0x9affffef,                             //bls           20 <sk_start_pipeline_vfp4+0x20>
+  0xe1a00004,                             //mov           r0, r4
+  0xe8bd81f0,                             //pop           {r4, r5, r6, r7, r8, pc}
+};
+
+CODE const uint32_t sk_just_return_vfp4[] = {
+  0xe12fff1e,                             //bx            lr
+};
+
+CODE const uint32_t sk_seed_shader_vfp4[] = {
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xee800b90,                             //vdup.32       d16, r0
+  0xf3fb0620,                             //vcvt.f32.s32  d16, d16
+  0xedd23b05,                             //vldr          d19, [r2, #20]
+  0xf2803010,                             //vmov.i32      d3, #0
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe2823004,                             //add           r3, r2, #4
+  0xf3fb1621,                             //vcvt.f32.s32  d17, d17
+  0xe2811008,                             //add           r1, r1, #8
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xf2804010,                             //vmov.i32      d4, #0
+  0xf2400da2,                             //vadd.f32      d16, d16, d18
+  0xf2805010,                             //vmov.i32      d5, #0
+  0xf4a22c9f,                             //vld1.32       {d2[]}, [r2 :32]
+  0xf2011da2,                             //vadd.f32      d1, d17, d18
+  0xf2806010,                             //vmov.i32      d6, #0
+  0xf2030da0,                             //vadd.f32      d0, d19, d16
+  0xf2807010,                             //vmov.i32      d7, #0
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_constant_color_vfp4[] = {
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xf4630a0f,                             //vld1.8        {d16-d17}, [r3]
+  0xf3b40c20,                             //vdup.32       d0, d16[0]
+  0xf3bc1c20,                             //vdup.32       d1, d16[1]
+  0xf3b42c21,                             //vdup.32       d2, d17[0]
+  0xf3bc3c21,                             //vdup.32       d3, d17[1]
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_clear_vfp4[] = {
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2800010,                             //vmov.i32      d0, #0
+  0xf2801010,                             //vmov.i32      d1, #0
+  0xf2802010,                             //vmov.i32      d2, #0
+  0xf2803010,                             //vmov.i32      d3, #0
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_plus__vfp4[] = {
+  0xf2000d04,                             //vadd.f32      d0, d0, d4
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2011d05,                             //vadd.f32      d1, d1, d5
+  0xf2022d06,                             //vadd.f32      d2, d2, d6
+  0xf2033d07,                             //vadd.f32      d3, d3, d7
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_srcover_vfp4[] = {
+  0xf4e20c9f,                             //vld1.32       {d16[]}, [r2 :32]
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2600d83,                             //vsub.f32      d16, d16, d3
+  0xf2040c30,                             //vfma.f32      d0, d4, d16
+  0xf2051c30,                             //vfma.f32      d1, d5, d16
+  0xf2062c30,                             //vfma.f32      d2, d6, d16
+  0xf2073c30,                             //vfma.f32      d3, d7, d16
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_dstover_vfp4[] = {
+  0xf4e20c9f,                             //vld1.32       {d16[]}, [r2 :32]
+  0xf2651115,                             //vorr          d17, d5, d5
+  0xf2604d87,                             //vsub.f32      d20, d16, d7
+  0xf2640114,                             //vorr          d16, d4, d4
+  0xf2662116,                             //vorr          d18, d6, d6
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2673117,                             //vorr          d19, d7, d7
+  0xf2400c34,                             //vfma.f32      d16, d0, d20
+  0xf2411c34,                             //vfma.f32      d17, d1, d20
+  0xf2422c34,                             //vfma.f32      d18, d2, d20
+  0xf2433c34,                             //vfma.f32      d19, d3, d20
+  0xf22001b0,                             //vorr          d0, d16, d16
+  0xf22111b1,                             //vorr          d1, d17, d17
+  0xf22221b2,                             //vorr          d2, d18, d18
+  0xf22331b3,                             //vorr          d3, d19, d19
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_clamp_0_vfp4[] = {
+  0xf2c00010,                             //vmov.i32      d16, #0
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2000f20,                             //vmax.f32      d0, d0, d16
+  0xf2011f20,                             //vmax.f32      d1, d1, d16
+  0xf2022f20,                             //vmax.f32      d2, d2, d16
+  0xf2033f20,                             //vmax.f32      d3, d3, d16
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_clamp_1_vfp4[] = {
+  0xf4e20c9f,                             //vld1.32       {d16[]}, [r2 :32]
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2200f20,                             //vmin.f32      d0, d0, d16
+  0xf2211f20,                             //vmin.f32      d1, d1, d16
+  0xf2222f20,                             //vmin.f32      d2, d2, d16
+  0xf2233f20,                             //vmin.f32      d3, d3, d16
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_clamp_a_vfp4[] = {
+  0xf4e20c9f,                             //vld1.32       {d16[]}, [r2 :32]
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2233f20,                             //vmin.f32      d3, d3, d16
+  0xf2200f03,                             //vmin.f32      d0, d0, d3
+  0xf2211f03,                             //vmin.f32      d1, d1, d3
+  0xf2222f03,                             //vmin.f32      d2, d2, d3
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_set_rgb_vfp4[] = {
+  0xe92d4800,                             //push          {fp, lr}
+  0xe591e000,                             //ldr           lr, [r1]
+  0xe591c004,                             //ldr           ip, [r1, #4]
+  0xe2811008,                             //add           r1, r1, #8
+  0xe28e3008,                             //add           r3, lr, #8
+  0xf4ae0c9f,                             //vld1.32       {d0[]}, [lr :32]
+  0xf4a32c9f,                             //vld1.32       {d2[]}, [r3 :32]
+  0xe28e3004,                             //add           r3, lr, #4
+  0xf4a31c9f,                             //vld1.32       {d1[]}, [r3 :32]
+  0xe8bd4800,                             //pop           {fp, lr}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_swap_rb_vfp4[] = {
+  0xeef00b40,                             //vmov.f64      d16, d0
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xeeb00b42,                             //vmov.f64      d0, d2
+  0xeeb02b60,                             //vmov.f64      d2, d16
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_swap_vfp4[] = {
+  0xeef00b43,                             //vmov.f64      d16, d3
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xeef01b42,                             //vmov.f64      d17, d2
+  0xeef02b41,                             //vmov.f64      d18, d1
+  0xeef03b40,                             //vmov.f64      d19, d0
+  0xeeb00b44,                             //vmov.f64      d0, d4
+  0xeeb01b45,                             //vmov.f64      d1, d5
+  0xeeb02b46,                             //vmov.f64      d2, d6
+  0xeeb03b47,                             //vmov.f64      d3, d7
+  0xeeb04b63,                             //vmov.f64      d4, d19
+  0xeeb05b62,                             //vmov.f64      d5, d18
+  0xeeb06b61,                             //vmov.f64      d6, d17
+  0xeeb07b60,                             //vmov.f64      d7, d16
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_move_src_dst_vfp4[] = {
+  0xeeb04b40,                             //vmov.f64      d4, d0
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xeeb05b41,                             //vmov.f64      d5, d1
+  0xeeb06b42,                             //vmov.f64      d6, d2
+  0xeeb07b43,                             //vmov.f64      d7, d3
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_move_dst_src_vfp4[] = {
+  0xeeb00b44,                             //vmov.f64      d0, d4
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xeeb01b45,                             //vmov.f64      d1, d5
+  0xeeb02b46,                             //vmov.f64      d2, d6
+  0xeeb03b47,                             //vmov.f64      d3, d7
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_premul_vfp4[] = {
+  0xf3000d13,                             //vmul.f32      d0, d0, d3
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf3011d13,                             //vmul.f32      d1, d1, d3
+  0xf3022d13,                             //vmul.f32      d2, d2, d3
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_unpremul_vfp4[] = {
+  0xed2d8b04,                             //vpush         {d8-d9}
+  0xed928a00,                             //vldr          s16, [r2]
+  0xf2c00010,                             //vmov.i32      d16, #0
+  0xf3f91503,                             //vceq.f32      d17, d3, #0
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xeec89a23,                             //vdiv.f32      s19, s16, s7
+  0xee889a03,                             //vdiv.f32      s18, s16, s6
+  0xf3501199,                             //vbsl          d17, d16, d9
+  0xf3010d90,                             //vmul.f32      d0, d17, d0
+  0xf3011d91,                             //vmul.f32      d1, d17, d1
+  0xf3012d92,                             //vmul.f32      d2, d17, d2
+  0xecbd8b04,                             //vpop          {d8-d9}
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_from_srgb_vfp4[] = {
+  0xed2d8b02,                             //vpush         {d8}
+  0xe282303c,                             //add           r3, r2, #60
+  0xed928a10,                             //vldr          s16, [r2, #64]
+  0xf3402d10,                             //vmul.f32      d18, d0, d0
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xe2823038,                             //add           r3, r2, #56
+  0xf3413d11,                             //vmul.f32      d19, d1, d1
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe2823044,                             //add           r3, r2, #68
+  0xf26141b1,                             //vorr          d20, d17, d17
+  0xf26171b1,                             //vorr          d23, d17, d17
+  0xf4e38c9f,                             //vld1.32       {d24[]}, [r3 :32]
+  0xf2404c30,                             //vfma.f32      d20, d0, d16
+  0xe2823034,                             //add           r3, r2, #52
+  0xf2417c30,                             //vfma.f32      d23, d1, d16
+  0xf2421c30,                             //vfma.f32      d17, d2, d16
+  0xf3425d12,                             //vmul.f32      d21, d2, d2
+  0xf2e16948,                             //vmul.f32      d22, d1, d8[0]
+  0xf2e00948,                             //vmul.f32      d16, d0, d8[0]
+  0xf2e29948,                             //vmul.f32      d25, d2, d8[0]
+  0xf3282e82,                             //vcgt.f32      d2, d24, d2
+  0xf3281e81,                             //vcgt.f32      d1, d24, d1
+  0xf3280e80,                             //vcgt.f32      d0, d24, d0
+  0xf4e38c9f,                             //vld1.32       {d24[]}, [r3 :32]
+  0xf268a1b8,                             //vorr          d26, d24, d24
+  0xf242acb4,                             //vfma.f32      d26, d18, d20
+  0xf26821b8,                             //vorr          d18, d24, d24
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf2432cb7,                             //vfma.f32      d18, d19, d23
+  0xf2458cb1,                             //vfma.f32      d24, d21, d17
+  0xf31001ba,                             //vbsl          d0, d16, d26
+  0xf31611b2,                             //vbsl          d1, d22, d18
+  0xf31921b8,                             //vbsl          d2, d25, d24
+  0xecbd8b02,                             //vpop          {d8}
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_to_srgb_vfp4[] = {
+  0xed2d8b02,                             //vpush         {d8}
+  0xf3fb0580,                             //vrsqrte.f32   d16, d0
+  0xe2823050,                             //add           r3, r2, #80
+  0xf3fb1581,                             //vrsqrte.f32   d17, d1
+  0xed928a12,                             //vldr          s16, [r2, #72]
+  0xf3fb2582,                             //vrsqrte.f32   d18, d2
+  0xf3403db0,                             //vmul.f32      d19, d16, d16
+  0xf3414db1,                             //vmul.f32      d20, d17, d17
+  0xf3425db2,                             //vmul.f32      d21, d18, d18
+  0xf2603f33,                             //vrsqrts.f32   d19, d0, d19
+  0xf2614f34,                             //vrsqrts.f32   d20, d1, d20
+  0xf2625f35,                             //vrsqrts.f32   d21, d2, d21
+  0xf3400db3,                             //vmul.f32      d16, d16, d19
+  0xf3411db4,                             //vmul.f32      d17, d17, d20
+  0xf3422db5,                             //vmul.f32      d18, d18, d21
+  0xf3fb3520,                             //vrecpe.f32    d19, d16
+  0xf3fb4521,                             //vrecpe.f32    d20, d17
+  0xf3fb6522,                             //vrecpe.f32    d22, d18
+  0xf3fb55a2,                             //vrsqrte.f32   d21, d18
+  0xf3fb75a0,                             //vrsqrte.f32   d23, d16
+  0xf3fb85a1,                             //vrsqrte.f32   d24, d17
+  0xf2409fb3,                             //vrecps.f32    d25, d16, d19
+  0xf241afb4,                             //vrecps.f32    d26, d17, d20
+  0xf242bfb6,                             //vrecps.f32    d27, d18, d22
+  0xf345cdb5,                             //vmul.f32      d28, d21, d21
+  0xf347ddb7,                             //vmul.f32      d29, d23, d23
+  0xf348edb8,                             //vmul.f32      d30, d24, d24
+  0xf2622fbc,                             //vrsqrts.f32   d18, d18, d28
+  0xf2600fbd,                             //vrsqrts.f32   d16, d16, d29
+  0xf2611fbe,                             //vrsqrts.f32   d17, d17, d30
+  0xf3433db9,                             //vmul.f32      d19, d19, d25
+  0xf4e39c9f,                             //vld1.32       {d25[]}, [r3 :32]
+  0xe2823054,                             //add           r3, r2, #84
+  0xf3444dba,                             //vmul.f32      d20, d20, d26
+  0xf3466dbb,                             //vmul.f32      d22, d22, d27
+  0xf4e3ac9f,                             //vld1.32       {d26[]}, [r3 :32]
+  0xe282304c,                             //add           r3, r2, #76
+  0xf26ab1ba,                             //vorr          d27, d26, d26
+  0xf249bcb3,                             //vfma.f32      d27, d25, d19
+  0xf26a31ba,                             //vorr          d19, d26, d26
+  0xf2493cb4,                             //vfma.f32      d19, d25, d20
+  0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
+  0xf249acb6,                             //vfma.f32      d26, d25, d22
+  0xe2823058,                             //add           r3, r2, #88
+  0xf3452db2,                             //vmul.f32      d18, d21, d18
+  0xf3470db0,                             //vmul.f32      d16, d23, d16
+  0xf3481db1,                             //vmul.f32      d17, d24, d17
+  0xf2e05948,                             //vmul.f32      d21, d0, d8[0]
+  0xf244bcb0,                             //vfma.f32      d27, d20, d16
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xf2443cb1,                             //vfma.f32      d19, d20, d17
+  0xf244acb2,                             //vfma.f32      d26, d20, d18
+  0xf4e24c9f,                             //vld1.32       {d20[]}, [r2 :32]
+  0xf2e11948,                             //vmul.f32      d17, d1, d8[0]
+  0xf2e22948,                             //vmul.f32      d18, d2, d8[0]
+  0xf3201e81,                             //vcgt.f32      d1, d16, d1
+  0xe4913004,                             //ldr           r3, [r1], #4
+  0xf3200e80,                             //vcgt.f32      d0, d16, d0
+  0xf3202e82,                             //vcgt.f32      d2, d16, d2
+  0xf2640fab,                             //vmin.f32      d16, d20, d27
+  0xf2643fa3,                             //vmin.f32      d19, d20, d19
+  0xf2644faa,                             //vmin.f32      d20, d20, d26
+  0xf31501b0,                             //vbsl          d0, d21, d16
+  0xf31111b3,                             //vbsl          d1, d17, d19
+  0xf31221b4,                             //vbsl          d2, d18, d20
+  0xecbd8b02,                             //vpop          {d8}
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_scale_1_float_vfp4[] = {
+  0xed2d8b02,                             //vpush         {d8}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xed938a00,                             //vldr          s16, [r3]
+  0xf2a00948,                             //vmul.f32      d0, d0, d8[0]
+  0xf2a11948,                             //vmul.f32      d1, d1, d8[0]
+  0xf2a22948,                             //vmul.f32      d2, d2, d8[0]
+  0xf2a33948,                             //vmul.f32      d3, d3, d8[0]
+  0xecbd8b02,                             //vpop          {d8}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_scale_u8_vfp4[] = {
+  0xed2d8b02,                             //vpush         {d8}
+  0xe24dd008,                             //sub           sp, sp, #8
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe0833000,                             //add           r3, r3, r0
+  0xe1d330b0,                             //ldrh          r3, [r3]
+  0xe1cd30b4,                             //strh          r3, [sp, #4]
+  0xe28d3004,                             //add           r3, sp, #4
+  0xed928a03,                             //vldr          s16, [r2, #12]
+  0xf4e3041f,                             //vld1.16       {d16[0]}, [r3 :16]
+  0xf3c80a30,                             //vmovl.u8      q8, d16
+  0xf3d00a30,                             //vmovl.u16     q8, d16
+  0xf3fb06a0,                             //vcvt.f32.u32  d16, d16
+  0xf2e009c8,                             //vmul.f32      d16, d16, d8[0]
+  0xf3000d90,                             //vmul.f32      d0, d16, d0
+  0xf3001d91,                             //vmul.f32      d1, d16, d1
+  0xf3002d92,                             //vmul.f32      d2, d16, d2
+  0xf3003d93,                             //vmul.f32      d3, d16, d3
+  0xe28dd008,                             //add           sp, sp, #8
+  0xecbd8b02,                             //vpop          {d8}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_lerp_1_float_vfp4[] = {
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2600d04,                             //vsub.f32      d16, d0, d4
+  0xf2611d05,                             //vsub.f32      d17, d1, d5
+  0xf2622d06,                             //vsub.f32      d18, d2, d6
+  0xe2811008,                             //add           r1, r1, #8
+  0xf2633d07,                             //vsub.f32      d19, d3, d7
+  0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
+  0xf2240114,                             //vorr          d0, d4, d4
+  0xf2251115,                             //vorr          d1, d5, d5
+  0xf2262116,                             //vorr          d2, d6, d6
+  0xf2273117,                             //vorr          d3, d7, d7
+  0xf2000cb4,                             //vfma.f32      d0, d16, d20
+  0xf2011cb4,                             //vfma.f32      d1, d17, d20
+  0xf2022cb4,                             //vfma.f32      d2, d18, d20
+  0xf2033cb4,                             //vfma.f32      d3, d19, d20
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_lerp_u8_vfp4[] = {
+  0xed2d8b02,                             //vpush         {d8}
+  0xe24dd008,                             //sub           sp, sp, #8
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2612d05,                             //vsub.f32      d18, d1, d5
+  0xf2623d06,                             //vsub.f32      d19, d2, d6
+  0xf2634d07,                             //vsub.f32      d20, d3, d7
+  0xe2811008,                             //add           r1, r1, #8
+  0xe5933000,                             //ldr           r3, [r3]
+  0xf2251115,                             //vorr          d1, d5, d5
+  0xf2262116,                             //vorr          d2, d6, d6
+  0xe0833000,                             //add           r3, r3, r0
+  0xf2273117,                             //vorr          d3, d7, d7
+  0xe1d330b0,                             //ldrh          r3, [r3]
+  0xe1cd30b4,                             //strh          r3, [sp, #4]
+  0xe28d3004,                             //add           r3, sp, #4
+  0xed928a03,                             //vldr          s16, [r2, #12]
+  0xf4e3041f,                             //vld1.16       {d16[0]}, [r3 :16]
+  0xf3c80a30,                             //vmovl.u8      q8, d16
+  0xf3d00a30,                             //vmovl.u16     q8, d16
+  0xf3fb06a0,                             //vcvt.f32.u32  d16, d16
+  0xf2601d04,                             //vsub.f32      d17, d0, d4
+  0xf2240114,                             //vorr          d0, d4, d4
+  0xf2e009c8,                             //vmul.f32      d16, d16, d8[0]
+  0xf2010cb0,                             //vfma.f32      d0, d17, d16
+  0xf2021cb0,                             //vfma.f32      d1, d18, d16
+  0xf2032cb0,                             //vfma.f32      d2, d19, d16
+  0xf2043cb0,                             //vfma.f32      d3, d20, d16
+  0xe28dd008,                             //add           sp, sp, #8
+  0xecbd8b02,                             //vpop          {d8}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_lerp_565_vfp4[] = {
+  0xed2d8b04,                             //vpush         {d8-d9}
+  0xe24dd008,                             //sub           sp, sp, #8
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2603d04,                             //vsub.f32      d19, d0, d4
+  0xf2240114,                             //vorr          d0, d4, d4
+  0xe2811008,                             //add           r1, r1, #8
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe7933080,                             //ldr           r3, [r3, r0, lsl #1]
+  0xe58d3004,                             //str           r3, [sp, #4]
+  0xe28d3004,                             //add           r3, sp, #4
+  0xed923a1d,                             //vldr          s6, [r2, #116]
+  0xf4e3083f,                             //vld1.32       {d16[0]}, [r3 :32]
+  0xe282306c,                             //add           r3, r2, #108
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe2823068,                             //add           r3, r2, #104
+  0xf3d04a30,                             //vmovl.u16     q10, d16
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xe2823070,                             //add           r3, r2, #112
+  0xf24201b4,                             //vand          d16, d18, d20
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xf24221b4,                             //vand          d18, d18, d20
+  0xf24111b4,                             //vand          d17, d17, d20
+  0xf3fb0620,                             //vcvt.f32.s32  d16, d16
+  0xed928a1e,                             //vldr          s16, [r2, #120]
+  0xf3fb1621,                             //vcvt.f32.s32  d17, d17
+  0xed929a1f,                             //vldr          s18, [r2, #124]
+  0xf3fb2622,                             //vcvt.f32.s32  d18, d18
+  0xf2614d05,                             //vsub.f32      d20, d1, d5
+  0xf2e009c3,                             //vmul.f32      d16, d16, d3[0]
+  0xf4a23c9f,                             //vld1.32       {d3[]}, [r2 :32]
+  0xf2625d06,                             //vsub.f32      d21, d2, d6
+  0xf2e119c8,                             //vmul.f32      d17, d17, d8[0]
+  0xf2e229c9,                             //vmul.f32      d18, d18, d9[0]
+  0xf2251115,                             //vorr          d1, d5, d5
+  0xf2262116,                             //vorr          d2, d6, d6
+  0xf2030cb0,                             //vfma.f32      d0, d19, d16
+  0xf2041cb1,                             //vfma.f32      d1, d20, d17
+  0xf2052cb2,                             //vfma.f32      d2, d21, d18
+  0xe28dd008,                             //add           sp, sp, #8
+  0xecbd8b04,                             //vpop          {d8-d9}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_load_tables_vfp4[] = {
+  0xe92d48f0,                             //push          {r4, r5, r6, r7, fp, lr}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2826010,                             //add           r6, r2, #16
+  0xe2811008,                             //add           r1, r1, #8
+  0xe593e000,                             //ldr           lr, [r3]
+  0xe99300b0,                             //ldmib         r3, {r4, r5, r7}
+  0xf4e60c9f,                             //vld1.32       {d16[]}, [r6 :32]
+  0xe08e6100,                             //add           r6, lr, r0, lsl #2
+  0xedd61b00,                             //vldr          d17, [r6]
+  0xf24021b1,                             //vand          d18, d16, d17
+  0xed922a03,                             //vldr          s4, [r2, #12]
+  0xf3f03031,                             //vshr.u32      d19, d17, #16
+  0xee326b90,                             //vmov.32       r6, d18[1]
+  0xe0846106,                             //add           r6, r4, r6, lsl #2
+  0xedd60a00,                             //vldr          s1, [r6]
+  0xee126b90,                             //vmov.32       r6, d18[0]
+  0xf3f82031,                             //vshr.u32      d18, d17, #8
+  0xf24021b2,                             //vand          d18, d16, d18
+  0xf24001b3,                             //vand          d16, d16, d19
+  0xee103b90,                             //vmov.32       r3, d16[0]
+  0xe0846106,                             //add           r6, r4, r6, lsl #2
+  0xee304b90,                             //vmov.32       r4, d16[1]
+  0xf3e80031,                             //vshr.u32      d16, d17, #24
+  0xed960a00,                             //vldr          s0, [r6]
+  0xee326b90,                             //vmov.32       r6, d18[1]
+  0xf3fb0620,                             //vcvt.f32.s32  d16, d16
+  0xe0873103,                             //add           r3, r7, r3, lsl #2
+  0xf2a039c2,                             //vmul.f32      d3, d16, d2[0]
+  0xe0874104,                             //add           r4, r7, r4, lsl #2
+  0xedd42a00,                             //vldr          s5, [r4]
+  0xe0856106,                             //add           r6, r5, r6, lsl #2
+  0xed932a00,                             //vldr          s4, [r3]
+  0xedd61a00,                             //vldr          s3, [r6]
+  0xee126b90,                             //vmov.32       r6, d18[0]
+  0xe0856106,                             //add           r6, r5, r6, lsl #2
+  0xed961a00,                             //vldr          s2, [r6]
+  0xe8bd48f0,                             //pop           {r4, r5, r6, r7, fp, lr}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_load_a8_vfp4[] = {
+  0xe24dd004,                             //sub           sp, sp, #4
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xf2801010,                             //vmov.i32      d1, #0
+  0xf2802010,                             //vmov.i32      d2, #0
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe0833000,                             //add           r3, r3, r0
+  0xe1d330b0,                             //ldrh          r3, [r3]
+  0xe1cd30b0,                             //strh          r3, [sp]
+  0xe1a0300d,                             //mov           r3, sp
+  0xf4e3041f,                             //vld1.16       {d16[0]}, [r3 :16]
+  0xed920a03,                             //vldr          s0, [r2, #12]
+  0xf3c80a30,                             //vmovl.u8      q8, d16
+  0xf3d00a30,                             //vmovl.u16     q8, d16
+  0xf3fb06a0,                             //vcvt.f32.u32  d16, d16
+  0xf2a039c0,                             //vmul.f32      d3, d16, d0[0]
+  0xf2800010,                             //vmov.i32      d0, #0
+  0xe28dd004,                             //add           sp, sp, #4
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_store_a8_vfp4[] = {
+  0xe92d4800,                             //push          {fp, lr}
+  0xe2823008,                             //add           r3, r2, #8
+  0xf2c3061f,                             //vmov.i32      d16, #1056964608
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe5913000,                             //ldr           r3, [r1]
+  0xf2430c31,                             //vfma.f32      d16, d3, d17
+  0xe5933000,                             //ldr           r3, [r3]
+  0xf3fb07a0,                             //vcvt.u32.f32  d16, d16
+  0xee10eb90,                             //vmov.32       lr, d16[0]
+  0xee30cb90,                             //vmov.32       ip, d16[1]
+  0xe7e3e000,                             //strb          lr, [r3, r0]!
+  0xe5c3c001,                             //strb          ip, [r3, #1]
+  0xe5913004,                             //ldr           r3, [r1, #4]
+  0xe2811008,                             //add           r1, r1, #8
+  0xe8bd4800,                             //pop           {fp, lr}
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_load_565_vfp4[] = {
+  0xe24dd004,                             //sub           sp, sp, #4
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe7933080,                             //ldr           r3, [r3, r0, lsl #1]
+  0xe58d3000,                             //str           r3, [sp]
+  0xe1a0300d,                             //mov           r3, sp
+  0xf4e3083f,                             //vld1.32       {d16[0]}, [r3 :32]
+  0xe282306c,                             //add           r3, r2, #108
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe2823068,                             //add           r3, r2, #104
+  0xf3d04a30,                             //vmovl.u16     q10, d16
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xe2823070,                             //add           r3, r2, #112
+  0xf24201b4,                             //vand          d16, d18, d20
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xf24111b4,                             //vand          d17, d17, d20
+  0xf24221b4,                             //vand          d18, d18, d20
+  0xf4a23c9f,                             //vld1.32       {d3[]}, [r2 :32]
+  0xf3fb0620,                             //vcvt.f32.s32  d16, d16
+  0xf3fb1621,                             //vcvt.f32.s32  d17, d17
+  0xf3fb2622,                             //vcvt.f32.s32  d18, d18
+  0xed920a1d,                             //vldr          s0, [r2, #116]
+  0xed921a1e,                             //vldr          s2, [r2, #120]
+  0xed922a1f,                             //vldr          s4, [r2, #124]
+  0xf2a009c0,                             //vmul.f32      d0, d16, d0[0]
+  0xf2a119c1,                             //vmul.f32      d1, d17, d1[0]
+  0xf2a229c2,                             //vmul.f32      d2, d18, d2[0]
+  0xe28dd004,                             //add           sp, sp, #4
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_store_565_vfp4[] = {
+  0xe2823080,                             //add           r3, r2, #128
+  0xf2c3361f,                             //vmov.i32      d19, #1056964608
+  0xf2c3461f,                             //vmov.i32      d20, #1056964608
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe2823084,                             //add           r3, r2, #132
+  0xf2403c31,                             //vfma.f32      d19, d0, d17
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xf2c3061f,                             //vmov.i32      d16, #1056964608
+  0xf2414c32,                             //vfma.f32      d20, d1, d18
+  0xf2420c31,                             //vfma.f32      d16, d2, d17
+  0xe5913000,                             //ldr           r3, [r1]
+  0xe5933000,                             //ldr           r3, [r3]
+  0xf3fb17a3,                             //vcvt.u32.f32  d17, d19
+  0xe0833080,                             //add           r3, r3, r0, lsl #1
+  0xf3fb27a4,                             //vcvt.u32.f32  d18, d20
+  0xf3fb07a0,                             //vcvt.u32.f32  d16, d16
+  0xf2eb1531,                             //vshl.s32      d17, d17, #11
+  0xf2e52532,                             //vshl.s32      d18, d18, #5
+  0xf26101b0,                             //vorr          d16, d17, d16
+  0xf26001b2,                             //vorr          d16, d16, d18
+  0xf3f60121,                             //vuzp.16       d16, d17
+  0xf4c3080f,                             //vst1.32       {d16[0]}, [r3]
+  0xe5913004,                             //ldr           r3, [r1, #4]
+  0xe2811008,                             //add           r1, r1, #8
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_load_8888_vfp4[] = {
+  0xe92d4800,                             //push          {fp, lr}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xed922a03,                             //vldr          s4, [r2, #12]
+  0xe593e000,                             //ldr           lr, [r3]
+  0xe2823010,                             //add           r3, r2, #16
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xe08e3100,                             //add           r3, lr, r0, lsl #2
+  0xedd31b00,                             //vldr          d17, [r3]
+  0xf24021b1,                             //vand          d18, d16, d17
+  0xf3f83031,                             //vshr.u32      d19, d17, #8
+  0xf3e84031,                             //vshr.u32      d20, d17, #24
+  0xf3f01031,                             //vshr.u32      d17, d17, #16
+  0xf24031b3,                             //vand          d19, d16, d19
+  0xf24001b1,                             //vand          d16, d16, d17
+  0xf3fb2622,                             //vcvt.f32.s32  d18, d18
+  0xf3fb4624,                             //vcvt.f32.s32  d20, d20
+  0xf3fb1623,                             //vcvt.f32.s32  d17, d19
+  0xf3fb0620,                             //vcvt.f32.s32  d16, d16
+  0xf2a209c2,                             //vmul.f32      d0, d18, d2[0]
+  0xf2a439c2,                             //vmul.f32      d3, d20, d2[0]
+  0xf2a119c2,                             //vmul.f32      d1, d17, d2[0]
+  0xf2a029c2,                             //vmul.f32      d2, d16, d2[0]
+  0xe8bd4800,                             //pop           {fp, lr}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_store_8888_vfp4[] = {
+  0xe2823008,                             //add           r3, r2, #8
+  0xf2c3261f,                             //vmov.i32      d18, #1056964608
+  0xf2c3361f,                             //vmov.i32      d19, #1056964608
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xf2c3061f,                             //vmov.i32      d16, #1056964608
+  0xf2412c31,                             //vfma.f32      d18, d1, d17
+  0xf2423c31,                             //vfma.f32      d19, d2, d17
+  0xf2c3461f,                             //vmov.i32      d20, #1056964608
+  0xe5913000,                             //ldr           r3, [r1]
+  0xf2400c31,                             //vfma.f32      d16, d0, d17
+  0xf2434c31,                             //vfma.f32      d20, d3, d17
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe0833100,                             //add           r3, r3, r0, lsl #2
+  0xf3fb17a2,                             //vcvt.u32.f32  d17, d18
+  0xf3fb27a3,                             //vcvt.u32.f32  d18, d19
+  0xf3fb07a0,                             //vcvt.u32.f32  d16, d16
+  0xf3fb37a4,                             //vcvt.u32.f32  d19, d20
+  0xf2e81531,                             //vshl.s32      d17, d17, #8
+  0xf2f02532,                             //vshl.s32      d18, d18, #16
+  0xf26101b0,                             //vorr          d16, d17, d16
+  0xf2f81533,                             //vshl.s32      d17, d19, #24
+  0xf26001b2,                             //vorr          d16, d16, d18
+  0xf26001b1,                             //vorr          d16, d16, d17
+  0xedc30b00,                             //vstr          d16, [r3]
+  0xe5913004,                             //ldr           r3, [r1, #4]
+  0xe2811008,                             //add           r1, r1, #8
+  0xe12fff13,                             //bx            r3
+};
+
+CODE const uint32_t sk_load_f16_vfp4[] = {
+  0xed2d8b04,                             //vpush         {d8-d9}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe0833180,                             //add           r3, r3, r0, lsl #3
+  0xf463084f,                             //vld2.16       {d16-d17}, [r3]
+  0xf3b62720,                             //vcvt.f32.f16  q1, d16
+  0xf3b68721,                             //vcvt.f32.f16  q4, d17
+  0xf2220112,                             //vorr          d0, d2, d2
+  0xeef00a43,                             //vmov.f32      s1, s6
+  0xf2281118,                             //vorr          d1, d8, d8
+  0xeeb03a62,                             //vmov.f32      s6, s5
+  0xeef01a49,                             //vmov.f32      s3, s18
+  0xeeb09a68,                             //vmov.f32      s18, s17
+  0xeeb02b43,                             //vmov.f64      d2, d3
+  0xeeb03b49,                             //vmov.f64      d3, d9
+  0xecbd8b04,                             //vpop          {d8-d9}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_store_f16_vfp4[] = {
+  0xeef00b41,                             //vmov.f64      d16, d1
+  0xeef03b42,                             //vmov.f64      d19, d2
+  0xf2631113,                             //vorr          d17, d3, d3
+  0xf2602110,                             //vorr          d18, d0, d0
+  0xf3fa00a1,                             //vtrn.32       d16, d17
+  0xf3f61620,                             //vcvt.f16.f32  d17, q8
+  0xf3fa20a3,                             //vtrn.32       d18, d19
+  0xe5913000,                             //ldr           r3, [r1]
+  0xf3f60622,                             //vcvt.f16.f32  d16, q9
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe0833180,                             //add           r3, r3, r0, lsl #3
+  0xf443084f,                             //vst2.16       {d16-d17}, [r3]
+  0xe2813008,                             //add           r3, r1, #8
+  0xe591c004,                             //ldr           ip, [r1, #4]
+  0xe1a01003,                             //mov           r1, r3
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_store_f32_vfp4[] = {
+  0xe5913000,                             //ldr           r3, [r1]
+  0xe5933000,                             //ldr           r3, [r3]
+  0xe0833200,                             //add           r3, r3, r0, lsl #4
+  0xf403008f,                             //vst4.32       {d0-d3}, [r3]
+  0xe2813008,                             //add           r3, r1, #8
+  0xe591c004,                             //ldr           ip, [r1, #4]
+  0xe1a01003,                             //mov           r1, r3
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_clamp_x_vfp4[] = {
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2c00010,                             //vmov.i32      d16, #0
+  0xf3c71e1f,                             //vmov.i8       d17, #255
+  0xf2400f80,                             //vmax.f32      d16, d16, d0
+  0xe2811008,                             //add           r1, r1, #8
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xf26218a1,                             //vadd.i32      d17, d18, d17
+  0xf2200fa1,                             //vmin.f32      d0, d16, d17
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_clamp_y_vfp4[] = {
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2c00010,                             //vmov.i32      d16, #0
+  0xf3c71e1f,                             //vmov.i8       d17, #255
+  0xf2400f81,                             //vmax.f32      d16, d16, d1
+  0xe2811008,                             //add           r1, r1, #8
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xf26218a1,                             //vadd.i32      d17, d18, d17
+  0xf2201fa1,                             //vmin.f32      d1, d16, d17
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_repeat_x_vfp4[] = {
+  0xed2d8b04,                             //vpush         {d8-d9}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2c02010,                             //vmov.i32      d18, #0
+  0xf4e23c9f,                             //vld1.32       {d19[]}, [r2 :32]
+  0xe2811008,                             //add           r1, r1, #8
+  0xed938a00,                             //vldr          s16, [r3]
+  0xeec09a88,                             //vdiv.f32      s19, s1, s16
+  0xee809a08,                             //vdiv.f32      s18, s0, s16
+  0xf3fb0709,                             //vcvt.s32.f32  d16, d9
+  0xf3fb0620,                             //vcvt.f32.s32  d16, d16
+  0xf3601e89,                             //vcgt.f32      d17, d16, d9
+  0xf35311b2,                             //vbsl          d17, d19, d18
+  0xf3f42c08,                             //vdup.32       d18, d8[0]
+  0xf2600da1,                             //vsub.f32      d16, d16, d17
+  0xf3c71e1f,                             //vmov.i8       d17, #255
+  0xf26218a1,                             //vadd.i32      d17, d18, d17
+  0xf2e009c8,                             //vmul.f32      d16, d16, d8[0]
+  0xf2600d20,                             //vsub.f32      d16, d0, d16
+  0xf2200fa1,                             //vmin.f32      d0, d16, d17
+  0xecbd8b04,                             //vpop          {d8-d9}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_repeat_y_vfp4[] = {
+  0xed2d8b04,                             //vpush         {d8-d9}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2c02010,                             //vmov.i32      d18, #0
+  0xf4e23c9f,                             //vld1.32       {d19[]}, [r2 :32]
+  0xe2811008,                             //add           r1, r1, #8
+  0xed938a00,                             //vldr          s16, [r3]
+  0xeec19a88,                             //vdiv.f32      s19, s3, s16
+  0xee819a08,                             //vdiv.f32      s18, s2, s16
+  0xf3fb0709,                             //vcvt.s32.f32  d16, d9
+  0xf3fb0620,                             //vcvt.f32.s32  d16, d16
+  0xf3601e89,                             //vcgt.f32      d17, d16, d9
+  0xf35311b2,                             //vbsl          d17, d19, d18
+  0xf3f42c08,                             //vdup.32       d18, d8[0]
+  0xf2600da1,                             //vsub.f32      d16, d16, d17
+  0xf3c71e1f,                             //vmov.i8       d17, #255
+  0xf26218a1,                             //vadd.i32      d17, d18, d17
+  0xf2e009c8,                             //vmul.f32      d16, d16, d8[0]
+  0xf2610d20,                             //vsub.f32      d16, d1, d16
+  0xf2201fa1,                             //vmin.f32      d1, d16, d17
+  0xecbd8b04,                             //vpop          {d8-d9}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_mirror_x_vfp4[] = {
+  0xed2d8b04,                             //vpush         {d8-d9}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2c03010,                             //vmov.i32      d19, #0
+  0xf4e24c9f,                             //vld1.32       {d20[]}, [r2 :32]
+  0xe2811008,                             //add           r1, r1, #8
+  0xed938a00,                             //vldr          s16, [r3]
+  0xee389a08,                             //vadd.f32      s18, s16, s16
+  0xf3f40c08,                             //vdup.32       d16, d8[0]
+  0xf2200d20,                             //vsub.f32      d0, d0, d16
+  0xeec08a89,                             //vdiv.f32      s17, s1, s18
+  0xee808a09,                             //vdiv.f32      s16, s0, s18
+  0xf3fb1708,                             //vcvt.s32.f32  d17, d8
+  0xf3fb1621,                             //vcvt.f32.s32  d17, d17
+  0xf3612e88,                             //vcgt.f32      d18, d17, d8
+  0xf35421b3,                             //vbsl          d18, d20, d19
+  0xf2611da2,                             //vsub.f32      d17, d17, d18
+  0xf3c72e1f,                             //vmov.i8       d18, #255
+  0xf2e119c9,                             //vmul.f32      d17, d17, d9[0]
+  0xf2601d21,                             //vsub.f32      d17, d0, d17
+  0xf2611da0,                             //vsub.f32      d17, d17, d16
+  0xf26008a2,                             //vadd.i32      d16, d16, d18
+  0xf3f91721,                             //vabs.f32      d17, d17
+  0xf2210fa0,                             //vmin.f32      d0, d17, d16
+  0xecbd8b04,                             //vpop          {d8-d9}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_mirror_y_vfp4[] = {
+  0xed2d8b04,                             //vpush         {d8-d9}
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xf2c03010,                             //vmov.i32      d19, #0
+  0xf4e24c9f,                             //vld1.32       {d20[]}, [r2 :32]
+  0xe2811008,                             //add           r1, r1, #8
+  0xed938a00,                             //vldr          s16, [r3]
+  0xee389a08,                             //vadd.f32      s18, s16, s16
+  0xf3f40c08,                             //vdup.32       d16, d8[0]
+  0xf2211d20,                             //vsub.f32      d1, d1, d16
+  0xeec18a89,                             //vdiv.f32      s17, s3, s18
+  0xee818a09,                             //vdiv.f32      s16, s2, s18
+  0xf3fb1708,                             //vcvt.s32.f32  d17, d8
+  0xf3fb1621,                             //vcvt.f32.s32  d17, d17
+  0xf3612e88,                             //vcgt.f32      d18, d17, d8
+  0xf35421b3,                             //vbsl          d18, d20, d19
+  0xf2611da2,                             //vsub.f32      d17, d17, d18
+  0xf3c72e1f,                             //vmov.i8       d18, #255
+  0xf2e119c9,                             //vmul.f32      d17, d17, d9[0]
+  0xf2611d21,                             //vsub.f32      d17, d1, d17
+  0xf2611da0,                             //vsub.f32      d17, d17, d16
+  0xf26008a2,                             //vadd.i32      d16, d16, d18
+  0xf3f91721,                             //vabs.f32      d17, d17
+  0xf2211fa0,                             //vmin.f32      d1, d17, d16
+  0xecbd8b04,                             //vpop          {d8-d9}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_matrix_2x3_vfp4[] = {
+  0xe92d4800,                             //push          {fp, lr}
+  0xe591e000,                             //ldr           lr, [r1]
+  0xe591c004,                             //ldr           ip, [r1, #4]
+  0xe2811008,                             //add           r1, r1, #8
+  0xe28e300c,                             //add           r3, lr, #12
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xe28e3008,                             //add           r3, lr, #8
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe28e3010,                             //add           r3, lr, #16
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xe28e3014,                             //add           r3, lr, #20
+  0xf2410c31,                             //vfma.f32      d16, d1, d17
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe28e3004,                             //add           r3, lr, #4
+  0xf2411c32,                             //vfma.f32      d17, d1, d18
+  0xf4ee2c9f,                             //vld1.32       {d18[]}, [lr :32]
+  0xf4e33c9f,                             //vld1.32       {d19[]}, [r3 :32]
+  0xf2400c32,                             //vfma.f32      d16, d0, d18
+  0xf2401c33,                             //vfma.f32      d17, d0, d19
+  0xf22001b0,                             //vorr          d0, d16, d16
+  0xf22111b1,                             //vorr          d1, d17, d17
+  0xe8bd4800,                             //pop           {fp, lr}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_matrix_3x4_vfp4[] = {
+  0xe92d4800,                             //push          {fp, lr}
+  0xe591e000,                             //ldr           lr, [r1]
+  0xe591c004,                             //ldr           ip, [r1, #4]
+  0xe2811008,                             //add           r1, r1, #8
+  0xe28e3020,                             //add           r3, lr, #32
+  0xf4e33c9f,                             //vld1.32       {d19[]}, [r3 :32]
+  0xe28e302c,                             //add           r3, lr, #44
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xe28e301c,                             //add           r3, lr, #28
+  0xf2420c33,                             //vfma.f32      d16, d2, d19
+  0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
+  0xe28e3018,                             //add           r3, lr, #24
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xe28e3024,                             //add           r3, lr, #36
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe28e3028,                             //add           r3, lr, #40
+  0xf2421c32,                             //vfma.f32      d17, d2, d18
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xe28e3010,                             //add           r3, lr, #16
+  0xf2422c34,                             //vfma.f32      d18, d2, d20
+  0xf4e33c9f,                             //vld1.32       {d19[]}, [r3 :32]
+  0xe28e300c,                             //add           r3, lr, #12
+  0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
+  0xe28e3014,                             //add           r3, lr, #20
+  0xf2411c34,                             //vfma.f32      d17, d1, d20
+  0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
+  0xf2410c34,                             //vfma.f32      d16, d1, d20
+  0xe28e3004,                             //add           r3, lr, #4
+  0xf2412c33,                             //vfma.f32      d18, d1, d19
+  0xf4ee3c9f,                             //vld1.32       {d19[]}, [lr :32]
+  0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
+  0xe28e3008,                             //add           r3, lr, #8
+  0xf2401c33,                             //vfma.f32      d17, d0, d19
+  0xf4e33c9f,                             //vld1.32       {d19[]}, [r3 :32]
+  0xf2400c33,                             //vfma.f32      d16, d0, d19
+  0xf2402c34,                             //vfma.f32      d18, d0, d20
+  0xf22101b1,                             //vorr          d0, d17, d17
+  0xf22021b0,                             //vorr          d2, d16, d16
+  0xf22211b2,                             //vorr          d1, d18, d18
+  0xe8bd4800,                             //pop           {fp, lr}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_matrix_perspective_vfp4[] = {
+  0xe92d4800,                             //push          {fp, lr}
+  0xe591e000,                             //ldr           lr, [r1]
+  0xe591c004,                             //ldr           ip, [r1, #4]
+  0xe2811008,                             //add           r1, r1, #8
+  0xe28e301c,                             //add           r3, lr, #28
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xe28e3020,                             //add           r3, lr, #32
+  0xf4e31c9f,                             //vld1.32       {d17[]}, [r3 :32]
+  0xe28e3018,                             //add           r3, lr, #24
+  0xf2411c30,                             //vfma.f32      d17, d1, d16
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xe28e3010,                             //add           r3, lr, #16
+  0xf2401c30,                             //vfma.f32      d17, d0, d16
+  0xf4e30c9f,                             //vld1.32       {d16[]}, [r3 :32]
+  0xe28e3004,                             //add           r3, lr, #4
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xe28e3008,                             //add           r3, lr, #8
+  0xf4e34c9f,                             //vld1.32       {d20[]}, [r3 :32]
+  0xe28e3014,                             //add           r3, lr, #20
+  0xf2414c32,                             //vfma.f32      d20, d1, d18
+  0xf4e32c9f,                             //vld1.32       {d18[]}, [r3 :32]
+  0xe28e300c,                             //add           r3, lr, #12
+  0xf3fb3521,                             //vrecpe.f32    d19, d17
+  0xf2412c30,                             //vfma.f32      d18, d1, d16
+  0xf4e35c9f,                             //vld1.32       {d21[]}, [r3 :32]
+  0xf2410fb3,                             //vrecps.f32    d16, d17, d19
+  0xf4ee1c9f,                             //vld1.32       {d17[]}, [lr :32]
+  0xf2404c31,                             //vfma.f32      d20, d0, d17
+  0xf2402c35,                             //vfma.f32      d18, d0, d21
+  0xf3430db0,                             //vmul.f32      d16, d19, d16
+  0xf3040db0,                             //vmul.f32      d0, d20, d16
+  0xf3021db0,                             //vmul.f32      d1, d18, d16
+  0xe8bd4800,                             //pop           {fp, lr}
+  0xe12fff1c,                             //bx            ip
+};
+
+CODE const uint32_t sk_linear_gradient_2stops_vfp4[] = {
+  0xe8911008,                             //ldm           r1, {r3, ip}
+  0xe2811008,                             //add           r1, r1, #8
+  0xf4632a0d,                             //vld1.8        {d18-d19}, [r3]!
+  0xf4634a0f,                             //vld1.8        {d20-d21}, [r3]
+  0xf3f40c22,                             //vdup.32       d16, d18[0]
+  0xf3f41c24,                             //vdup.32       d17, d20[0]
+  0xf2400c31,                             //vfma.f32      d16, d0, d17
+  0xf3fc6c24,                             //vdup.32       d22, d20[1]
+  0xf3bc1c22,                             //vdup.32       d1, d18[1]
+  0xf3b42c23,                             //vdup.32       d2, d19[0]
+  0xf2001c36,                             //vfma.f32      d1, d0, d22
+  0xf3f41c25,                             //vdup.32       d17, d21[0]
+  0xf3fc4c25,                             //vdup.32       d20, d21[1]
+  0xf2002c31,                             //vfma.f32      d2, d0, d17
+  0xf3bc3c23,                             //vdup.32       d3, d19[1]
+  0xf2003c34,                             //vfma.f32      d3, d0, d20
+  0xf22001b0,                             //vorr          d0, d16, d16
+  0xe12fff1c,                             //bx            ip
+};
+#elif defined(__x86_64__)
+
+CODE const uint8_t sk_start_pipeline_hsw[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  83,                                     //push          %rbx
+  73,137,205,                             //mov           %rcx,%r13
+  73,137,214,                             //mov           %rdx,%r14
+  72,137,251,                             //mov           %rdi,%rbx
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,199,                             //mov           %rax,%r15
+  73,137,244,                             //mov           %rsi,%r12
+  72,141,67,8,                            //lea           0x8(%rbx),%rax
+  76,57,232,                              //cmp           %r13,%rax
+  118,5,                                  //jbe           28 <_sk_start_pipeline_hsw+0x28>
+  72,137,223,                             //mov           %rbx,%rdi
+  235,65,                                 //jmp           69 <_sk_start_pipeline_hsw+0x69>
+  185,0,0,0,0,                            //mov           $0x0,%ecx
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  72,141,123,8,                           //lea           0x8(%rbx),%rdi
+  72,131,195,16,                          //add           $0x10,%rbx
+  76,57,235,                              //cmp           %r13,%rbx
+  72,137,251,                             //mov           %rdi,%rbx
+  118,191,                                //jbe           28 <_sk_start_pipeline_hsw+0x28>
+  76,137,233,                             //mov           %r13,%rcx
+  72,41,249,                              //sub           %rdi,%rcx
+  116,41,                                 //je            9a <_sk_start_pipeline_hsw+0x9a>
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  76,137,232,                             //mov           %r13,%rax
+  91,                                     //pop           %rbx
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  197,248,119,                            //vzeroupper
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_hsw[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,249,110,199,                        //vmovd         %edi,%xmm0
+  196,226,125,24,192,                     //vbroadcastss  %xmm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,4,                    //vbroadcastss  0x4(%rdx),%ymm1
+  197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
+  197,252,88,66,20,                       //vaddps        0x14(%rdx),%ymm0,%ymm0
+  196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
+  196,226,125,24,18,                      //vbroadcastss  (%rdx),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__hsw[] = {
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
+  196,194,93,184,192,                     //vfmadd231ps   %ymm8,%ymm4,%ymm0
+  196,194,85,184,200,                     //vfmadd231ps   %ymm8,%ymm5,%ymm1
+  196,194,77,184,208,                     //vfmadd231ps   %ymm8,%ymm6,%ymm2
+  196,194,69,184,216,                     //vfmadd231ps   %ymm8,%ymm7,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
+  196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
+  196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_hsw[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
+  196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
+  196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
+  196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
+  197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
+  197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_hsw[] = {
+  197,124,40,192,                         //vmovaps       %ymm0,%ymm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,194,                         //vmovaps       %ymm2,%ymm0
+  197,124,41,194,                         //vmovaps       %ymm8,%ymm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_hsw[] = {
+  197,124,40,195,                         //vmovaps       %ymm3,%ymm8
+  197,124,40,202,                         //vmovaps       %ymm2,%ymm9
+  197,124,40,209,                         //vmovaps       %ymm1,%ymm10
+  197,124,40,216,                         //vmovaps       %ymm0,%ymm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  197,124,41,220,                         //vmovaps       %ymm11,%ymm4
+  197,124,41,213,                         //vmovaps       %ymm10,%ymm5
+  197,124,41,206,                         //vmovaps       %ymm9,%ymm6
+  197,124,41,199,                         //vmovaps       %ymm8,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,224,                         //vmovaps       %ymm0,%ymm4
+  197,252,40,233,                         //vmovaps       %ymm1,%ymm5
+  197,252,40,242,                         //vmovaps       %ymm2,%ymm6
+  197,252,40,251,                         //vmovaps       %ymm3,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_hsw[] = {
+  197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
+  197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
+  197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_hsw[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9
+  196,98,125,24,18,                       //vbroadcastss  (%rdx),%ymm10
+  197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10
+  196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_hsw[] = {
+  196,98,125,24,66,64,                    //vbroadcastss  0x40(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10
+  196,98,125,24,90,60,                    //vbroadcastss  0x3c(%rdx),%ymm11
+  196,98,125,24,98,56,                    //vbroadcastss  0x38(%rdx),%ymm12
+  196,65,124,40,235,                      //vmovaps       %ymm11,%ymm13
+  196,66,125,168,236,                     //vfmadd213ps   %ymm12,%ymm0,%ymm13
+  196,98,125,24,114,52,                   //vbroadcastss  0x34(%rdx),%ymm14
+  196,66,45,168,238,                      //vfmadd213ps   %ymm14,%ymm10,%ymm13
+  196,98,125,24,82,68,                    //vbroadcastss  0x44(%rdx),%ymm10
+  196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0
+  196,195,21,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm13,%ymm0
+  197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
+  197,116,89,233,                         //vmulps        %ymm1,%ymm1,%ymm13
+  196,65,124,40,251,                      //vmovaps       %ymm11,%ymm15
+  196,66,117,168,252,                     //vfmadd213ps   %ymm12,%ymm1,%ymm15
+  196,66,21,168,254,                      //vfmadd213ps   %ymm14,%ymm13,%ymm15
+  196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
+  196,195,5,74,201,16,                    //vblendvps     %ymm1,%ymm9,%ymm15,%ymm1
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
+  196,66,109,168,220,                     //vfmadd213ps   %ymm12,%ymm2,%ymm11
+  196,66,53,168,222,                      //vfmadd213ps   %ymm14,%ymm9,%ymm11
+  196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
+  196,195,37,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_hsw[] = {
+  197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8
+  196,65,124,83,200,                      //vrcpps        %ymm8,%ymm9
+  196,65,124,82,208,                      //vrsqrtps      %ymm8,%ymm10
+  196,98,125,24,66,72,                    //vbroadcastss  0x48(%rdx),%ymm8
+  197,60,89,216,                          //vmulps        %ymm0,%ymm8,%ymm11
+  196,98,125,24,34,                       //vbroadcastss  (%rdx),%ymm12
+  196,98,125,24,106,76,                   //vbroadcastss  0x4c(%rdx),%ymm13
+  196,98,125,24,114,80,                   //vbroadcastss  0x50(%rdx),%ymm14
+  196,98,125,24,122,84,                   //vbroadcastss  0x54(%rdx),%ymm15
+  196,66,13,168,207,                      //vfmadd213ps   %ymm15,%ymm14,%ymm9
+  196,66,21,184,202,                      //vfmadd231ps   %ymm10,%ymm13,%ymm9
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  196,98,125,24,82,88,                    //vbroadcastss  0x58(%rdx),%ymm10
+  196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0
+  196,195,53,74,195,0,                    //vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
+  197,124,82,201,                         //vrsqrtps      %ymm1,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,66,13,168,223,                      //vfmadd213ps   %ymm15,%ymm14,%ymm11
+  196,66,21,184,217,                      //vfmadd231ps   %ymm9,%ymm13,%ymm11
+  197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
+  196,65,28,93,219,                       //vminps        %ymm11,%ymm12,%ymm11
+  196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
+  196,195,37,74,201,16,                   //vblendvps     %ymm1,%ymm9,%ymm11,%ymm1
+  197,124,82,202,                         //vrsqrtps      %ymm2,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,66,13,168,223,                      //vfmadd213ps   %ymm15,%ymm14,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,66,21,184,217,                      //vfmadd231ps   %ymm9,%ymm13,%ymm11
+  196,65,28,93,203,                       //vminps        %ymm11,%ymm12,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
+  196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,48,                                 //jne           41a <_sk_scale_u8_hsw+0x40>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           422 <_sk_scale_u8_hsw+0x48>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  235,175,                                //jmp           3ee <_sk_scale_u8_hsw+0x14>
+};
+
+CODE const uint8_t sk_lerp_1_float_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,68,                                 //jne           4c2 <_sk_lerp_u8_hsw+0x54>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           4ca <_sk_lerp_u8_hsw+0x5c>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  235,155,                                //jmp           482 <_sk_lerp_u8_hsw+0x14>
+};
+
+CODE const uint8_t sk_lerp_565_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,123,                                //jne           56c <_sk_lerp_565_hsw+0x85>
+  196,193,122,111,28,122,                 //vmovdqu       (%r10,%rdi,2),%xmm3
+  196,226,125,51,219,                     //vpmovzxwd     %xmm3,%ymm3
+  196,98,125,88,66,104,                   //vpbroadcastd  0x68(%rdx),%ymm8
+  197,61,219,195,                         //vpand         %ymm3,%ymm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,116,                   //vbroadcastss  0x74(%rdx),%ymm9
+  196,65,52,89,192,                       //vmulps        %ymm8,%ymm9,%ymm8
+  196,98,125,88,74,108,                   //vpbroadcastd  0x6c(%rdx),%ymm9
+  197,53,219,203,                         //vpand         %ymm3,%ymm9,%ymm9
+  196,65,124,91,201,                      //vcvtdq2ps     %ymm9,%ymm9
+  196,98,125,24,82,120,                   //vbroadcastss  0x78(%rdx),%ymm10
+  196,65,44,89,201,                       //vmulps        %ymm9,%ymm10,%ymm9
+  196,98,125,88,82,112,                   //vpbroadcastd  0x70(%rdx),%ymm10
+  197,173,219,219,                        //vpand         %ymm3,%ymm10,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,82,124,                   //vbroadcastss  0x7c(%rdx),%ymm10
+  197,172,89,219,                         //vmulps        %ymm3,%ymm10,%ymm3
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,226,53,168,205,                     //vfmadd213ps   %ymm5,%ymm9,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,226,101,168,214,                    //vfmadd213ps   %ymm6,%ymm3,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  15,135,111,255,255,255,                 //ja            4f7 <_sk_lerp_565_hsw+0x10>
+  76,141,13,73,0,0,0,                     //lea           0x49(%rip),%r9        # 5d8 <_sk_lerp_565_hsw+0xf1>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  196,193,97,196,92,122,12,6,             //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,10,5,             //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,8,4,              //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,6,3,              //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,4,2,              //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,2,1,              //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,28,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
+  233,31,255,255,255,                     //jmpq          4f7 <_sk_lerp_565_hsw+0x10>
+  244,                                    //hlt
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  236,                                    //in            (%dx),%al
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,228,                                //jmpq          *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  220,255,                                //fdivr         %st,%st(7)
+  255,                                    //(bad)
+  255,212,                                //callq         *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,204,                                //dec           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,192,                                //inc           %eax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_tables_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
+  76,3,8,                                 //add           (%rax),%r9
+  77,133,192,                             //test          %r8,%r8
+  117,106,                                //jne           673 <_sk_load_tables_hsw+0x7f>
+  196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3
+  196,226,125,88,82,16,                   //vpbroadcastd  0x10(%rdx),%ymm2
+  197,237,219,203,                        //vpand         %ymm3,%ymm2,%ymm1
+  196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
+  72,139,72,8,                            //mov           0x8(%rax),%rcx
+  76,139,72,16,                           //mov           0x10(%rax),%r9
+  196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
+  196,226,53,146,4,137,                   //vgatherdps    %ymm9,(%rcx,%ymm1,4),%ymm0
+  197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
+  197,109,219,201,                        //vpand         %ymm1,%ymm2,%ymm9
+  196,65,45,118,210,                      //vpcmpeqd      %ymm10,%ymm10,%ymm10
+  196,130,45,146,12,137,                  //vgatherdps    %ymm10,(%r9,%ymm9,4),%ymm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
+  196,65,109,219,201,                     //vpand         %ymm9,%ymm2,%ymm9
+  196,162,61,146,20,136,                  //vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
+  197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  185,8,0,0,0,                            //mov           $0x8,%ecx
+  68,41,193,                              //sub           %r8d,%ecx
+  192,225,3,                              //shl           $0x3,%cl
+  73,199,194,255,255,255,255,             //mov           $0xffffffffffffffff,%r10
+  73,211,234,                             //shr           %cl,%r10
+  196,193,249,110,194,                    //vmovq         %r10,%xmm0
+  196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0
+  196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3
+  233,114,255,255,255,                    //jmpq          60e <_sk_load_tables_hsw+0x1a>
+};
+
+CODE const uint8_t sk_load_a8_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,42,                                 //jne           6d6 <_sk_load_a8_hsw+0x3a>
+  197,251,16,0,                           //vmovsd        (%rax),%xmm0
+  196,226,125,49,192,                     //vpmovzxbd     %xmm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,12,                   //vbroadcastss  0xc(%rdx),%ymm1
+  197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           6de <_sk_load_a8_hsw+0x42>
+  196,193,249,110,193,                    //vmovq         %r9,%xmm0
+  235,181,                                //jmp           6b0 <_sk_load_a8_hsw+0x14>
+};
+
+CODE const uint8_t sk_store_a8_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           72e <_sk_store_a8_hsw+0x33>
+  196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            72a <_sk_store_a8_hsw+0x2f>
+  196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
+  76,141,21,66,0,0,0,                     //lea           0x42(%rip),%r10        # 78c <_sk_store_a8_hsw+0x91>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
+  196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
+  196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
+  196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
+  196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
+  196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
+  196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
+  235,158,                                //jmp           72a <_sk_store_a8_hsw+0x2f>
+  247,255,                                //idiv          %edi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  239,                                    //out           %eax,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,231,                                //jmpq          *%rdi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  223,255,                                //(bad)
+  255,                                    //(bad)
+  255,215,                                //callq         *%rdi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,207,                                //dec           %edi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,199,                                //inc           %edi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_565_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,92,                                 //jne           80e <_sk_load_565_hsw+0x66>
+  196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
+  196,226,125,51,208,                     //vpmovzxwd     %xmm0,%ymm2
+  196,226,125,88,66,104,                  //vpbroadcastd  0x68(%rdx),%ymm0
+  197,253,219,194,                        //vpand         %ymm2,%ymm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,116,                  //vbroadcastss  0x74(%rdx),%ymm1
+  197,244,89,192,                         //vmulps        %ymm0,%ymm1,%ymm0
+  196,226,125,88,74,108,                  //vpbroadcastd  0x6c(%rdx),%ymm1
+  197,245,219,202,                        //vpand         %ymm2,%ymm1,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  196,226,125,24,90,120,                  //vbroadcastss  0x78(%rdx),%ymm3
+  197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
+  196,226,125,88,90,112,                  //vpbroadcastd  0x70(%rdx),%ymm3
+  197,229,219,210,                        //vpand         %ymm2,%ymm3,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  196,226,125,24,90,124,                  //vbroadcastss  0x7c(%rdx),%ymm3
+  197,228,89,210,                         //vmulps        %ymm2,%ymm3,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,146,                                //ja            7b8 <_sk_load_565_hsw+0x10>
+  76,141,13,75,0,0,0,                     //lea           0x4b(%rip),%r9        # 878 <_sk_load_565_hsw+0xd0>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
+  233,66,255,255,255,                     //jmpq          7b8 <_sk_load_565_hsw+0x10>
+  102,144,                                //xchg          %ax,%ax
+  242,255,                                //repnz         (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  234,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,226,                                //jmpq          *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  218,255,                                //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,202,                                //dec           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  190,                                    //.byte         0xbe
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_store_565_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,130,128,0,0,0,            //vbroadcastss  0x80(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  196,193,53,114,241,11,                  //vpslld        $0xb,%ymm9,%ymm9
+  196,98,125,24,146,132,0,0,0,            //vbroadcastss  0x84(%rdx),%ymm10
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,45,114,242,5,                   //vpslld        $0x5,%ymm10,%ymm10
+  196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
+  196,67,125,57,193,1,                    //vextracti128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           8f6 <_sk_store_565_hsw+0x62>
+  196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            8f2 <_sk_store_565_hsw+0x5e>
+  76,141,21,71,0,0,0,                     //lea           0x47(%rip),%r10        # 954 <_sk_store_565_hsw+0xc0>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
+  196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
+  196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
+  196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
+  196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
+  196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
+  197,121,126,192,                        //vmovd         %xmm8,%eax
+  102,65,137,4,121,                       //mov           %ax,(%r9,%rdi,2)
+  235,161,                                //jmp           8f2 <_sk_store_565_hsw+0x5e>
+  15,31,0,                                //nopl          (%rax)
+  242,255,                                //repnz         (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  234,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,226,                                //jmpq          *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  218,255,                                //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,202,                                //dec           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,194,                                //inc           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_8888_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
+  76,3,8,                                 //add           (%rax),%r9
+  77,133,192,                             //test          %r8,%r8
+  117,85,                                 //jne           9da <_sk_load_8888_hsw+0x6a>
+  196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3
+  196,226,125,88,82,16,                   //vpbroadcastd  0x10(%rdx),%ymm2
+  197,237,219,195,                        //vpand         %ymm3,%ymm2,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
+  197,237,219,201,                        //vpand         %ymm1,%ymm2,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
+  196,193,109,219,209,                    //vpand         %ymm9,%ymm2,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  185,8,0,0,0,                            //mov           $0x8,%ecx
+  68,41,193,                              //sub           %r8d,%ecx
+  192,225,3,                              //shl           $0x3,%cl
+  72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
+  72,211,232,                             //shr           %cl,%rax
+  196,225,249,110,192,                    //vmovq         %rax,%xmm0
+  196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0
+  196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3
+  235,138,                                //jmp           98a <_sk_load_8888_hsw+0x1a>
+};
+
+CODE const uint8_t sk_store_8888_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
+  76,3,8,                                 //add           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,45,114,242,8,                   //vpslld        $0x8,%ymm10,%ymm10
+  196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
+  197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,45,114,242,16,                  //vpslld        $0x10,%ymm10,%ymm10
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,193,61,114,240,24,                  //vpslld        $0x18,%ymm8,%ymm8
+  196,65,45,235,192,                      //vpor          %ymm8,%ymm10,%ymm8
+  196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
+  77,133,192,                             //test          %r8,%r8
+  117,12,                                 //jne           a6c <_sk_store_8888_hsw+0x6c>
+  196,65,126,127,1,                       //vmovdqu       %ymm8,(%r9)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  185,8,0,0,0,                            //mov           $0x8,%ecx
+  68,41,193,                              //sub           %r8d,%ecx
+  192,225,3,                              //shl           $0x3,%cl
+  72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
+  72,211,232,                             //shr           %cl,%rax
+  196,97,249,110,200,                     //vmovq         %rax,%xmm9
+  196,66,125,33,201,                      //vpmovsxbd     %xmm9,%ymm9
+  196,66,53,142,1,                        //vpmaskmovd    %ymm8,%ymm9,(%r9)
+  235,211,                                //jmp           a65 <_sk_store_8888_hsw+0x65>
+};
+
+CODE const uint8_t sk_load_f16_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,133,201,                             //test          %rcx,%rcx
+  117,97,                                 //jne           afd <_sk_load_f16_hsw+0x6b>
+  197,249,16,12,248,                      //vmovupd       (%rax,%rdi,8),%xmm1
+  197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
+  197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
+  197,121,16,68,248,48,                   //vmovupd       0x30(%rax,%rdi,8),%xmm8
+  197,241,97,194,                         //vpunpcklwd    %xmm2,%xmm1,%xmm0
+  197,241,105,202,                        //vpunpckhwd    %xmm2,%xmm1,%xmm1
+  196,193,97,97,208,                      //vpunpcklwd    %xmm8,%xmm3,%xmm2
+  196,193,97,105,216,                     //vpunpckhwd    %xmm8,%xmm3,%xmm3
+  197,121,97,193,                         //vpunpcklwd    %xmm1,%xmm0,%xmm8
+  197,121,105,201,                        //vpunpckhwd    %xmm1,%xmm0,%xmm9
+  197,233,97,203,                         //vpunpcklwd    %xmm3,%xmm2,%xmm1
+  197,233,105,219,                        //vpunpckhwd    %xmm3,%xmm2,%xmm3
+  197,185,108,193,                        //vpunpcklqdq   %xmm1,%xmm8,%xmm0
+  196,226,125,19,192,                     //vcvtph2ps     %xmm0,%ymm0
+  197,185,109,201,                        //vpunpckhqdq   %xmm1,%xmm8,%xmm1
+  196,226,125,19,201,                     //vcvtph2ps     %xmm1,%ymm1
+  197,177,108,211,                        //vpunpcklqdq   %xmm3,%xmm9,%xmm2
+  196,226,125,19,210,                     //vcvtph2ps     %xmm2,%ymm2
+  197,177,109,219,                        //vpunpckhqdq   %xmm3,%xmm9,%xmm3
+  196,226,125,19,219,                     //vcvtph2ps     %xmm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,251,16,12,248,                      //vmovsd        (%rax,%rdi,8),%xmm1
+  196,65,57,87,192,                       //vxorpd        %xmm8,%xmm8,%xmm8
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  117,6,                                  //jne           b13 <_sk_load_f16_hsw+0x81>
+  197,250,126,201,                        //vmovq         %xmm1,%xmm1
+  235,30,                                 //jmp           b31 <_sk_load_f16_hsw+0x9f>
+  197,241,22,76,248,8,                    //vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,18,                                 //jb            b31 <_sk_load_f16_hsw+0x9f>
+  197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  117,19,                                 //jne           b3e <_sk_load_f16_hsw+0xac>
+  197,250,126,210,                        //vmovq         %xmm2,%xmm2
+  235,46,                                 //jmp           b5f <_sk_load_f16_hsw+0xcd>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
+  233,117,255,255,255,                    //jmpq          ab3 <_sk_load_f16_hsw+0x21>
+  197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,21,                                 //jb            b5f <_sk_load_f16_hsw+0xcd>
+  197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  117,18,                                 //jne           b68 <_sk_load_f16_hsw+0xd6>
+  197,250,126,219,                        //vmovq         %xmm3,%xmm3
+  233,84,255,255,255,                     //jmpq          ab3 <_sk_load_f16_hsw+0x21>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  233,75,255,255,255,                     //jmpq          ab3 <_sk_load_f16_hsw+0x21>
+  197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  15,130,59,255,255,255,                  //jb            ab3 <_sk_load_f16_hsw+0x21>
+  197,123,16,68,248,48,                   //vmovsd        0x30(%rax,%rdi,8),%xmm8
+  233,48,255,255,255,                     //jmpq          ab3 <_sk_load_f16_hsw+0x21>
+};
+
+CODE const uint8_t sk_store_f16_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  196,195,125,29,192,4,                   //vcvtps2ph     $0x4,%ymm0,%xmm8
+  196,195,125,29,201,4,                   //vcvtps2ph     $0x4,%ymm1,%xmm9
+  196,195,125,29,210,4,                   //vcvtps2ph     $0x4,%ymm2,%xmm10
+  196,195,125,29,219,4,                   //vcvtps2ph     $0x4,%ymm3,%xmm11
+  196,65,57,97,225,                       //vpunpcklwd    %xmm9,%xmm8,%xmm12
+  196,65,57,105,193,                      //vpunpckhwd    %xmm9,%xmm8,%xmm8
+  196,65,41,97,203,                       //vpunpcklwd    %xmm11,%xmm10,%xmm9
+  196,65,41,105,235,                      //vpunpckhwd    %xmm11,%xmm10,%xmm13
+  196,65,25,98,217,                       //vpunpckldq    %xmm9,%xmm12,%xmm11
+  196,65,25,106,209,                      //vpunpckhdq    %xmm9,%xmm12,%xmm10
+  196,65,57,98,205,                       //vpunpckldq    %xmm13,%xmm8,%xmm9
+  196,65,57,106,197,                      //vpunpckhdq    %xmm13,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,27,                                 //jne           be8 <_sk_store_f16_hsw+0x65>
+  197,120,17,28,248,                      //vmovups       %xmm11,(%rax,%rdi,8)
+  197,120,17,84,248,16,                   //vmovups       %xmm10,0x10(%rax,%rdi,8)
+  197,120,17,76,248,32,                   //vmovups       %xmm9,0x20(%rax,%rdi,8)
+  197,122,127,68,248,48,                  //vmovdqu       %xmm8,0x30(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,121,214,28,248,                     //vmovq         %xmm11,(%rax,%rdi,8)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,241,                                //je            be4 <_sk_store_f16_hsw+0x61>
+  197,121,23,92,248,8,                    //vmovhpd       %xmm11,0x8(%rax,%rdi,8)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,229,                                //jb            be4 <_sk_store_f16_hsw+0x61>
+  197,121,214,84,248,16,                  //vmovq         %xmm10,0x10(%rax,%rdi,8)
+  116,221,                                //je            be4 <_sk_store_f16_hsw+0x61>
+  197,121,23,84,248,24,                   //vmovhpd       %xmm10,0x18(%rax,%rdi,8)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,209,                                //jb            be4 <_sk_store_f16_hsw+0x61>
+  197,121,214,76,248,32,                  //vmovq         %xmm9,0x20(%rax,%rdi,8)
+  116,201,                                //je            be4 <_sk_store_f16_hsw+0x61>
+  197,121,23,76,248,40,                   //vmovhpd       %xmm9,0x28(%rax,%rdi,8)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,189,                                //jb            be4 <_sk_store_f16_hsw+0x61>
+  197,121,214,68,248,48,                  //vmovq         %xmm8,0x30(%rax,%rdi,8)
+  235,181,                                //jmp           be4 <_sk_store_f16_hsw+0x61>
+};
+
+CODE const uint8_t sk_store_f32_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,0,                               //mov           (%rax),%r8
+  72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
+  197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
+  197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
+  197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
+  197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
+  196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
+  196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
+  196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
+  196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
+  72,133,201,                             //test          %rcx,%rcx
+  117,55,                                 //jne           c9c <_sk_store_f32_hsw+0x6d>
+  196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
+  196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
+  196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
+  196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,240,                                //je            c98 <_sk_store_f32_hsw+0x69>
+  196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,227,                                //jb            c98 <_sk_store_f32_hsw+0x69>
+  196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
+  116,218,                                //je            c98 <_sk_store_f32_hsw+0x69>
+  196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,205,                                //jb            c98 <_sk_store_f32_hsw+0x69>
+  196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  116,195,                                //je            c98 <_sk_store_f32_hsw+0x69>
+  196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,181,                                //jb            c98 <_sk_store_f32_hsw+0x69>
+  196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  235,171,                                //jmp           c98 <_sk_store_f32_hsw+0x69>
+};
+
+CODE const uint8_t sk_clamp_x_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,188,95,192,                         //vmaxps        %ymm0,%ymm8,%ymm0
+  196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
+  196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
+  196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,188,95,201,                         //vmaxps        %ymm1,%ymm8,%ymm1
+  196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
+  196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
+  196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,98,61,172,200,                      //vfnmadd213ps  %ymm0,%ymm8,%ymm9
+  197,253,118,192,                        //vpcmpeqd      %ymm0,%ymm0,%ymm0
+  197,189,254,192,                        //vpaddd        %ymm0,%ymm8,%ymm0
+  197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,98,61,172,201,                      //vfnmadd213ps  %ymm1,%ymm8,%ymm9
+  197,245,118,201,                        //vpcmpeqd      %ymm1,%ymm1,%ymm1
+  197,189,254,201,                        //vpaddd        %ymm1,%ymm8,%ymm1
+  197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
+  196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
+  196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
+  196,226,125,24,192,                     //vbroadcastss  %xmm0,%ymm0
+  197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  196,66,125,172,194,                     //vfnmadd213ps  %ymm10,%ymm0,%ymm8
+  196,193,60,92,193,                      //vsubps        %ymm9,%ymm8,%ymm0
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
+  197,188,84,192,                         //vandps        %ymm0,%ymm8,%ymm0
+  196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
+  196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
+  196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
+  196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
+  196,226,125,24,201,                     //vbroadcastss  %xmm1,%ymm1
+  197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  196,66,117,172,194,                     //vfnmadd213ps  %ymm10,%ymm1,%ymm8
+  196,193,60,92,201,                      //vsubps        %ymm9,%ymm8,%ymm1
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
+  197,188,84,201,                         //vandps        %ymm1,%ymm8,%ymm1
+  196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
+  196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  196,98,125,24,64,16,                    //vbroadcastss  0x10(%rax),%ymm8
+  196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
+  196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
+  196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
+  196,98,125,24,88,12,                    //vbroadcastss  0xc(%rax),%ymm11
+  196,98,125,24,72,20,                    //vbroadcastss  0x14(%rax),%ymm9
+  196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
+  196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  197,124,41,201,                         //vmovaps       %ymm9,%ymm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
+  196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
+  196,98,125,24,88,24,                    //vbroadcastss  0x18(%rax),%ymm11
+  196,98,125,24,64,36,                    //vbroadcastss  0x24(%rax),%ymm8
+  196,66,109,184,195,                     //vfmadd231ps   %ymm11,%ymm2,%ymm8
+  196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
+  196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
+  196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
+  196,98,125,24,88,16,                    //vbroadcastss  0x10(%rax),%ymm11
+  196,98,125,24,96,28,                    //vbroadcastss  0x1c(%rax),%ymm12
+  196,98,125,24,72,40,                    //vbroadcastss  0x28(%rax),%ymm9
+  196,66,109,184,204,                     //vfmadd231ps   %ymm12,%ymm2,%ymm9
+  196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
+  196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
+  196,98,125,24,88,8,                     //vbroadcastss  0x8(%rax),%ymm11
+  196,98,125,24,96,20,                    //vbroadcastss  0x14(%rax),%ymm12
+  196,98,125,24,104,32,                   //vbroadcastss  0x20(%rax),%ymm13
+  196,98,125,24,80,44,                    //vbroadcastss  0x2c(%rax),%ymm10
+  196,66,109,184,213,                     //vfmadd231ps   %ymm13,%ymm2,%ymm10
+  196,66,117,184,212,                     //vfmadd231ps   %ymm12,%ymm1,%ymm10
+  196,66,125,184,211,                     //vfmadd231ps   %ymm11,%ymm0,%ymm10
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  197,124,41,201,                         //vmovaps       %ymm9,%ymm1
+  197,124,41,210,                         //vmovaps       %ymm10,%ymm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  196,66,117,184,209,                     //vfmadd231ps   %ymm9,%ymm1,%ymm10
+  196,66,125,184,208,                     //vfmadd231ps   %ymm8,%ymm0,%ymm10
+  196,98,125,24,64,12,                    //vbroadcastss  0xc(%rax),%ymm8
+  196,98,125,24,72,16,                    //vbroadcastss  0x10(%rax),%ymm9
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  196,66,117,184,217,                     //vfmadd231ps   %ymm9,%ymm1,%ymm11
+  196,66,125,184,216,                     //vfmadd231ps   %ymm8,%ymm0,%ymm11
+  196,98,125,24,64,24,                    //vbroadcastss  0x18(%rax),%ymm8
+  196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
+  196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
+  196,66,117,184,225,                     //vfmadd231ps   %ymm9,%ymm1,%ymm12
+  196,66,125,184,224,                     //vfmadd231ps   %ymm8,%ymm0,%ymm12
+  196,193,124,83,204,                     //vrcpps        %ymm12,%ymm1
+  197,172,89,193,                         //vmulps        %ymm1,%ymm10,%ymm0
+  197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,184,193,                     //vfmadd231ps   %ymm1,%ymm0,%ymm8
+  196,226,125,24,80,20,                   //vbroadcastss  0x14(%rax),%ymm2
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,184,202,                    //vfmadd231ps   %ymm2,%ymm0,%ymm1
+  196,226,125,24,88,24,                   //vbroadcastss  0x18(%rax),%ymm3
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  196,226,125,184,211,                    //vfmadd231ps   %ymm3,%ymm0,%ymm2
+  196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
+  196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
+  196,194,125,184,217,                    //vfmadd231ps   %ymm9,%ymm0,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_start_pipeline_avx[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  83,                                     //push          %rbx
+  73,137,205,                             //mov           %rcx,%r13
+  73,137,214,                             //mov           %rdx,%r14
+  72,137,251,                             //mov           %rdi,%rbx
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,199,                             //mov           %rax,%r15
+  73,137,244,                             //mov           %rsi,%r12
+  72,141,67,8,                            //lea           0x8(%rbx),%rax
+  76,57,232,                              //cmp           %r13,%rax
+  118,5,                                  //jbe           28 <_sk_start_pipeline_avx+0x28>
+  72,137,223,                             //mov           %rbx,%rdi
+  235,65,                                 //jmp           69 <_sk_start_pipeline_avx+0x69>
+  185,0,0,0,0,                            //mov           $0x0,%ecx
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  72,141,123,8,                           //lea           0x8(%rbx),%rdi
+  72,131,195,16,                          //add           $0x10,%rbx
+  76,57,235,                              //cmp           %r13,%rbx
+  72,137,251,                             //mov           %rdi,%rbx
+  118,191,                                //jbe           28 <_sk_start_pipeline_avx+0x28>
+  76,137,233,                             //mov           %r13,%rcx
+  72,41,249,                              //sub           %rdi,%rcx
+  116,41,                                 //je            9a <_sk_start_pipeline_avx+0x9a>
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  76,137,232,                             //mov           %r13,%rax
+  91,                                     //pop           %rbx
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  197,248,119,                            //vzeroupper
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_avx[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,249,110,199,                        //vmovd         %edi,%xmm0
+  197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,4,                    //vbroadcastss  0x4(%rdx),%ymm1
+  197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
+  197,252,88,66,20,                       //vaddps        0x14(%rdx),%ymm0,%ymm0
+  196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
+  196,226,125,24,18,                      //vbroadcastss  (%rdx),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__avx[] = {
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
+  197,60,89,204,                          //vmulps        %ymm4,%ymm8,%ymm9
+  197,180,88,192,                         //vaddps        %ymm0,%ymm9,%ymm0
+  197,60,89,205,                          //vmulps        %ymm5,%ymm8,%ymm9
+  197,180,88,201,                         //vaddps        %ymm1,%ymm9,%ymm1
+  197,60,89,206,                          //vmulps        %ymm6,%ymm8,%ymm9
+  197,180,88,210,                         //vaddps        %ymm2,%ymm9,%ymm2
+  197,60,89,199,                          //vmulps        %ymm7,%ymm8,%ymm8
+  197,188,88,219,                         //vaddps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_avx[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
+  196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
+  196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
+  196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
+  197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
+  197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_avx[] = {
+  197,124,40,192,                         //vmovaps       %ymm0,%ymm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,194,                         //vmovaps       %ymm2,%ymm0
+  197,124,41,194,                         //vmovaps       %ymm8,%ymm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_avx[] = {
+  197,124,40,195,                         //vmovaps       %ymm3,%ymm8
+  197,124,40,202,                         //vmovaps       %ymm2,%ymm9
+  197,124,40,209,                         //vmovaps       %ymm1,%ymm10
+  197,124,40,216,                         //vmovaps       %ymm0,%ymm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  197,124,41,220,                         //vmovaps       %ymm11,%ymm4
+  197,124,41,213,                         //vmovaps       %ymm10,%ymm5
+  197,124,41,206,                         //vmovaps       %ymm9,%ymm6
+  197,124,41,199,                         //vmovaps       %ymm8,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,224,                         //vmovaps       %ymm0,%ymm4
+  197,252,40,233,                         //vmovaps       %ymm1,%ymm5
+  197,252,40,242,                         //vmovaps       %ymm2,%ymm6
+  197,252,40,251,                         //vmovaps       %ymm3,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_avx[] = {
+  197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
+  197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
+  197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_avx[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9
+  196,98,125,24,18,                       //vbroadcastss  (%rdx),%ymm10
+  197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10
+  196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_avx[] = {
+  196,98,125,24,66,64,                    //vbroadcastss  0x40(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10
+  196,98,125,24,90,60,                    //vbroadcastss  0x3c(%rdx),%ymm11
+  196,98,125,24,98,56,                    //vbroadcastss  0x38(%rdx),%ymm12
+  197,36,89,232,                          //vmulps        %ymm0,%ymm11,%ymm13
+  196,65,20,88,236,                       //vaddps        %ymm12,%ymm13,%ymm13
+  196,98,125,24,114,52,                   //vbroadcastss  0x34(%rdx),%ymm14
+  196,65,44,89,213,                       //vmulps        %ymm13,%ymm10,%ymm10
+  196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
+  196,98,125,24,106,68,                   //vbroadcastss  0x44(%rdx),%ymm13
+  196,193,124,194,197,1,                  //vcmpltps      %ymm13,%ymm0,%ymm0
+  196,195,45,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
+  197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
+  197,116,89,209,                         //vmulps        %ymm1,%ymm1,%ymm10
+  197,36,89,249,                          //vmulps        %ymm1,%ymm11,%ymm15
+  196,65,4,88,252,                        //vaddps        %ymm12,%ymm15,%ymm15
+  196,65,44,89,215,                       //vmulps        %ymm15,%ymm10,%ymm10
+  196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
+  196,193,116,194,205,1,                  //vcmpltps      %ymm13,%ymm1,%ymm1
+  196,195,45,74,201,16,                   //vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
+  197,36,89,210,                          //vmulps        %ymm2,%ymm11,%ymm10
+  196,65,44,88,212,                       //vaddps        %ymm12,%ymm10,%ymm10
+  196,65,52,89,202,                       //vmulps        %ymm10,%ymm9,%ymm9
+  196,65,12,88,201,                       //vaddps        %ymm9,%ymm14,%ymm9
+  196,193,108,194,213,1,                  //vcmpltps      %ymm13,%ymm2,%ymm2
+  196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_avx[] = {
+  197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8
+  196,65,124,83,200,                      //vrcpps        %ymm8,%ymm9
+  196,65,124,82,208,                      //vrsqrtps      %ymm8,%ymm10
+  196,98,125,24,66,72,                    //vbroadcastss  0x48(%rdx),%ymm8
+  197,60,89,216,                          //vmulps        %ymm0,%ymm8,%ymm11
+  196,98,125,24,34,                       //vbroadcastss  (%rdx),%ymm12
+  196,98,125,24,106,76,                   //vbroadcastss  0x4c(%rdx),%ymm13
+  196,98,125,24,114,80,                   //vbroadcastss  0x50(%rdx),%ymm14
+  196,98,125,24,122,84,                   //vbroadcastss  0x54(%rdx),%ymm15
+  196,65,52,89,206,                       //vmulps        %ymm14,%ymm9,%ymm9
+  196,65,52,88,207,                       //vaddps        %ymm15,%ymm9,%ymm9
+  196,65,44,89,213,                       //vmulps        %ymm13,%ymm10,%ymm10
+  196,65,44,88,201,                       //vaddps        %ymm9,%ymm10,%ymm9
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  196,98,125,24,82,88,                    //vbroadcastss  0x58(%rdx),%ymm10
+  196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0
+  196,195,53,74,195,0,                    //vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
+  197,124,82,201,                         //vrsqrtps      %ymm1,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,65,12,89,219,                       //vmulps        %ymm11,%ymm14,%ymm11
+  196,65,4,88,219,                        //vaddps        %ymm11,%ymm15,%ymm11
+  196,65,20,89,201,                       //vmulps        %ymm9,%ymm13,%ymm9
+  196,65,52,88,203,                       //vaddps        %ymm11,%ymm9,%ymm9
+  197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
+  196,195,53,74,203,16,                   //vblendvps     %ymm1,%ymm11,%ymm9,%ymm1
+  197,124,82,202,                         //vrsqrtps      %ymm2,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,65,12,89,219,                       //vmulps        %ymm11,%ymm14,%ymm11
+  196,65,4,88,219,                        //vaddps        %ymm11,%ymm15,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,65,20,89,201,                       //vmulps        %ymm9,%ymm13,%ymm9
+  196,65,52,88,203,                       //vaddps        %ymm11,%ymm9,%ymm9
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
+  196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_avx[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,65,                                 //jne           478 <_sk_scale_u8_avx+0x51>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
+  196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
+  196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
+  196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           480 <_sk_scale_u8_avx+0x59>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  235,158,                                //jmp           43b <_sk_scale_u8_avx+0x14>
+};
+
+CODE const uint8_t sk_lerp_1_float_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_avx[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,101,                                //jne           551 <_sk_lerp_u8_avx+0x75>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
+  196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
+  196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
+  196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           559 <_sk_lerp_u8_avx+0x7d>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  233,119,255,255,255,                    //jmpq          4f0 <_sk_lerp_u8_avx+0x14>
+};
+
+CODE const uint8_t sk_lerp_565_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  15,133,148,0,0,0,                       //jne           61b <_sk_lerp_565_avx+0xa2>
+  196,65,122,111,4,122,                   //vmovdqu       (%r10,%rdi,2),%xmm8
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  197,185,105,219,                        //vpunpckhwd    %xmm3,%xmm8,%xmm3
+  196,66,121,51,192,                      //vpmovzxwd     %xmm8,%xmm8
+  196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
+  196,98,125,24,66,104,                   //vbroadcastss  0x68(%rdx),%ymm8
+  197,60,84,195,                          //vandps        %ymm3,%ymm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,116,                   //vbroadcastss  0x74(%rdx),%ymm9
+  196,65,52,89,192,                       //vmulps        %ymm8,%ymm9,%ymm8
+  196,98,125,24,74,108,                   //vbroadcastss  0x6c(%rdx),%ymm9
+  197,52,84,203,                          //vandps        %ymm3,%ymm9,%ymm9
+  196,65,124,91,201,                      //vcvtdq2ps     %ymm9,%ymm9
+  196,98,125,24,82,120,                   //vbroadcastss  0x78(%rdx),%ymm10
+  196,65,44,89,201,                       //vmulps        %ymm9,%ymm10,%ymm9
+  196,98,125,24,82,112,                   //vbroadcastss  0x70(%rdx),%ymm10
+  197,172,84,219,                         //vandps        %ymm3,%ymm10,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,82,124,                   //vbroadcastss  0x7c(%rdx),%ymm10
+  197,172,89,219,                         //vmulps        %ymm3,%ymm10,%ymm3
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,193,116,89,201,                     //vmulps        %ymm9,%ymm1,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  196,65,57,239,192,                      //vpxor         %xmm8,%xmm8,%xmm8
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  15,135,85,255,255,255,                  //ja            58d <_sk_lerp_565_avx+0x14>
+  76,141,13,73,0,0,0,                     //lea           0x49(%rip),%r9        # 688 <_sk_lerp_565_avx+0x10f>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  196,65,97,196,68,122,12,6,              //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
+  196,65,57,196,68,122,10,5,              //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,8,4,               //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,6,3,               //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,4,2,               //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,2,1,               //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,4,122,0,                  //vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
+  233,5,255,255,255,                      //jmpq          58d <_sk_lerp_565_avx+0x14>
+  244,                                    //hlt
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  236,                                    //in            (%dx),%al
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,228,                                //jmpq          *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  220,255,                                //fdivr         %st,%st(7)
+  255,                                    //(bad)
+  255,212,                                //callq         *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,204,                                //dec           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,192,                                //inc           %eax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_tables_avx[] = {
+  85,                                     //push          %rbp
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  83,                                     //push          %rbx
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,0,                               //mov           (%rax),%r8
+  72,133,201,                             //test          %rcx,%rcx
+  15,133,18,2,0,0,                        //jne           8ce <_sk_load_tables_avx+0x22a>
+  196,65,124,16,4,184,                    //vmovups       (%r8,%rdi,4),%ymm8
+  196,98,125,24,74,16,                    //vbroadcastss  0x10(%rdx),%ymm9
+  196,193,52,84,192,                      //vandps        %ymm8,%ymm9,%ymm0
+  196,193,249,126,193,                    //vmovq         %xmm0,%r9
+  69,137,203,                             //mov           %r9d,%r11d
+  196,195,249,22,194,1,                   //vpextrq       $0x1,%xmm0,%r10
+  69,137,214,                             //mov           %r10d,%r14d
+  73,193,234,32,                          //shr           $0x20,%r10
+  73,193,233,32,                          //shr           $0x20,%r9
+  196,227,125,25,192,1,                   //vextractf128  $0x1,%ymm0,%xmm0
+  196,193,249,126,196,                    //vmovq         %xmm0,%r12
+  69,137,231,                             //mov           %r12d,%r15d
+  196,227,249,22,195,1,                   //vpextrq       $0x1,%xmm0,%rbx
+  65,137,221,                             //mov           %ebx,%r13d
+  72,193,235,32,                          //shr           $0x20,%rbx
+  73,193,236,32,                          //shr           $0x20,%r12
+  72,139,104,8,                           //mov           0x8(%rax),%rbp
+  76,139,64,16,                           //mov           0x10(%rax),%r8
+  196,161,122,16,68,189,0,                //vmovss        0x0(%rbp,%r15,4),%xmm0
+  196,163,121,33,68,165,0,16,             //vinsertps     $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
+  196,163,121,33,68,173,0,32,             //vinsertps     $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0
+  197,250,16,76,157,0,                    //vmovss        0x0(%rbp,%rbx,4),%xmm1
+  196,227,121,33,193,48,                  //vinsertps     $0x30,%xmm1,%xmm0,%xmm0
+  196,161,122,16,76,157,0,                //vmovss        0x0(%rbp,%r11,4),%xmm1
+  196,163,113,33,76,141,0,16,             //vinsertps     $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
+  196,163,113,33,76,181,0,32,             //vinsertps     $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1
+  196,161,122,16,92,149,0,                //vmovss        0x0(%rbp,%r10,4),%xmm3
+  196,227,113,33,203,48,                  //vinsertps     $0x30,%xmm3,%xmm1,%xmm1
+  196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
+  196,193,113,114,208,8,                  //vpsrld        $0x8,%xmm8,%xmm1
+  196,67,125,25,194,1,                    //vextractf128  $0x1,%ymm8,%xmm10
+  196,193,105,114,210,8,                  //vpsrld        $0x8,%xmm10,%xmm2
+  196,227,117,24,202,1,                   //vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  197,180,84,201,                         //vandps        %ymm1,%ymm9,%ymm1
+  196,193,249,126,201,                    //vmovq         %xmm1,%r9
+  69,137,203,                             //mov           %r9d,%r11d
+  196,195,249,22,202,1,                   //vpextrq       $0x1,%xmm1,%r10
+  69,137,214,                             //mov           %r10d,%r14d
+  73,193,234,32,                          //shr           $0x20,%r10
+  73,193,233,32,                          //shr           $0x20,%r9
+  196,227,125,25,201,1,                   //vextractf128  $0x1,%ymm1,%xmm1
+  196,225,249,126,205,                    //vmovq         %xmm1,%rbp
+  65,137,239,                             //mov           %ebp,%r15d
+  196,227,249,22,203,1,                   //vpextrq       $0x1,%xmm1,%rbx
+  65,137,220,                             //mov           %ebx,%r12d
+  72,193,235,32,                          //shr           $0x20,%rbx
+  72,193,237,32,                          //shr           $0x20,%rbp
+  196,129,122,16,12,184,                  //vmovss        (%r8,%r15,4),%xmm1
+  196,195,113,33,12,168,16,               //vinsertps     $0x10,(%r8,%rbp,4),%xmm1,%xmm1
+  196,129,122,16,20,160,                  //vmovss        (%r8,%r12,4),%xmm2
+  196,227,113,33,202,32,                  //vinsertps     $0x20,%xmm2,%xmm1,%xmm1
+  196,193,122,16,20,152,                  //vmovss        (%r8,%rbx,4),%xmm2
+  196,227,113,33,202,48,                  //vinsertps     $0x30,%xmm2,%xmm1,%xmm1
+  196,129,122,16,20,152,                  //vmovss        (%r8,%r11,4),%xmm2
+  196,131,105,33,20,136,16,               //vinsertps     $0x10,(%r8,%r9,4),%xmm2,%xmm2
+  196,129,122,16,28,176,                  //vmovss        (%r8,%r14,4),%xmm3
+  196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  196,129,122,16,28,144,                  //vmovss        (%r8,%r10,4),%xmm3
+  196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  196,193,105,114,208,16,                 //vpsrld        $0x10,%xmm8,%xmm2
+  196,193,97,114,210,16,                  //vpsrld        $0x10,%xmm10,%xmm3
+  196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
+  197,180,84,210,                         //vandps        %ymm2,%ymm9,%ymm2
+  196,193,249,126,208,                    //vmovq         %xmm2,%r8
+  69,137,194,                             //mov           %r8d,%r10d
+  196,195,249,22,209,1,                   //vpextrq       $0x1,%xmm2,%r9
+  69,137,203,                             //mov           %r9d,%r11d
+  73,193,233,32,                          //shr           $0x20,%r9
+  73,193,232,32,                          //shr           $0x20,%r8
+  196,227,125,25,210,1,                   //vextractf128  $0x1,%ymm2,%xmm2
+  196,225,249,126,213,                    //vmovq         %xmm2,%rbp
+  65,137,238,                             //mov           %ebp,%r14d
+  196,227,249,22,211,1,                   //vpextrq       $0x1,%xmm2,%rbx
+  65,137,223,                             //mov           %ebx,%r15d
+  72,193,235,32,                          //shr           $0x20,%rbx
+  72,193,237,32,                          //shr           $0x20,%rbp
+  196,161,122,16,20,176,                  //vmovss        (%rax,%r14,4),%xmm2
+  196,227,105,33,20,168,16,               //vinsertps     $0x10,(%rax,%rbp,4),%xmm2,%xmm2
+  196,161,122,16,28,184,                  //vmovss        (%rax,%r15,4),%xmm3
+  196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  197,250,16,28,152,                      //vmovss        (%rax,%rbx,4),%xmm3
+  196,99,105,33,203,48,                   //vinsertps     $0x30,%xmm3,%xmm2,%xmm9
+  196,161,122,16,28,144,                  //vmovss        (%rax,%r10,4),%xmm3
+  196,163,97,33,28,128,16,                //vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
+  196,161,122,16,20,152,                  //vmovss        (%rax,%r11,4),%xmm2
+  196,227,97,33,210,32,                   //vinsertps     $0x20,%xmm2,%xmm3,%xmm2
+  196,161,122,16,28,136,                  //vmovss        (%rax,%r9,4),%xmm3
+  196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  196,195,109,24,209,1,                   //vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
+  196,193,57,114,208,24,                  //vpsrld        $0x18,%xmm8,%xmm8
+  196,193,97,114,210,24,                  //vpsrld        $0x18,%xmm10,%xmm3
+  196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  91,                                     //pop           %rbx
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  93,                                     //pop           %rbp
+  255,224,                                //jmpq          *%rax
+  65,137,201,                             //mov           %ecx,%r9d
+  65,128,225,7,                           //and           $0x7,%r9b
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  65,254,201,                             //dec           %r9b
+  69,15,182,201,                          //movzbl        %r9b,%r9d
+  65,128,249,6,                           //cmp           $0x6,%r9b
+  15,135,215,253,255,255,                 //ja            6c2 <_sk_load_tables_avx+0x1e>
+  76,141,21,138,0,0,0,                    //lea           0x8a(%rip),%r10        # 97c <_sk_load_tables_avx+0x2d8>
+  79,99,12,138,                           //movslq        (%r10,%r9,4),%r9
+  77,1,209,                               //add           %r10,%r9
+  65,255,225,                             //jmpq          *%r9
+  196,193,121,110,68,184,24,              //vmovd         0x18(%r8,%rdi,4),%xmm0
+  197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  196,99,117,12,192,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm8
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,195,121,34,68,184,20,1,             //vpinsrd       $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
+  196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,195,121,34,68,184,16,0,             //vpinsrd       $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
+  196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
+  196,195,57,34,68,184,12,3,              //vpinsrd       $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  196,195,57,34,68,184,8,2,               //vpinsrd       $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  196,195,57,34,68,184,4,1,               //vpinsrd       $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  196,195,57,34,4,184,0,                  //vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  233,70,253,255,255,                     //jmpq          6c2 <_sk_load_tables_avx+0x1e>
+  238,                                    //out           %al,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,224,                                //jmpq          *%rax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,196,                                //inc           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+  128,255,255,                            //cmp           $0xff,%bh
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_a8_avx[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,59,                                 //jne           9e3 <_sk_load_a8_avx+0x4b>
+  197,251,16,0,                           //vmovsd        (%rax),%xmm0
+  196,226,121,49,200,                     //vpmovzxbd     %xmm0,%xmm1
+  196,227,121,4,192,229,                  //vpermilps     $0xe5,%xmm0,%xmm0
+  196,226,121,49,192,                     //vpmovzxbd     %xmm0,%xmm0
+  196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,12,                   //vbroadcastss  0xc(%rdx),%ymm1
+  197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           9eb <_sk_load_a8_avx+0x53>
+  196,193,249,110,193,                    //vmovq         %r9,%xmm0
+  235,164,                                //jmp           9ac <_sk_load_a8_avx+0x14>
+};
+
+CODE const uint8_t sk_store_a8_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           a3b <_sk_store_a8_avx+0x33>
+  196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            a37 <_sk_store_a8_avx+0x2f>
+  196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
+  76,141,21,69,0,0,0,                     //lea           0x45(%rip),%r10        # a9c <_sk_store_a8_avx+0x94>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
+  196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
+  196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
+  196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
+  196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
+  196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
+  196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
+  235,158,                                //jmp           a37 <_sk_store_a8_avx+0x2f>
+  15,31,0,                                //nopl          (%rax)
+  244,                                    //hlt
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  236,                                    //in            (%dx),%al
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,228,                                //jmpq          *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  220,255,                                //fdivr         %st,%st(7)
+  255,                                    //(bad)
+  255,212,                                //callq         *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,204,                                //dec           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,196,                                //inc           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_565_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,106,                                //jne           b2c <_sk_load_565_avx+0x74>
+  196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
+  197,241,239,201,                        //vpxor         %xmm1,%xmm1,%xmm1
+  197,249,105,201,                        //vpunpckhwd    %xmm1,%xmm0,%xmm1
+  196,226,121,51,192,                     //vpmovzxwd     %xmm0,%xmm0
+  196,227,125,24,209,1,                   //vinsertf128   $0x1,%xmm1,%ymm0,%ymm2
+  196,226,125,24,66,104,                  //vbroadcastss  0x68(%rdx),%ymm0
+  197,252,84,194,                         //vandps        %ymm2,%ymm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,116,                  //vbroadcastss  0x74(%rdx),%ymm1
+  197,244,89,192,                         //vmulps        %ymm0,%ymm1,%ymm0
+  196,226,125,24,74,108,                  //vbroadcastss  0x6c(%rdx),%ymm1
+  197,244,84,202,                         //vandps        %ymm2,%ymm1,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  196,226,125,24,90,120,                  //vbroadcastss  0x78(%rdx),%ymm3
+  197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
+  196,226,125,24,90,112,                  //vbroadcastss  0x70(%rdx),%ymm3
+  197,228,84,210,                         //vandps        %ymm2,%ymm3,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  196,226,125,24,90,124,                  //vbroadcastss  0x7c(%rdx),%ymm3
+  197,228,89,210,                         //vmulps        %ymm2,%ymm3,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,132,                                //ja            ac8 <_sk_load_565_avx+0x10>
+  76,141,13,73,0,0,0,                     //lea           0x49(%rip),%r9        # b94 <_sk_load_565_avx+0xdc>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
+  233,52,255,255,255,                     //jmpq          ac8 <_sk_load_565_avx+0x10>
+  244,                                    //hlt
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  236,                                    //in            (%dx),%al
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,228,                                //jmpq          *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  220,255,                                //fdivr         %st,%st(7)
+  255,                                    //(bad)
+  255,212,                                //callq         *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,204,                                //dec           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,192,                                //inc           %eax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_store_565_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,130,128,0,0,0,            //vbroadcastss  0x80(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  196,193,41,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm10
+  196,67,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm9
+  196,193,49,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm9
+  196,67,45,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm9
+  196,98,125,24,146,132,0,0,0,            //vbroadcastss  0x84(%rdx),%ymm10
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,33,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm11
+  196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
+  196,193,41,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm10
+  196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           c36 <_sk_store_565_avx+0x86>
+  196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            c32 <_sk_store_565_avx+0x82>
+  76,141,21,71,0,0,0,                     //lea           0x47(%rip),%r10        # c94 <_sk_store_565_avx+0xe4>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
+  196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
+  196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
+  196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
+  196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
+  196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
+  197,121,126,192,                        //vmovd         %xmm8,%eax
+  102,65,137,4,121,                       //mov           %ax,(%r9,%rdi,2)
+  235,161,                                //jmp           c32 <_sk_store_565_avx+0x82>
+  15,31,0,                                //nopl          (%rax)
+  242,255,                                //repnz         (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  234,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,226,                                //jmpq          *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  218,255,                                //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,202,                                //dec           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,194,                                //inc           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_8888_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,125,                                //jne           d37 <_sk_load_8888_avx+0x87>
+  196,65,124,16,12,186,                   //vmovups       (%r10,%rdi,4),%ymm9
+  196,98,125,24,90,16,                    //vbroadcastss  0x10(%rdx),%ymm11
+  196,193,36,84,193,                      //vandps        %ymm9,%ymm11,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  196,193,41,114,209,8,                   //vpsrld        $0x8,%xmm9,%xmm10
+  196,99,125,25,203,1,                    //vextractf128  $0x1,%ymm9,%xmm3
+  197,241,114,211,8,                      //vpsrld        $0x8,%xmm3,%xmm1
+  196,227,45,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
+  197,164,84,201,                         //vandps        %ymm1,%ymm11,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  196,193,41,114,209,16,                  //vpsrld        $0x10,%xmm9,%xmm10
+  197,233,114,211,16,                     //vpsrld        $0x10,%xmm3,%xmm2
+  196,227,45,24,210,1,                    //vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
+  197,164,84,210,                         //vandps        %ymm2,%ymm11,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  196,193,49,114,209,24,                  //vpsrld        $0x18,%xmm9,%xmm9
+  197,225,114,211,24,                     //vpsrld        $0x18,%xmm3,%xmm3
+  196,227,53,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  196,65,52,87,201,                       //vxorps        %ymm9,%ymm9,%ymm9
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  15,135,108,255,255,255,                 //ja            cc0 <_sk_load_8888_avx+0x10>
+  76,141,13,137,0,0,0,                    //lea           0x89(%rip),%r9        # de4 <_sk_load_8888_avx+0x134>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  196,193,121,110,68,186,24,              //vmovd         0x18(%r10,%rdi,4),%xmm0
+  197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  196,99,117,12,200,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm9
+  196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
+  196,195,121,34,68,186,20,1,             //vpinsrd       $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
+  196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
+  196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
+  196,195,121,34,68,186,16,0,             //vpinsrd       $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
+  196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
+  196,195,49,34,68,186,12,3,              //vpinsrd       $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  196,195,49,34,68,186,8,2,               //vpinsrd       $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  196,195,49,34,68,186,4,1,               //vpinsrd       $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  196,195,49,34,4,186,0,                  //vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  233,220,254,255,255,                    //jmpq          cc0 <_sk_load_8888_avx+0x10>
+  238,                                    //out           %al,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,224,                                //jmpq          *%rax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,196,                                //inc           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+  128,255,255,                            //cmp           $0xff,%bh
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_store_8888_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,33,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm11
+  196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
+  196,193,41,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm10
+  196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
+  197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,33,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm11
+  196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
+  196,193,41,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm10
+  196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,193,33,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm11
+  196,67,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm8
+  196,193,57,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm8
+  196,67,37,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm11,%ymm8
+  196,65,45,86,192,                       //vorpd         %ymm8,%ymm10,%ymm8
+  196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           e95 <_sk_store_8888_avx+0x95>
+  196,65,124,17,4,185,                    //vmovups       %ymm8,(%r9,%rdi,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            e91 <_sk_store_8888_avx+0x91>
+  76,141,21,84,0,0,0,                     //lea           0x54(%rip),%r10        # f00 <_sk_store_8888_avx+0x100>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,67,121,22,76,185,24,2,              //vpextrd       $0x2,%xmm9,0x18(%r9,%rdi,4)
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,67,121,22,76,185,20,1,              //vpextrd       $0x1,%xmm9,0x14(%r9,%rdi,4)
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,65,121,126,76,185,16,               //vmovd         %xmm9,0x10(%r9,%rdi,4)
+  196,67,121,22,68,185,12,3,              //vpextrd       $0x3,%xmm8,0xc(%r9,%rdi,4)
+  196,67,121,22,68,185,8,2,               //vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
+  196,67,121,22,68,185,4,1,               //vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
+  196,65,121,126,4,185,                   //vmovd         %xmm8,(%r9,%rdi,4)
+  235,147,                                //jmp           e91 <_sk_store_8888_avx+0x91>
+  102,144,                                //xchg          %ax,%ax
+  246,255,                                //idiv          %bh
+  255,                                    //(bad)
+  255,                                    //(bad)
+  238,                                    //out           %al,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,230,                                //jmpq          *%rsi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  222,255,                                //fdivrp        %st,%st(7)
+  255,                                    //(bad)
+  255,209,                                //callq         *%rcx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,195,                                //inc           %ebx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+  181,255,                                //mov           $0xff,%ch
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_f16_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,133,201,                             //test          %rcx,%rcx
+  15,133,240,0,0,0,                       //jne           101a <_sk_load_f16_avx+0xfe>
+  197,249,16,12,248,                      //vmovupd       (%rax,%rdi,8),%xmm1
+  197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
+  197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
+  197,121,16,68,248,48,                   //vmovupd       0x30(%rax,%rdi,8),%xmm8
+  197,241,97,194,                         //vpunpcklwd    %xmm2,%xmm1,%xmm0
+  197,241,105,202,                        //vpunpckhwd    %xmm2,%xmm1,%xmm1
+  196,193,97,97,208,                      //vpunpcklwd    %xmm8,%xmm3,%xmm2
+  196,193,97,105,216,                     //vpunpckhwd    %xmm8,%xmm3,%xmm3
+  197,121,97,193,                         //vpunpcklwd    %xmm1,%xmm0,%xmm8
+  197,249,105,193,                        //vpunpckhwd    %xmm1,%xmm0,%xmm0
+  197,233,97,203,                         //vpunpcklwd    %xmm3,%xmm2,%xmm1
+  197,105,105,203,                        //vpunpckhwd    %xmm3,%xmm2,%xmm9
+  197,249,110,90,100,                     //vmovd         0x64(%rdx),%xmm3
+  197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
+  196,193,97,101,208,                     //vpcmpgtw      %xmm8,%xmm3,%xmm2
+  196,65,105,223,192,                     //vpandn        %xmm8,%xmm2,%xmm8
+  197,225,101,208,                        //vpcmpgtw      %xmm0,%xmm3,%xmm2
+  197,233,223,192,                        //vpandn        %xmm0,%xmm2,%xmm0
+  197,225,101,209,                        //vpcmpgtw      %xmm1,%xmm3,%xmm2
+  197,233,223,201,                        //vpandn        %xmm1,%xmm2,%xmm1
+  196,193,97,101,209,                     //vpcmpgtw      %xmm9,%xmm3,%xmm2
+  196,193,105,223,209,                    //vpandn        %xmm9,%xmm2,%xmm2
+  196,66,121,51,208,                      //vpmovzxwd     %xmm8,%xmm10
+  196,98,121,51,201,                      //vpmovzxwd     %xmm1,%xmm9
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  197,57,105,195,                         //vpunpckhwd    %xmm3,%xmm8,%xmm8
+  197,241,105,203,                        //vpunpckhwd    %xmm3,%xmm1,%xmm1
+  196,98,121,51,216,                      //vpmovzxwd     %xmm0,%xmm11
+  196,98,121,51,226,                      //vpmovzxwd     %xmm2,%xmm12
+  197,121,105,235,                        //vpunpckhwd    %xmm3,%xmm0,%xmm13
+  197,105,105,243,                        //vpunpckhwd    %xmm3,%xmm2,%xmm14
+  196,193,121,114,242,13,                 //vpslld        $0xd,%xmm10,%xmm0
+  196,193,105,114,241,13,                 //vpslld        $0xd,%xmm9,%xmm2
+  196,227,125,24,194,1,                   //vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
+  196,98,125,24,74,92,                    //vbroadcastss  0x5c(%rdx),%ymm9
+  197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
+  196,193,105,114,240,13,                 //vpslld        $0xd,%xmm8,%xmm2
+  197,241,114,241,13,                     //vpslld        $0xd,%xmm1,%xmm1
+  196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
+  197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
+  196,193,105,114,243,13,                 //vpslld        $0xd,%xmm11,%xmm2
+  196,193,97,114,244,13,                  //vpslld        $0xd,%xmm12,%xmm3
+  196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
+  197,180,89,210,                         //vmulps        %ymm2,%ymm9,%ymm2
+  196,193,57,114,245,13,                  //vpslld        $0xd,%xmm13,%xmm8
+  196,193,97,114,246,13,                  //vpslld        $0xd,%xmm14,%xmm3
+  196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
+  197,180,89,219,                         //vmulps        %ymm3,%ymm9,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,251,16,12,248,                      //vmovsd        (%rax,%rdi,8),%xmm1
+  196,65,57,87,192,                       //vxorpd        %xmm8,%xmm8,%xmm8
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  117,6,                                  //jne           1030 <_sk_load_f16_avx+0x114>
+  197,250,126,201,                        //vmovq         %xmm1,%xmm1
+  235,30,                                 //jmp           104e <_sk_load_f16_avx+0x132>
+  197,241,22,76,248,8,                    //vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,18,                                 //jb            104e <_sk_load_f16_avx+0x132>
+  197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  117,19,                                 //jne           105b <_sk_load_f16_avx+0x13f>
+  197,250,126,210,                        //vmovq         %xmm2,%xmm2
+  235,46,                                 //jmp           107c <_sk_load_f16_avx+0x160>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
+  233,230,254,255,255,                    //jmpq          f41 <_sk_load_f16_avx+0x25>
+  197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,21,                                 //jb            107c <_sk_load_f16_avx+0x160>
+  197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  117,18,                                 //jne           1085 <_sk_load_f16_avx+0x169>
+  197,250,126,219,                        //vmovq         %xmm3,%xmm3
+  233,197,254,255,255,                    //jmpq          f41 <_sk_load_f16_avx+0x25>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  233,188,254,255,255,                    //jmpq          f41 <_sk_load_f16_avx+0x25>
+  197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  15,130,172,254,255,255,                 //jb            f41 <_sk_load_f16_avx+0x25>
+  197,123,16,68,248,48,                   //vmovsd        0x30(%rax,%rdi,8),%xmm8
+  233,161,254,255,255,                    //jmpq          f41 <_sk_load_f16_avx+0x25>
+};
+
+CODE const uint8_t sk_store_f16_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  196,98,125,24,66,96,                    //vbroadcastss  0x60(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,67,125,25,202,1,                    //vextractf128  $0x1,%ymm9,%xmm10
+  196,193,41,114,210,13,                  //vpsrld        $0xd,%xmm10,%xmm10
+  196,193,49,114,209,13,                  //vpsrld        $0xd,%xmm9,%xmm9
+  197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
+  196,67,125,25,220,1,                    //vextractf128  $0x1,%ymm11,%xmm12
+  196,193,25,114,212,13,                  //vpsrld        $0xd,%xmm12,%xmm12
+  196,193,33,114,211,13,                  //vpsrld        $0xd,%xmm11,%xmm11
+  197,60,89,234,                          //vmulps        %ymm2,%ymm8,%ymm13
+  196,67,125,25,238,1,                    //vextractf128  $0x1,%ymm13,%xmm14
+  196,193,9,114,214,13,                   //vpsrld        $0xd,%xmm14,%xmm14
+  196,193,17,114,213,13,                  //vpsrld        $0xd,%xmm13,%xmm13
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,67,125,25,199,1,                    //vextractf128  $0x1,%ymm8,%xmm15
+  196,193,1,114,215,13,                   //vpsrld        $0xd,%xmm15,%xmm15
+  196,193,57,114,208,13,                  //vpsrld        $0xd,%xmm8,%xmm8
+  196,193,33,115,251,2,                   //vpslldq       $0x2,%xmm11,%xmm11
+  196,65,33,235,201,                      //vpor          %xmm9,%xmm11,%xmm9
+  196,193,33,115,252,2,                   //vpslldq       $0x2,%xmm12,%xmm11
+  196,65,33,235,226,                      //vpor          %xmm10,%xmm11,%xmm12
+  196,193,57,115,248,2,                   //vpslldq       $0x2,%xmm8,%xmm8
+  196,65,57,235,197,                      //vpor          %xmm13,%xmm8,%xmm8
+  196,193,41,115,255,2,                   //vpslldq       $0x2,%xmm15,%xmm10
+  196,65,41,235,238,                      //vpor          %xmm14,%xmm10,%xmm13
+  196,65,49,98,216,                       //vpunpckldq    %xmm8,%xmm9,%xmm11
+  196,65,49,106,208,                      //vpunpckhdq    %xmm8,%xmm9,%xmm10
+  196,65,25,98,205,                       //vpunpckldq    %xmm13,%xmm12,%xmm9
+  196,65,25,106,197,                      //vpunpckhdq    %xmm13,%xmm12,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,27,                                 //jne           1163 <_sk_store_f16_avx+0xc3>
+  197,120,17,28,248,                      //vmovups       %xmm11,(%rax,%rdi,8)
+  197,120,17,84,248,16,                   //vmovups       %xmm10,0x10(%rax,%rdi,8)
+  197,120,17,76,248,32,                   //vmovups       %xmm9,0x20(%rax,%rdi,8)
+  197,122,127,68,248,48,                  //vmovdqu       %xmm8,0x30(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,121,214,28,248,                     //vmovq         %xmm11,(%rax,%rdi,8)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,241,                                //je            115f <_sk_store_f16_avx+0xbf>
+  197,121,23,92,248,8,                    //vmovhpd       %xmm11,0x8(%rax,%rdi,8)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,229,                                //jb            115f <_sk_store_f16_avx+0xbf>
+  197,121,214,84,248,16,                  //vmovq         %xmm10,0x10(%rax,%rdi,8)
+  116,221,                                //je            115f <_sk_store_f16_avx+0xbf>
+  197,121,23,84,248,24,                   //vmovhpd       %xmm10,0x18(%rax,%rdi,8)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,209,                                //jb            115f <_sk_store_f16_avx+0xbf>
+  197,121,214,76,248,32,                  //vmovq         %xmm9,0x20(%rax,%rdi,8)
+  116,201,                                //je            115f <_sk_store_f16_avx+0xbf>
+  197,121,23,76,248,40,                   //vmovhpd       %xmm9,0x28(%rax,%rdi,8)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,189,                                //jb            115f <_sk_store_f16_avx+0xbf>
+  197,121,214,68,248,48,                  //vmovq         %xmm8,0x30(%rax,%rdi,8)
+  235,181,                                //jmp           115f <_sk_store_f16_avx+0xbf>
+};
+
+CODE const uint8_t sk_store_f32_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,0,                               //mov           (%rax),%r8
+  72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
+  197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
+  197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
+  197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
+  197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
+  196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
+  196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
+  196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
+  196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
+  72,133,201,                             //test          %rcx,%rcx
+  117,55,                                 //jne           1217 <_sk_store_f32_avx+0x6d>
+  196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
+  196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
+  196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
+  196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,240,                                //je            1213 <_sk_store_f32_avx+0x69>
+  196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,227,                                //jb            1213 <_sk_store_f32_avx+0x69>
+  196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
+  116,218,                                //je            1213 <_sk_store_f32_avx+0x69>
+  196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,205,                                //jb            1213 <_sk_store_f32_avx+0x69>
+  196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  116,195,                                //je            1213 <_sk_store_f32_avx+0x69>
+  196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,181,                                //jb            1213 <_sk_store_f32_avx+0x69>
+  196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  235,171,                                //jmp           1213 <_sk_store_f32_avx+0x69>
+};
+
+CODE const uint8_t sk_clamp_x_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,95,200,                          //vmaxps        %ymm0,%ymm8,%ymm9
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
+  197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,95,201,                          //vmaxps        %ymm1,%ymm8,%ymm9
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
+  197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
+  196,65,124,92,201,                      //vsubps        %ymm9,%ymm0,%ymm9
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
+  197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
+  196,65,116,92,201,                      //vsubps        %ymm9,%ymm1,%ymm9
+  196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
+  197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
+  196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
+  196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
+  196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
+  196,227,121,4,192,0,                    //vpermilps     $0x0,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,172,92,192,                         //vsubps        %ymm0,%ymm10,%ymm0
+  196,193,124,92,193,                     //vsubps        %ymm9,%ymm0,%ymm0
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
+  197,60,84,192,                          //vandps        %ymm0,%ymm8,%ymm8
+  196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
+  196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
+  196,227,53,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm9,%ymm0
+  197,188,93,192,                         //vminps        %ymm0,%ymm8,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
+  196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
+  196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
+  196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
+  196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
+  196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
+  197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,172,92,201,                         //vsubps        %ymm1,%ymm10,%ymm1
+  196,193,116,92,201,                     //vsubps        %ymm9,%ymm1,%ymm1
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
+  197,60,84,193,                          //vandps        %ymm1,%ymm8,%ymm8
+  196,99,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm1
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
+  196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
+  196,227,53,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm9,%ymm1
+  197,188,93,201,                         //vminps        %ymm1,%ymm8,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,8,                     //vbroadcastss  0x8(%rax),%ymm9
+  196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
+  197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
+  196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  197,172,89,201,                         //vmulps        %ymm1,%ymm10,%ymm1
+  196,193,116,88,203,                     //vaddps        %ymm11,%ymm1,%ymm1
+  197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
+  197,252,88,201,                         //vaddps        %ymm1,%ymm0,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
+  196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
+  196,98,125,24,88,36,                    //vbroadcastss  0x24(%rax),%ymm11
+  197,44,89,210,                          //vmulps        %ymm2,%ymm10,%ymm10
+  196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
+  197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
+  196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
+  196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
+  196,98,125,24,96,40,                    //vbroadcastss  0x28(%rax),%ymm12
+  197,36,89,218,                          //vmulps        %ymm2,%ymm11,%ymm11
+  196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
+  197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
+  196,98,125,24,104,44,                   //vbroadcastss  0x2c(%rax),%ymm13
+  197,156,89,210,                         //vmulps        %ymm2,%ymm12,%ymm2
+  196,193,108,88,213,                     //vaddps        %ymm13,%ymm2,%ymm2
+  197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
+  197,244,88,202,                         //vaddps        %ymm2,%ymm1,%ymm1
+  197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
+  197,252,88,209,                         //vaddps        %ymm1,%ymm0,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  197,124,41,201,                         //vmovaps       %ymm9,%ymm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
+  196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
+  196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
+  196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
+  197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
+  196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
+  196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
+  197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
+  196,193,116,88,204,                     //vaddps        %ymm12,%ymm1,%ymm1
+  197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
+  197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
+  197,252,83,200,                         //vrcpps        %ymm0,%ymm1
+  197,188,89,193,                         //vmulps        %ymm1,%ymm8,%ymm0
+  197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
+  196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
+  197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
+  197,108,88,193,                         //vaddps        %ymm1,%ymm2,%ymm8
+  196,226,125,24,72,20,                   //vbroadcastss  0x14(%rax),%ymm1
+  196,226,125,24,80,4,                    //vbroadcastss  0x4(%rax),%ymm2
+  197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
+  197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
+  196,226,125,24,80,24,                   //vbroadcastss  0x18(%rax),%ymm2
+  196,226,125,24,88,8,                    //vbroadcastss  0x8(%rax),%ymm3
+  197,236,89,208,                         //vmulps        %ymm0,%ymm2,%ymm2
+  197,228,88,210,                         //vaddps        %ymm2,%ymm3,%ymm2
+  196,226,125,24,88,28,                   //vbroadcastss  0x1c(%rax),%ymm3
+  196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
+  197,228,89,192,                         //vmulps        %ymm0,%ymm3,%ymm0
+  197,180,88,216,                         //vaddps        %ymm0,%ymm9,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_start_pipeline_sse41[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  83,                                     //push          %rbx
+  73,137,207,                             //mov           %rcx,%r15
+  73,137,214,                             //mov           %rdx,%r14
+  72,137,251,                             //mov           %rdi,%rbx
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,196,                             //mov           %rax,%r12
+  73,137,245,                             //mov           %rsi,%r13
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  76,57,248,                              //cmp           %r15,%rax
+  118,5,                                  //jbe           28 <_sk_start_pipeline_sse41+0x28>
+  72,137,216,                             //mov           %rbx,%rax
+  235,52,                                 //jmp           5c <_sk_start_pipeline_sse41+0x5c>
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,238,                             //mov           %r13,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,212,                             //callq         *%r12
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  72,131,195,8,                           //add           $0x8,%rbx
+  76,57,251,                              //cmp           %r15,%rbx
+  72,137,195,                             //mov           %rax,%rbx
+  118,204,                                //jbe           28 <_sk_start_pipeline_sse41+0x28>
+  91,                                     //pop           %rbx
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_sse41[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  102,15,110,199,                         //movd          %edi,%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,18,                           //movss         (%rdx),%xmm2
+  243,15,16,90,4,                         //movss         0x4(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,16,66,20,                            //movups        0x14(%rdx),%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  102,15,110,8,                           //movd          (%rax),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,16,24,                               //movups        (%rax),%xmm3
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__sse41[] = {
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,195,                           //subps         %xmm3,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,204,                           //mulps         %xmm4,%xmm9
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,205,                           //mulps         %xmm5,%xmm9
+  65,15,88,201,                           //addps         %xmm9,%xmm1
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,206,                           //mulps         %xmm6,%xmm9
+  65,15,88,209,                           //addps         %xmm9,%xmm2
+  68,15,89,199,                           //mulps         %xmm7,%xmm8
+  65,15,88,216,                           //addps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,199,                           //subps         %xmm7,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_sse41[] = {
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  65,15,95,192,                           //maxps         %xmm8,%xmm0
+  65,15,95,200,                           //maxps         %xmm8,%xmm1
+  65,15,95,208,                           //maxps         %xmm8,%xmm2
+  65,15,95,216,                           //maxps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,192,                           //minps         %xmm8,%xmm0
+  65,15,93,200,                           //minps         %xmm8,%xmm1
+  65,15,93,208,                           //minps         %xmm8,%xmm2
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  15,93,195,                              //minps         %xmm3,%xmm0
+  15,93,203,                              //minps         %xmm3,%xmm1
+  15,93,211,                              //minps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_sse41[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,194,                              //movaps        %xmm2,%xmm0
+  65,15,40,208,                           //movaps        %xmm8,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_sse41[] = {
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  68,15,40,216,                           //movaps        %xmm0,%xmm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  65,15,40,227,                           //movaps        %xmm11,%xmm4
+  65,15,40,234,                           //movaps        %xmm10,%xmm5
+  65,15,40,241,                           //movaps        %xmm9,%xmm6
+  65,15,40,248,                           //movaps        %xmm8,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,224,                              //movaps        %xmm0,%xmm4
+  15,40,233,                              //movaps        %xmm1,%xmm5
+  15,40,242,                              //movaps        %xmm2,%xmm6
+  15,40,251,                              //movaps        %xmm3,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_sse41[] = {
+  15,89,195,                              //mulps         %xmm3,%xmm0
+  15,89,203,                              //mulps         %xmm3,%xmm1
+  15,89,211,                              //mulps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_sse41[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  69,15,87,201,                           //xorps         %xmm9,%xmm9
+  243,68,15,16,18,                        //movss         (%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  68,15,94,211,                           //divps         %xmm3,%xmm10
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  65,15,194,193,0,                        //cmpeqps       %xmm9,%xmm0
+  102,69,15,56,20,209,                    //blendvps      %xmm0,%xmm9,%xmm10
+  69,15,89,194,                           //mulps         %xmm10,%xmm8
+  65,15,89,202,                           //mulps         %xmm10,%xmm1
+  65,15,89,210,                           //mulps         %xmm10,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_sse41[] = {
+  68,15,40,194,                           //movaps        %xmm2,%xmm8
+  243,68,15,16,90,64,                     //movss         0x40(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,40,211,                           //movaps        %xmm11,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  68,15,40,240,                           //movaps        %xmm0,%xmm14
+  69,15,89,246,                           //mulps         %xmm14,%xmm14
+  243,15,16,82,60,                        //movss         0x3c(%rdx),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  243,68,15,16,98,52,                     //movss         0x34(%rdx),%xmm12
+  243,68,15,16,106,56,                    //movss         0x38(%rdx),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  69,15,88,205,                           //addps         %xmm13,%xmm9
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,89,206,                           //mulps         %xmm14,%xmm9
+  69,15,88,204,                           //addps         %xmm12,%xmm9
+  243,68,15,16,114,68,                    //movss         0x44(%rdx),%xmm14
+  69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
+  65,15,194,198,1,                        //cmpltps       %xmm14,%xmm0
+  102,69,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm9
+  69,15,40,251,                           //movaps        %xmm11,%xmm15
+  68,15,89,249,                           //mulps         %xmm1,%xmm15
+  15,40,193,                              //movaps        %xmm1,%xmm0
+  15,89,192,                              //mulps         %xmm0,%xmm0
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  69,15,88,213,                           //addps         %xmm13,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  69,15,88,212,                           //addps         %xmm12,%xmm10
+  65,15,194,206,1,                        //cmpltps       %xmm14,%xmm1
+  15,40,193,                              //movaps        %xmm1,%xmm0
+  102,69,15,56,20,215,                    //blendvps      %xmm0,%xmm15,%xmm10
+  69,15,89,216,                           //mulps         %xmm8,%xmm11
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  15,89,192,                              //mulps         %xmm0,%xmm0
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  65,15,88,213,                           //addps         %xmm13,%xmm2
+  15,89,208,                              //mulps         %xmm0,%xmm2
+  65,15,88,212,                           //addps         %xmm12,%xmm2
+  69,15,194,198,1,                        //cmpltps       %xmm14,%xmm8
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  102,65,15,56,20,211,                    //blendvps      %xmm0,%xmm11,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,193,                           //movaps        %xmm9,%xmm0
+  65,15,40,202,                           //movaps        %xmm10,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_sse41[] = {
+  72,131,236,24,                          //sub           $0x18,%rsp
+  15,41,60,36,                            //movaps        %xmm7,(%rsp)
+  15,40,254,                              //movaps        %xmm6,%xmm7
+  15,40,245,                              //movaps        %xmm5,%xmm6
+  15,40,236,                              //movaps        %xmm4,%xmm5
+  15,40,227,                              //movaps        %xmm3,%xmm4
+  68,15,40,194,                           //movaps        %xmm2,%xmm8
+  15,40,217,                              //movaps        %xmm1,%xmm3
+  15,82,208,                              //rsqrtps       %xmm0,%xmm2
+  68,15,83,202,                           //rcpps         %xmm2,%xmm9
+  68,15,82,210,                           //rsqrtps       %xmm2,%xmm10
+  243,15,16,18,                           //movss         (%rdx),%xmm2
+  243,68,15,16,90,72,                     //movss         0x48(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  65,15,40,203,                           //movaps        %xmm11,%xmm1
+  15,89,200,                              //mulps         %xmm0,%xmm1
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  243,68,15,16,98,76,                     //movss         0x4c(%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  243,68,15,16,106,80,                    //movss         0x50(%rdx),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  243,68,15,16,114,84,                    //movss         0x54(%rdx),%xmm14
+  69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
+  69,15,89,205,                           //mulps         %xmm13,%xmm9
+  69,15,88,206,                           //addps         %xmm14,%xmm9
+  69,15,89,212,                           //mulps         %xmm12,%xmm10
+  69,15,88,209,                           //addps         %xmm9,%xmm10
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  69,15,93,202,                           //minps         %xmm10,%xmm9
+  243,68,15,16,122,88,                    //movss         0x58(%rdx),%xmm15
+  69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
+  65,15,194,199,1,                        //cmpltps       %xmm15,%xmm0
+  102,68,15,56,20,201,                    //blendvps      %xmm0,%xmm1,%xmm9
+  15,82,195,                              //rsqrtps       %xmm3,%xmm0
+  15,83,200,                              //rcpps         %xmm0,%xmm1
+  15,82,192,                              //rsqrtps       %xmm0,%xmm0
+  65,15,89,205,                           //mulps         %xmm13,%xmm1
+  65,15,88,206,                           //addps         %xmm14,%xmm1
+  65,15,89,196,                           //mulps         %xmm12,%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,93,208,                           //minps         %xmm0,%xmm10
+  65,15,40,203,                           //movaps        %xmm11,%xmm1
+  15,89,203,                              //mulps         %xmm3,%xmm1
+  65,15,194,223,1,                        //cmpltps       %xmm15,%xmm3
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  102,68,15,56,20,209,                    //blendvps      %xmm0,%xmm1,%xmm10
+  65,15,82,192,                           //rsqrtps       %xmm8,%xmm0
+  15,83,200,                              //rcpps         %xmm0,%xmm1
+  65,15,89,205,                           //mulps         %xmm13,%xmm1
+  65,15,88,206,                           //addps         %xmm14,%xmm1
+  15,82,192,                              //rsqrtps       %xmm0,%xmm0
+  65,15,89,196,                           //mulps         %xmm12,%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  15,93,208,                              //minps         %xmm0,%xmm2
+  69,15,89,216,                           //mulps         %xmm8,%xmm11
+  69,15,194,199,1,                        //cmpltps       %xmm15,%xmm8
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  102,65,15,56,20,211,                    //blendvps      %xmm0,%xmm11,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,193,                           //movaps        %xmm9,%xmm0
+  65,15,40,202,                           //movaps        %xmm10,%xmm1
+  15,40,220,                              //movaps        %xmm4,%xmm3
+  15,40,229,                              //movaps        %xmm5,%xmm4
+  15,40,238,                              //movaps        %xmm6,%xmm5
+  15,40,247,                              //movaps        %xmm7,%xmm6
+  15,40,60,36,                            //movaps        (%rsp),%xmm7
+  72,131,196,24,                          //add           $0x18,%rsp
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_1_float_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_565_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,51,4,120,                  //pmovzxwd      (%rax,%rdi,2),%xmm8
+  102,15,110,90,104,                      //movd          0x68(%rdx),%xmm3
+  102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
+  102,65,15,219,216,                      //pand          %xmm8,%xmm3
+  68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,68,15,16,82,116,                    //movss         0x74(%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  102,68,15,110,74,108,                   //movd          0x6c(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,201,                           //cvtdq2ps      %xmm9,%xmm9
+  243,68,15,16,90,120,                    //movss         0x78(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  102,68,15,110,74,112,                   //movd          0x70(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,193,                           //cvtdq2ps      %xmm9,%xmm8
+  243,68,15,16,74,124,                    //movss         0x7c(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,194,                           //mulps         %xmm10,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,203,                           //mulps         %xmm11,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_tables_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,8,                               //mov           (%rax),%rcx
+  76,139,64,8,                            //mov           0x8(%rax),%r8
+  243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,65,15,111,200,                      //movdqa        %xmm8,%xmm1
+  102,15,114,209,8,                       //psrld         $0x8,%xmm1
+  102,15,219,200,                         //pand          %xmm0,%xmm1
+  102,65,15,111,208,                      //movdqa        %xmm8,%xmm2
+  102,15,114,210,16,                      //psrld         $0x10,%xmm2
+  102,15,219,208,                         //pand          %xmm0,%xmm2
+  102,65,15,219,192,                      //pand          %xmm8,%xmm0
+  102,72,15,58,22,193,1,                  //pextrq        $0x1,%xmm0,%rcx
+  65,137,201,                             //mov           %ecx,%r9d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,73,15,126,194,                      //movq          %xmm0,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
+  102,67,15,58,33,4,144,16,               //insertps      $0x10,(%r8,%r10,4),%xmm0
+  102,67,15,58,33,4,136,32,               //insertps      $0x20,(%r8,%r9,4),%xmm0
+  102,65,15,58,33,4,136,48,               //insertps      $0x30,(%r8,%rcx,4),%xmm0
+  72,139,72,16,                           //mov           0x10(%rax),%rcx
+  102,73,15,58,22,200,1,                  //pextrq        $0x1,%xmm1,%r8
+  69,137,193,                             //mov           %r8d,%r9d
+  73,193,232,32,                          //shr           $0x20,%r8
+  102,73,15,126,202,                      //movq          %xmm1,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,66,15,16,12,153,                    //movss         (%rcx,%r11,4),%xmm1
+  102,66,15,58,33,12,145,16,              //insertps      $0x10,(%rcx,%r10,4),%xmm1
+  243,66,15,16,28,137,                    //movss         (%rcx,%r9,4),%xmm3
+  102,15,58,33,203,32,                    //insertps      $0x20,%xmm3,%xmm1
+  243,66,15,16,28,129,                    //movss         (%rcx,%r8,4),%xmm3
+  102,15,58,33,203,48,                    //insertps      $0x30,%xmm3,%xmm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  102,72,15,58,22,209,1,                  //pextrq        $0x1,%xmm2,%rcx
+  65,137,200,                             //mov           %ecx,%r8d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,73,15,126,209,                      //movq          %xmm2,%r9
+  69,137,202,                             //mov           %r9d,%r10d
+  73,193,233,32,                          //shr           $0x20,%r9
+  243,66,15,16,20,144,                    //movss         (%rax,%r10,4),%xmm2
+  102,66,15,58,33,20,136,16,              //insertps      $0x10,(%rax,%r9,4),%xmm2
+  243,66,15,16,28,128,                    //movss         (%rax,%r8,4),%xmm3
+  102,15,58,33,211,32,                    //insertps      $0x20,%xmm3,%xmm2
+  243,15,16,28,136,                       //movss         (%rax,%rcx,4),%xmm3
+  102,15,58,33,211,48,                    //insertps      $0x30,%xmm3,%xmm2
+  102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_a8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,15,56,49,4,56,                      //pmovzxbd      (%rax,%rdi,1),%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_a8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
+  102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
+  102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_565_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,51,12,120,                 //pmovzxwd      (%rax,%rdi,2),%xmm9
+  102,15,110,66,104,                      //movd          0x68(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,65,15,219,193,                      //pand          %xmm9,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,15,16,66,116,                       //movss         0x74(%rdx),%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  102,15,110,74,108,                      //movd          0x6c(%rdx),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  102,65,15,219,201,                      //pand          %xmm9,%xmm1
+  68,15,91,193,                           //cvtdq2ps      %xmm1,%xmm8
+  243,15,16,74,120,                       //movss         0x78(%rdx),%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  102,15,110,82,112,                      //movd          0x70(%rdx),%xmm2
+  102,15,112,210,0,                       //pshufd        $0x0,%xmm2,%xmm2
+  102,65,15,219,209,                      //pand          %xmm9,%xmm2
+  68,15,91,194,                           //cvtdq2ps      %xmm2,%xmm8
+  243,15,16,82,124,                       //movss         0x7c(%rdx),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_565_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,130,128,0,0,0,             //movss         0x80(%rdx),%xmm8
+  243,68,15,16,138,132,0,0,0,             //movss         0x84(%rdx),%xmm9
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,11,                   //pslld         $0xb,%xmm10
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,5,                    //pslld         $0x5,%xmm9
+  102,69,15,235,202,                      //por           %xmm10,%xmm9
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,69,15,86,193,                       //orpd          %xmm9,%xmm8
+  102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
+  102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_8888_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,15,114,209,8,                       //psrld         $0x8,%xmm1
+  102,15,219,200,                         //pand          %xmm0,%xmm1
+  102,15,111,211,                         //movdqa        %xmm3,%xmm2
+  102,15,114,210,16,                      //psrld         $0x10,%xmm2
+  102,15,219,208,                         //pand          %xmm0,%xmm2
+  102,15,219,195,                         //pand          %xmm3,%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,68,15,16,66,12,                     //movss         0xc(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  102,15,114,211,24,                      //psrld         $0x18,%xmm3
+  15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_8888_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,202,                           //mulps         %xmm2,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
+  102,69,15,235,193,                      //por           %xmm9,%xmm8
+  102,69,15,235,194,                      //por           %xmm10,%xmm8
+  243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_f16_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
+  243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
+  102,15,111,208,                         //movdqa        %xmm0,%xmm2
+  102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
+  102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
+  102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
+  102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
+  102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
+  102,15,110,66,100,                      //movd          0x64(%rdx),%xmm0
+  102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
+  102,65,15,223,200,                      //pandn         %xmm8,%xmm1
+  102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
+  102,15,223,218,                         //pandn         %xmm2,%xmm3
+  102,15,56,51,193,                       //pmovzxwd      %xmm1,%xmm0
+  102,15,114,240,13,                      //pslld         $0xd,%xmm0
+  102,15,110,82,92,                       //movd          0x5c(%rdx),%xmm2
+  102,68,15,112,194,0,                    //pshufd        $0x0,%xmm2,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  102,69,15,239,201,                      //pxor          %xmm9,%xmm9
+  102,65,15,105,201,                      //punpckhwd     %xmm9,%xmm1
+  102,15,114,241,13,                      //pslld         $0xd,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  102,15,56,51,211,                       //pmovzxwd      %xmm3,%xmm2
+  102,15,114,242,13,                      //pslld         $0xd,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  102,65,15,105,217,                      //punpckhwd     %xmm9,%xmm3
+  102,15,114,243,13,                      //pslld         $0xd,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f16_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,66,96,                    //movd          0x60(%rdx),%xmm8
+  102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
+  102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
+  102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
+  102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
+  102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
+  102,69,15,235,195,                      //por           %xmm11,%xmm8
+  102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
+  102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
+  243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
+  102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
+  243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f32_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,137,249,                             //mov           %rdi,%rcx
+  72,193,225,4,                           //shl           $0x4,%rcx
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  68,15,20,201,                           //unpcklps      %xmm1,%xmm9
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,40,218,                           //movaps        %xmm2,%xmm11
+  68,15,20,219,                           //unpcklps      %xmm3,%xmm11
+  68,15,21,193,                           //unpckhps      %xmm1,%xmm8
+  68,15,21,211,                           //unpckhps      %xmm3,%xmm10
+  69,15,40,225,                           //movaps        %xmm9,%xmm12
+  102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
+  102,69,15,21,203,                       //unpckhpd      %xmm11,%xmm9
+  69,15,40,216,                           //movaps        %xmm8,%xmm11
+  102,69,15,20,218,                       //unpcklpd      %xmm10,%xmm11
+  102,69,15,21,194,                       //unpckhpd      %xmm10,%xmm8
+  102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
+  102,68,15,17,76,8,16,                   //movupd        %xmm9,0x10(%rax,%rcx,1)
+  102,68,15,17,92,8,32,                   //movupd        %xmm11,0x20(%rax,%rcx,1)
+  102,68,15,17,68,8,48,                   //movupd        %xmm8,0x30(%rax,%rcx,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_x_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,192,                           //maxps         %xmm0,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
+  102,65,15,254,193,                      //paddd         %xmm9,%xmm0
+  68,15,93,192,                           //minps         %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,193,                           //maxps         %xmm1,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
+  102,65,15,254,201,                      //paddd         %xmm9,%xmm1
+  68,15,93,193,                           //minps         %xmm1,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,200,                           //movaps        %xmm8,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,92,193,                           //subps         %xmm9,%xmm0
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,193,                           //minps         %xmm9,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,92,201,                           //subps         %xmm9,%xmm1
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,201,                           //minps         %xmm9,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  65,15,92,193,                           //subps         %xmm9,%xmm0
+  243,69,15,88,192,                       //addss         %xmm8,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,208,                           //movaps        %xmm0,%xmm10
+  69,15,94,208,                           //divps         %xmm8,%xmm10
+  102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,194,                           //subps         %xmm10,%xmm0
+  65,15,92,193,                           //subps         %xmm9,%xmm0
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,92,192,                           //subps         %xmm0,%xmm8
+  65,15,84,192,                           //andps         %xmm8,%xmm0
+  102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
+  102,69,15,254,193,                      //paddd         %xmm9,%xmm8
+  65,15,93,192,                           //minps         %xmm8,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  65,15,92,201,                           //subps         %xmm9,%xmm1
+  243,69,15,88,192,                       //addss         %xmm8,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  69,15,94,208,                           //divps         %xmm8,%xmm10
+  102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,202,                           //subps         %xmm10,%xmm1
+  65,15,92,201,                           //subps         %xmm9,%xmm1
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,92,193,                           //subps         %xmm1,%xmm8
+  65,15,84,200,                           //andps         %xmm8,%xmm1
+  102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
+  102,69,15,254,193,                      //paddd         %xmm9,%xmm8
+  65,15,93,200,                           //minps         %xmm8,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_sse41[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_sse41[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  68,15,89,226,                           //mulps         %xmm2,%xmm12
+  69,15,88,229,                           //addps         %xmm13,%xmm12
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,210,                           //movaps        %xmm10,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_sse41[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,217,                           //mulps         %xmm1,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,83,202,                           //rcpps         %xmm10,%xmm1
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  68,15,16,8,                             //movups        (%rax),%xmm9
+  15,16,88,16,                            //movups        0x10(%rax),%xmm3
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  68,15,89,192,                           //mulps         %xmm0,%xmm8
+  68,15,88,193,                           //addps         %xmm1,%xmm8
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  65,15,40,209,                           //movaps        %xmm9,%xmm2
+  15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
+  15,89,200,                              //mulps         %xmm0,%xmm1
+  15,88,202,                              //addps         %xmm2,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  69,15,40,209,                           //movaps        %xmm9,%xmm10
+  69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
+  15,89,208,                              //mulps         %xmm0,%xmm2
+  65,15,88,210,                           //addps         %xmm10,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  65,15,88,217,                           //addps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_start_pipeline_sse2[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  83,                                     //push          %rbx
+  73,137,207,                             //mov           %rcx,%r15
+  73,137,214,                             //mov           %rdx,%r14
+  72,137,251,                             //mov           %rdi,%rbx
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,196,                             //mov           %rax,%r12
+  73,137,245,                             //mov           %rsi,%r13
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  76,57,248,                              //cmp           %r15,%rax
+  118,5,                                  //jbe           28 <_sk_start_pipeline_sse2+0x28>
+  72,137,216,                             //mov           %rbx,%rax
+  235,52,                                 //jmp           5c <_sk_start_pipeline_sse2+0x5c>
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,238,                             //mov           %r13,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,212,                             //callq         *%r12
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  72,131,195,8,                           //add           $0x8,%rbx
+  76,57,251,                              //cmp           %r15,%rbx
+  72,137,195,                             //mov           %rax,%rbx
+  118,204,                                //jbe           28 <_sk_start_pipeline_sse2+0x28>
+  91,                                     //pop           %rbx
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_sse2[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  102,15,110,199,                         //movd          %edi,%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,18,                           //movss         (%rdx),%xmm2
+  243,15,16,90,4,                         //movss         0x4(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,16,66,20,                            //movups        0x14(%rdx),%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  102,15,110,8,                           //movd          (%rax),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,16,24,                               //movups        (%rax),%xmm3
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__sse2[] = {
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,195,                           //subps         %xmm3,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,204,                           //mulps         %xmm4,%xmm9
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,205,                           //mulps         %xmm5,%xmm9
+  65,15,88,201,                           //addps         %xmm9,%xmm1
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,206,                           //mulps         %xmm6,%xmm9
+  65,15,88,209,                           //addps         %xmm9,%xmm2
+  68,15,89,199,                           //mulps         %xmm7,%xmm8
+  65,15,88,216,                           //addps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,199,                           //subps         %xmm7,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_sse2[] = {
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  65,15,95,192,                           //maxps         %xmm8,%xmm0
+  65,15,95,200,                           //maxps         %xmm8,%xmm1
+  65,15,95,208,                           //maxps         %xmm8,%xmm2
+  65,15,95,216,                           //maxps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,192,                           //minps         %xmm8,%xmm0
+  65,15,93,200,                           //minps         %xmm8,%xmm1
+  65,15,93,208,                           //minps         %xmm8,%xmm2
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  15,93,195,                              //minps         %xmm3,%xmm0
+  15,93,203,                              //minps         %xmm3,%xmm1
+  15,93,211,                              //minps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_sse2[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,194,                              //movaps        %xmm2,%xmm0
+  65,15,40,208,                           //movaps        %xmm8,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_sse2[] = {
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  68,15,40,216,                           //movaps        %xmm0,%xmm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  65,15,40,227,                           //movaps        %xmm11,%xmm4
+  65,15,40,234,                           //movaps        %xmm10,%xmm5
+  65,15,40,241,                           //movaps        %xmm9,%xmm6
+  65,15,40,248,                           //movaps        %xmm8,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,224,                              //movaps        %xmm0,%xmm4
+  15,40,233,                              //movaps        %xmm1,%xmm5
+  15,40,242,                              //movaps        %xmm2,%xmm6
+  15,40,251,                              //movaps        %xmm3,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_sse2[] = {
+  15,89,195,                              //mulps         %xmm3,%xmm0
+  15,89,203,                              //mulps         %xmm3,%xmm1
+  15,89,211,                              //mulps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_sse2[] = {
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,194,195,0,                        //cmpeqps       %xmm3,%xmm8
+  243,68,15,16,10,                        //movss         (%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,94,203,                           //divps         %xmm3,%xmm9
+  69,15,85,193,                           //andnps        %xmm9,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_sse2[] = {
+  243,68,15,16,66,64,                     //movss         0x40(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,232,                           //movaps        %xmm8,%xmm13
+  68,15,89,232,                           //mulps         %xmm0,%xmm13
+  68,15,40,224,                           //movaps        %xmm0,%xmm12
+  69,15,89,228,                           //mulps         %xmm12,%xmm12
+  243,68,15,16,74,60,                     //movss         0x3c(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,82,52,                     //movss         0x34(%rdx),%xmm10
+  243,68,15,16,90,56,                     //movss         0x38(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,40,241,                           //movaps        %xmm9,%xmm14
+  68,15,89,240,                           //mulps         %xmm0,%xmm14
+  69,15,88,243,                           //addps         %xmm11,%xmm14
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  69,15,89,244,                           //mulps         %xmm12,%xmm14
+  69,15,88,242,                           //addps         %xmm10,%xmm14
+  243,68,15,16,98,68,                     //movss         0x44(%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  65,15,194,196,1,                        //cmpltps       %xmm12,%xmm0
+  68,15,84,232,                           //andps         %xmm0,%xmm13
+  65,15,85,198,                           //andnps        %xmm14,%xmm0
+  65,15,86,197,                           //orps          %xmm13,%xmm0
+  69,15,40,232,                           //movaps        %xmm8,%xmm13
+  68,15,89,233,                           //mulps         %xmm1,%xmm13
+  68,15,40,241,                           //movaps        %xmm1,%xmm14
+  69,15,89,246,                           //mulps         %xmm14,%xmm14
+  69,15,40,249,                           //movaps        %xmm9,%xmm15
+  68,15,89,249,                           //mulps         %xmm1,%xmm15
+  69,15,88,251,                           //addps         %xmm11,%xmm15
+  69,15,89,254,                           //mulps         %xmm14,%xmm15
+  69,15,88,250,                           //addps         %xmm10,%xmm15
+  65,15,194,204,1,                        //cmpltps       %xmm12,%xmm1
+  68,15,84,233,                           //andps         %xmm1,%xmm13
+  65,15,85,207,                           //andnps        %xmm15,%xmm1
+  65,15,86,205,                           //orps          %xmm13,%xmm1
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  68,15,40,234,                           //movaps        %xmm2,%xmm13
+  69,15,89,237,                           //mulps         %xmm13,%xmm13
+  68,15,89,202,                           //mulps         %xmm2,%xmm9
+  69,15,88,203,                           //addps         %xmm11,%xmm9
+  69,15,89,205,                           //mulps         %xmm13,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  65,15,194,212,1,                        //cmpltps       %xmm12,%xmm2
+  68,15,84,194,                           //andps         %xmm2,%xmm8
+  65,15,85,209,                           //andnps        %xmm9,%xmm2
+  65,15,86,208,                           //orps          %xmm8,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_sse2[] = {
+  72,131,236,40,                          //sub           $0x28,%rsp
+  15,41,124,36,16,                        //movaps        %xmm7,0x10(%rsp)
+  15,41,52,36,                            //movaps        %xmm6,(%rsp)
+  15,40,245,                              //movaps        %xmm5,%xmm6
+  15,40,236,                              //movaps        %xmm4,%xmm5
+  15,40,227,                              //movaps        %xmm3,%xmm4
+  68,15,82,192,                           //rsqrtps       %xmm0,%xmm8
+  69,15,83,232,                           //rcpps         %xmm8,%xmm13
+  69,15,82,248,                           //rsqrtps       %xmm8,%xmm15
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,68,15,16,66,72,                     //movss         0x48(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,240,                           //movaps        %xmm8,%xmm14
+  68,15,89,240,                           //mulps         %xmm0,%xmm14
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  243,68,15,16,82,76,                     //movss         0x4c(%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,90,80,                     //movss         0x50(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,98,84,                     //movss         0x54(%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,89,235,                           //mulps         %xmm11,%xmm13
+  69,15,88,236,                           //addps         %xmm12,%xmm13
+  69,15,89,250,                           //mulps         %xmm10,%xmm15
+  69,15,88,253,                           //addps         %xmm13,%xmm15
+  68,15,40,203,                           //movaps        %xmm3,%xmm9
+  69,15,93,207,                           //minps         %xmm15,%xmm9
+  243,68,15,16,106,88,                    //movss         0x58(%rdx),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  65,15,194,197,1,                        //cmpltps       %xmm13,%xmm0
+  68,15,84,240,                           //andps         %xmm0,%xmm14
+  65,15,85,193,                           //andnps        %xmm9,%xmm0
+  65,15,86,198,                           //orps          %xmm14,%xmm0
+  68,15,82,201,                           //rsqrtps       %xmm1,%xmm9
+  69,15,83,241,                           //rcpps         %xmm9,%xmm14
+  69,15,82,201,                           //rsqrtps       %xmm9,%xmm9
+  69,15,89,243,                           //mulps         %xmm11,%xmm14
+  69,15,88,244,                           //addps         %xmm12,%xmm14
+  69,15,89,202,                           //mulps         %xmm10,%xmm9
+  69,15,88,206,                           //addps         %xmm14,%xmm9
+  68,15,40,243,                           //movaps        %xmm3,%xmm14
+  69,15,93,241,                           //minps         %xmm9,%xmm14
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  65,15,194,205,1,                        //cmpltps       %xmm13,%xmm1
+  68,15,84,201,                           //andps         %xmm1,%xmm9
+  65,15,85,206,                           //andnps        %xmm14,%xmm1
+  65,15,86,201,                           //orps          %xmm9,%xmm1
+  68,15,82,202,                           //rsqrtps       %xmm2,%xmm9
+  69,15,83,241,                           //rcpps         %xmm9,%xmm14
+  69,15,89,243,                           //mulps         %xmm11,%xmm14
+  69,15,88,244,                           //addps         %xmm12,%xmm14
+  65,15,82,249,                           //rsqrtps       %xmm9,%xmm7
+  65,15,89,250,                           //mulps         %xmm10,%xmm7
+  65,15,88,254,                           //addps         %xmm14,%xmm7
+  15,93,223,                              //minps         %xmm7,%xmm3
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  65,15,194,213,1,                        //cmpltps       %xmm13,%xmm2
+  68,15,84,194,                           //andps         %xmm2,%xmm8
+  15,85,211,                              //andnps        %xmm3,%xmm2
+  65,15,86,208,                           //orps          %xmm8,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,220,                              //movaps        %xmm4,%xmm3
+  15,40,229,                              //movaps        %xmm5,%xmm4
+  15,40,238,                              //movaps        %xmm6,%xmm5
+  15,40,52,36,                            //movaps        (%rsp),%xmm6
+  15,40,124,36,16,                        //movaps        0x10(%rsp),%xmm7
+  72,131,196,40,                          //add           $0x28,%rsp
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
+  102,69,15,239,201,                      //pxor          %xmm9,%xmm9
+  102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
+  102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_1_float_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
+  102,69,15,239,201,                      //pxor          %xmm9,%xmm9
+  102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
+  102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_565_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,126,4,120,                    //movq          (%rax,%rdi,2),%xmm8
+  102,15,239,219,                         //pxor          %xmm3,%xmm3
+  102,68,15,97,195,                       //punpcklwd     %xmm3,%xmm8
+  102,15,110,90,104,                      //movd          0x68(%rdx),%xmm3
+  102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
+  102,65,15,219,216,                      //pand          %xmm8,%xmm3
+  68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,68,15,16,82,116,                    //movss         0x74(%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  102,68,15,110,74,108,                   //movd          0x6c(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,201,                           //cvtdq2ps      %xmm9,%xmm9
+  243,68,15,16,90,120,                    //movss         0x78(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  102,68,15,110,74,112,                   //movd          0x70(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,193,                           //cvtdq2ps      %xmm9,%xmm8
+  243,68,15,16,74,124,                    //movss         0x7c(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,194,                           //mulps         %xmm10,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,203,                           //mulps         %xmm11,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_tables_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,8,                               //mov           (%rax),%rcx
+  76,139,64,8,                            //mov           0x8(%rax),%r8
+  243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
+  102,65,15,114,209,8,                    //psrld         $0x8,%xmm9
+  102,68,15,219,200,                      //pand          %xmm0,%xmm9
+  102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
+  102,65,15,114,210,16,                   //psrld         $0x10,%xmm10
+  102,68,15,219,208,                      //pand          %xmm0,%xmm10
+  102,65,15,219,192,                      //pand          %xmm8,%xmm0
+  102,15,112,216,78,                      //pshufd        $0x4e,%xmm0,%xmm3
+  102,72,15,126,217,                      //movq          %xmm3,%rcx
+  65,137,201,                             //mov           %ecx,%r9d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,73,15,126,194,                      //movq          %xmm0,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,67,15,16,28,144,                    //movss         (%r8,%r10,4),%xmm3
+  243,65,15,16,4,136,                     //movss         (%r8,%rcx,4),%xmm0
+  15,20,216,                              //unpcklps      %xmm0,%xmm3
+  243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
+  243,67,15,16,12,136,                    //movss         (%r8,%r9,4),%xmm1
+  15,20,193,                              //unpcklps      %xmm1,%xmm0
+  15,20,195,                              //unpcklps      %xmm3,%xmm0
+  72,139,72,16,                           //mov           0x10(%rax),%rcx
+  102,65,15,112,201,78,                   //pshufd        $0x4e,%xmm9,%xmm1
+  102,73,15,126,200,                      //movq          %xmm1,%r8
+  69,137,193,                             //mov           %r8d,%r9d
+  73,193,232,32,                          //shr           $0x20,%r8
+  102,77,15,126,202,                      //movq          %xmm9,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,66,15,16,28,145,                    //movss         (%rcx,%r10,4),%xmm3
+  243,66,15,16,12,129,                    //movss         (%rcx,%r8,4),%xmm1
+  15,20,217,                              //unpcklps      %xmm1,%xmm3
+  243,66,15,16,12,153,                    //movss         (%rcx,%r11,4),%xmm1
+  243,66,15,16,20,137,                    //movss         (%rcx,%r9,4),%xmm2
+  15,20,202,                              //unpcklps      %xmm2,%xmm1
+  15,20,203,                              //unpcklps      %xmm3,%xmm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  102,65,15,112,210,78,                   //pshufd        $0x4e,%xmm10,%xmm2
+  102,72,15,126,209,                      //movq          %xmm2,%rcx
+  65,137,200,                             //mov           %ecx,%r8d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,77,15,126,209,                      //movq          %xmm10,%r9
+  69,137,202,                             //mov           %r9d,%r10d
+  73,193,233,32,                          //shr           $0x20,%r9
+  243,70,15,16,12,136,                    //movss         (%rax,%r9,4),%xmm9
+  243,15,16,20,136,                       //movss         (%rax,%rcx,4),%xmm2
+  68,15,20,202,                           //unpcklps      %xmm2,%xmm9
+  243,66,15,16,20,144,                    //movss         (%rax,%r10,4),%xmm2
+  243,66,15,16,28,128,                    //movss         (%rax,%r8,4),%xmm3
+  15,20,211,                              //unpcklps      %xmm3,%xmm2
+  65,15,20,209,                           //unpcklps      %xmm9,%xmm2
+  102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_a8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,15,110,4,56,                        //movd          (%rax,%rdi,1),%xmm0
+  102,15,239,201,                         //pxor          %xmm1,%xmm1
+  102,15,96,193,                          //punpcklbw     %xmm1,%xmm0
+  102,15,97,193,                          //punpcklwd     %xmm1,%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  102,15,239,201,                         //pxor          %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_a8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
+  102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
+  102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
+  102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
+  102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_565_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,126,12,120,                   //movq          (%rax,%rdi,2),%xmm9
+  102,15,239,192,                         //pxor          %xmm0,%xmm0
+  102,68,15,97,200,                       //punpcklwd     %xmm0,%xmm9
+  102,15,110,66,104,                      //movd          0x68(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,65,15,219,193,                      //pand          %xmm9,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,15,16,66,116,                       //movss         0x74(%rdx),%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  102,15,110,74,108,                      //movd          0x6c(%rdx),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  102,65,15,219,201,                      //pand          %xmm9,%xmm1
+  68,15,91,193,                           //cvtdq2ps      %xmm1,%xmm8
+  243,15,16,74,120,                       //movss         0x78(%rdx),%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  102,15,110,82,112,                      //movd          0x70(%rdx),%xmm2
+  102,15,112,210,0,                       //pshufd        $0x0,%xmm2,%xmm2
+  102,65,15,219,209,                      //pand          %xmm9,%xmm2
+  68,15,91,194,                           //cvtdq2ps      %xmm2,%xmm8
+  243,15,16,82,124,                       //movss         0x7c(%rdx),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_565_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,130,128,0,0,0,             //movss         0x80(%rdx),%xmm8
+  243,68,15,16,138,132,0,0,0,             //movss         0x84(%rdx),%xmm9
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,11,                   //pslld         $0xb,%xmm10
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,5,                    //pslld         $0x5,%xmm9
+  102,69,15,235,202,                      //por           %xmm10,%xmm9
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,69,15,86,193,                       //orpd          %xmm9,%xmm8
+  102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
+  102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
+  102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
+  102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_8888_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,15,114,209,8,                       //psrld         $0x8,%xmm1
+  102,15,219,200,                         //pand          %xmm0,%xmm1
+  102,15,111,211,                         //movdqa        %xmm3,%xmm2
+  102,15,114,210,16,                      //psrld         $0x10,%xmm2
+  102,15,219,208,                         //pand          %xmm0,%xmm2
+  102,15,219,195,                         //pand          %xmm3,%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,68,15,16,66,12,                     //movss         0xc(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  102,15,114,211,24,                      //psrld         $0x18,%xmm3
+  15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_8888_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,202,                           //mulps         %xmm2,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
+  102,69,15,235,193,                      //por           %xmm9,%xmm8
+  102,69,15,235,194,                      //por           %xmm10,%xmm8
+  243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_f16_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
+  243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
+  102,15,111,208,                         //movdqa        %xmm0,%xmm2
+  102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
+  102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
+  102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
+  102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
+  102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
+  102,15,110,66,100,                      //movd          0x64(%rdx),%xmm0
+  102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
+  102,65,15,223,200,                      //pandn         %xmm8,%xmm1
+  102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
+  102,15,223,218,                         //pandn         %xmm2,%xmm3
+  102,69,15,239,192,                      //pxor          %xmm8,%xmm8
+  102,15,111,193,                         //movdqa        %xmm1,%xmm0
+  102,65,15,97,192,                       //punpcklwd     %xmm8,%xmm0
+  102,15,114,240,13,                      //pslld         $0xd,%xmm0
+  102,15,110,82,92,                       //movd          0x5c(%rdx),%xmm2
+  102,68,15,112,202,0,                    //pshufd        $0x0,%xmm2,%xmm9
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  102,65,15,105,200,                      //punpckhwd     %xmm8,%xmm1
+  102,15,114,241,13,                      //pslld         $0xd,%xmm1
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  102,15,111,211,                         //movdqa        %xmm3,%xmm2
+  102,65,15,97,208,                       //punpcklwd     %xmm8,%xmm2
+  102,15,114,242,13,                      //pslld         $0xd,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  102,65,15,105,216,                      //punpckhwd     %xmm8,%xmm3
+  102,15,114,243,13,                      //pslld         $0xd,%xmm3
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f16_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,66,96,                    //movd          0x60(%rdx),%xmm8
+  102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
+  102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
+  102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
+  102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
+  102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
+  102,69,15,235,195,                      //por           %xmm11,%xmm8
+  102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
+  102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
+  243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
+  102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
+  243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f32_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,137,249,                             //mov           %rdi,%rcx
+  72,193,225,4,                           //shl           $0x4,%rcx
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  68,15,20,201,                           //unpcklps      %xmm1,%xmm9
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,40,218,                           //movaps        %xmm2,%xmm11
+  68,15,20,219,                           //unpcklps      %xmm3,%xmm11
+  68,15,21,193,                           //unpckhps      %xmm1,%xmm8
+  68,15,21,211,                           //unpckhps      %xmm3,%xmm10
+  69,15,40,225,                           //movaps        %xmm9,%xmm12
+  102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
+  102,69,15,21,203,                       //unpckhpd      %xmm11,%xmm9
+  69,15,40,216,                           //movaps        %xmm8,%xmm11
+  102,69,15,20,218,                       //unpcklpd      %xmm10,%xmm11
+  102,69,15,21,194,                       //unpckhpd      %xmm10,%xmm8
+  102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
+  102,68,15,17,76,8,16,                   //movupd        %xmm9,0x10(%rax,%rcx,1)
+  102,68,15,17,92,8,32,                   //movupd        %xmm11,0x20(%rax,%rcx,1)
+  102,68,15,17,68,8,48,                   //movupd        %xmm8,0x30(%rax,%rcx,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_x_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,192,                           //maxps         %xmm0,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
+  102,65,15,254,193,                      //paddd         %xmm9,%xmm0
+  68,15,93,192,                           //minps         %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,193,                           //maxps         %xmm1,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
+  102,65,15,254,201,                      //paddd         %xmm9,%xmm1
+  68,15,93,193,                           //minps         %xmm1,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,200,                           //movaps        %xmm8,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
+  69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
+  69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
+  243,68,15,16,26,                        //movss         (%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,84,217,                           //andps         %xmm9,%xmm11
+  69,15,92,211,                           //subps         %xmm11,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,194,                           //subps         %xmm10,%xmm0
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,193,                           //minps         %xmm9,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
+  69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
+  69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
+  243,68,15,16,26,                        //movss         (%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,84,217,                           //andps         %xmm9,%xmm11
+  69,15,92,211,                           //subps         %xmm11,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,202,                           //subps         %xmm10,%xmm1
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,201,                           //minps         %xmm9,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,40,193,                           //movaps        %xmm9,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,92,192,                           //subps         %xmm8,%xmm0
+  243,69,15,88,201,                       //addss         %xmm9,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,40,208,                           //movaps        %xmm0,%xmm10
+  69,15,94,209,                           //divps         %xmm9,%xmm10
+  243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
+  69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
+  69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
+  243,68,15,16,34,                        //movss         (%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,84,226,                           //andps         %xmm10,%xmm12
+  69,15,87,210,                           //xorps         %xmm10,%xmm10
+  69,15,92,220,                           //subps         %xmm12,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  65,15,92,195,                           //subps         %xmm11,%xmm0
+  65,15,92,192,                           //subps         %xmm8,%xmm0
+  68,15,92,208,                           //subps         %xmm0,%xmm10
+  65,15,84,194,                           //andps         %xmm10,%xmm0
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,193,                           //minps         %xmm9,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,40,193,                           //movaps        %xmm9,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,92,200,                           //subps         %xmm8,%xmm1
+  243,69,15,88,201,                       //addss         %xmm9,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  69,15,94,209,                           //divps         %xmm9,%xmm10
+  243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
+  69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
+  69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
+  243,68,15,16,34,                        //movss         (%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,84,226,                           //andps         %xmm10,%xmm12
+  69,15,87,210,                           //xorps         %xmm10,%xmm10
+  69,15,92,220,                           //subps         %xmm12,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  65,15,92,203,                           //subps         %xmm11,%xmm1
+  65,15,92,200,                           //subps         %xmm8,%xmm1
+  68,15,92,209,                           //subps         %xmm1,%xmm10
+  65,15,84,202,                           //andps         %xmm10,%xmm1
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,201,                           //minps         %xmm9,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_sse2[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_sse2[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  68,15,89,226,                           //mulps         %xmm2,%xmm12
+  69,15,88,229,                           //addps         %xmm13,%xmm12
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,210,                           //movaps        %xmm10,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_sse2[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,217,                           //mulps         %xmm1,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,83,202,                           //rcpps         %xmm10,%xmm1
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  68,15,16,8,                             //movups        (%rax),%xmm9
+  15,16,88,16,                            //movups        0x10(%rax),%xmm3
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  68,15,89,192,                           //mulps         %xmm0,%xmm8
+  68,15,88,193,                           //addps         %xmm1,%xmm8
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  65,15,40,209,                           //movaps        %xmm9,%xmm2
+  15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
+  15,89,200,                              //mulps         %xmm0,%xmm1
+  15,88,202,                              //addps         %xmm2,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  69,15,40,209,                           //movaps        %xmm9,%xmm10
+  69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
+  15,89,208,                              //mulps         %xmm0,%xmm2
+  65,15,88,210,                           //addps         %xmm10,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  65,15,88,217,                           //addps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+#elif defined(_M_X64)
+
+CODE const uint8_t sk_start_pipeline_hsw[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  86,                                     //push          %rsi
+  87,                                     //push          %rdi
+  83,                                     //push          %rbx
+  72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
+  197,120,41,188,36,144,0,0,0,            //vmovaps       %xmm15,0x90(%rsp)
+  197,120,41,180,36,128,0,0,0,            //vmovaps       %xmm14,0x80(%rsp)
+  197,120,41,108,36,112,                  //vmovaps       %xmm13,0x70(%rsp)
+  197,120,41,100,36,96,                   //vmovaps       %xmm12,0x60(%rsp)
+  197,120,41,92,36,80,                    //vmovaps       %xmm11,0x50(%rsp)
+  197,120,41,84,36,64,                    //vmovaps       %xmm10,0x40(%rsp)
+  197,120,41,76,36,48,                    //vmovaps       %xmm9,0x30(%rsp)
+  197,120,41,68,36,32,                    //vmovaps       %xmm8,0x20(%rsp)
+  197,248,41,124,36,16,                   //vmovaps       %xmm7,0x10(%rsp)
+  197,248,41,52,36,                       //vmovaps       %xmm6,(%rsp)
+  77,137,205,                             //mov           %r9,%r13
+  77,137,198,                             //mov           %r8,%r14
+  72,137,203,                             //mov           %rcx,%rbx
+  72,137,214,                             //mov           %rdx,%rsi
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,199,                             //mov           %rax,%r15
+  73,137,244,                             //mov           %rsi,%r12
+  72,141,67,8,                            //lea           0x8(%rbx),%rax
+  76,57,232,                              //cmp           %r13,%rax
+  118,5,                                  //jbe           75 <_sk_start_pipeline_hsw+0x75>
+  72,137,223,                             //mov           %rbx,%rdi
+  235,65,                                 //jmp           b6 <_sk_start_pipeline_hsw+0xb6>
+  185,0,0,0,0,                            //mov           $0x0,%ecx
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  72,141,123,8,                           //lea           0x8(%rbx),%rdi
+  72,131,195,16,                          //add           $0x10,%rbx
+  76,57,235,                              //cmp           %r13,%rbx
+  72,137,251,                             //mov           %rdi,%rbx
+  118,191,                                //jbe           75 <_sk_start_pipeline_hsw+0x75>
+  76,137,233,                             //mov           %r13,%rcx
+  72,41,249,                              //sub           %rdi,%rcx
+  116,41,                                 //je            e7 <_sk_start_pipeline_hsw+0xe7>
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  76,137,232,                             //mov           %r13,%rax
+  197,248,40,52,36,                       //vmovaps       (%rsp),%xmm6
+  197,248,40,124,36,16,                   //vmovaps       0x10(%rsp),%xmm7
+  197,120,40,68,36,32,                    //vmovaps       0x20(%rsp),%xmm8
+  197,120,40,76,36,48,                    //vmovaps       0x30(%rsp),%xmm9
+  197,120,40,84,36,64,                    //vmovaps       0x40(%rsp),%xmm10
+  197,120,40,92,36,80,                    //vmovaps       0x50(%rsp),%xmm11
+  197,120,40,100,36,96,                   //vmovaps       0x60(%rsp),%xmm12
+  197,120,40,108,36,112,                  //vmovaps       0x70(%rsp),%xmm13
+  197,120,40,180,36,128,0,0,0,            //vmovaps       0x80(%rsp),%xmm14
+  197,120,40,188,36,144,0,0,0,            //vmovaps       0x90(%rsp),%xmm15
+  72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
+  91,                                     //pop           %rbx
+  95,                                     //pop           %rdi
+  94,                                     //pop           %rsi
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  197,248,119,                            //vzeroupper
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_hsw[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,249,110,199,                        //vmovd         %edi,%xmm0
+  196,226,125,24,192,                     //vbroadcastss  %xmm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,4,                    //vbroadcastss  0x4(%rdx),%ymm1
+  197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
+  197,252,88,66,20,                       //vaddps        0x14(%rdx),%ymm0,%ymm0
+  196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
+  196,226,125,24,18,                      //vbroadcastss  (%rdx),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__hsw[] = {
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
+  196,194,93,184,192,                     //vfmadd231ps   %ymm8,%ymm4,%ymm0
+  196,194,85,184,200,                     //vfmadd231ps   %ymm8,%ymm5,%ymm1
+  196,194,77,184,208,                     //vfmadd231ps   %ymm8,%ymm6,%ymm2
+  196,194,69,184,216,                     //vfmadd231ps   %ymm8,%ymm7,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
+  196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
+  196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_hsw[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
+  196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
+  196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
+  196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_hsw[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
+  197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
+  197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_hsw[] = {
+  197,124,40,192,                         //vmovaps       %ymm0,%ymm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,194,                         //vmovaps       %ymm2,%ymm0
+  197,124,41,194,                         //vmovaps       %ymm8,%ymm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_hsw[] = {
+  197,124,40,195,                         //vmovaps       %ymm3,%ymm8
+  197,124,40,202,                         //vmovaps       %ymm2,%ymm9
+  197,124,40,209,                         //vmovaps       %ymm1,%ymm10
+  197,124,40,216,                         //vmovaps       %ymm0,%ymm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  197,124,41,220,                         //vmovaps       %ymm11,%ymm4
+  197,124,41,213,                         //vmovaps       %ymm10,%ymm5
+  197,124,41,206,                         //vmovaps       %ymm9,%ymm6
+  197,124,41,199,                         //vmovaps       %ymm8,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,224,                         //vmovaps       %ymm0,%ymm4
+  197,252,40,233,                         //vmovaps       %ymm1,%ymm5
+  197,252,40,242,                         //vmovaps       %ymm2,%ymm6
+  197,252,40,251,                         //vmovaps       %ymm3,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_hsw[] = {
+  197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
+  197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
+  197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_hsw[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9
+  196,98,125,24,18,                       //vbroadcastss  (%rdx),%ymm10
+  197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10
+  196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_hsw[] = {
+  196,98,125,24,66,64,                    //vbroadcastss  0x40(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10
+  196,98,125,24,90,60,                    //vbroadcastss  0x3c(%rdx),%ymm11
+  196,98,125,24,98,56,                    //vbroadcastss  0x38(%rdx),%ymm12
+  196,65,124,40,235,                      //vmovaps       %ymm11,%ymm13
+  196,66,125,168,236,                     //vfmadd213ps   %ymm12,%ymm0,%ymm13
+  196,98,125,24,114,52,                   //vbroadcastss  0x34(%rdx),%ymm14
+  196,66,45,168,238,                      //vfmadd213ps   %ymm14,%ymm10,%ymm13
+  196,98,125,24,82,68,                    //vbroadcastss  0x44(%rdx),%ymm10
+  196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0
+  196,195,21,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm13,%ymm0
+  197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
+  197,116,89,233,                         //vmulps        %ymm1,%ymm1,%ymm13
+  196,65,124,40,251,                      //vmovaps       %ymm11,%ymm15
+  196,66,117,168,252,                     //vfmadd213ps   %ymm12,%ymm1,%ymm15
+  196,66,21,168,254,                      //vfmadd213ps   %ymm14,%ymm13,%ymm15
+  196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
+  196,195,5,74,201,16,                    //vblendvps     %ymm1,%ymm9,%ymm15,%ymm1
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
+  196,66,109,168,220,                     //vfmadd213ps   %ymm12,%ymm2,%ymm11
+  196,66,53,168,222,                      //vfmadd213ps   %ymm14,%ymm9,%ymm11
+  196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
+  196,195,37,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_hsw[] = {
+  197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8
+  196,65,124,83,200,                      //vrcpps        %ymm8,%ymm9
+  196,65,124,82,208,                      //vrsqrtps      %ymm8,%ymm10
+  196,98,125,24,66,72,                    //vbroadcastss  0x48(%rdx),%ymm8
+  197,60,89,216,                          //vmulps        %ymm0,%ymm8,%ymm11
+  196,98,125,24,34,                       //vbroadcastss  (%rdx),%ymm12
+  196,98,125,24,106,76,                   //vbroadcastss  0x4c(%rdx),%ymm13
+  196,98,125,24,114,80,                   //vbroadcastss  0x50(%rdx),%ymm14
+  196,98,125,24,122,84,                   //vbroadcastss  0x54(%rdx),%ymm15
+  196,66,13,168,207,                      //vfmadd213ps   %ymm15,%ymm14,%ymm9
+  196,66,21,184,202,                      //vfmadd231ps   %ymm10,%ymm13,%ymm9
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  196,98,125,24,82,88,                    //vbroadcastss  0x58(%rdx),%ymm10
+  196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0
+  196,195,53,74,195,0,                    //vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
+  197,124,82,201,                         //vrsqrtps      %ymm1,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,66,13,168,223,                      //vfmadd213ps   %ymm15,%ymm14,%ymm11
+  196,66,21,184,217,                      //vfmadd231ps   %ymm9,%ymm13,%ymm11
+  197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
+  196,65,28,93,219,                       //vminps        %ymm11,%ymm12,%ymm11
+  196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
+  196,195,37,74,201,16,                   //vblendvps     %ymm1,%ymm9,%ymm11,%ymm1
+  197,124,82,202,                         //vrsqrtps      %ymm2,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,66,13,168,223,                      //vfmadd213ps   %ymm15,%ymm14,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,66,21,184,217,                      //vfmadd231ps   %ymm9,%ymm13,%ymm11
+  196,65,28,93,203,                       //vminps        %ymm11,%ymm12,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
+  196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,48,                                 //jne           4b1 <_sk_scale_u8_hsw+0x40>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           4b9 <_sk_scale_u8_hsw+0x48>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  235,175,                                //jmp           485 <_sk_scale_u8_hsw+0x14>
+};
+
+CODE const uint8_t sk_lerp_1_float_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,68,                                 //jne           559 <_sk_lerp_u8_hsw+0x54>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,125,49,192,                      //vpmovzxbd     %xmm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,226,61,168,205,                     //vfmadd213ps   %ymm5,%ymm8,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,226,61,168,214,                     //vfmadd213ps   %ymm6,%ymm8,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,226,61,168,223,                     //vfmadd213ps   %ymm7,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           561 <_sk_lerp_u8_hsw+0x5c>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  235,155,                                //jmp           519 <_sk_lerp_u8_hsw+0x14>
+};
+
+CODE const uint8_t sk_lerp_565_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,123,                                //jne           603 <_sk_lerp_565_hsw+0x85>
+  196,193,122,111,28,122,                 //vmovdqu       (%r10,%rdi,2),%xmm3
+  196,226,125,51,219,                     //vpmovzxwd     %xmm3,%ymm3
+  196,98,125,88,66,104,                   //vpbroadcastd  0x68(%rdx),%ymm8
+  197,61,219,195,                         //vpand         %ymm3,%ymm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,116,                   //vbroadcastss  0x74(%rdx),%ymm9
+  196,65,52,89,192,                       //vmulps        %ymm8,%ymm9,%ymm8
+  196,98,125,88,74,108,                   //vpbroadcastd  0x6c(%rdx),%ymm9
+  197,53,219,203,                         //vpand         %ymm3,%ymm9,%ymm9
+  196,65,124,91,201,                      //vcvtdq2ps     %ymm9,%ymm9
+  196,98,125,24,82,120,                   //vbroadcastss  0x78(%rdx),%ymm10
+  196,65,44,89,201,                       //vmulps        %ymm9,%ymm10,%ymm9
+  196,98,125,88,82,112,                   //vpbroadcastd  0x70(%rdx),%ymm10
+  197,173,219,219,                        //vpand         %ymm3,%ymm10,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,82,124,                   //vbroadcastss  0x7c(%rdx),%ymm10
+  197,172,89,219,                         //vmulps        %ymm3,%ymm10,%ymm3
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,226,61,168,196,                     //vfmadd213ps   %ymm4,%ymm8,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,226,53,168,205,                     //vfmadd213ps   %ymm5,%ymm9,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,226,101,168,214,                    //vfmadd213ps   %ymm6,%ymm3,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  15,135,111,255,255,255,                 //ja            58e <_sk_lerp_565_hsw+0x10>
+  76,141,13,74,0,0,0,                     //lea           0x4a(%rip),%r9        # 670 <_sk_lerp_565_hsw+0xf2>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  196,193,97,196,92,122,12,6,             //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,10,5,             //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,8,4,              //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,6,3,              //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,4,2,              //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,92,122,2,1,              //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
+  196,193,97,196,28,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
+  233,31,255,255,255,                     //jmpq          58e <_sk_lerp_565_hsw+0x10>
+  144,                                    //nop
+  243,255,                                //repz          (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  235,255,                                //jmp           675 <_sk_lerp_565_hsw+0xf7>
+  255,                                    //(bad)
+  255,227,                                //jmpq          *%rbx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  219,255,                                //(bad)
+  255,                                    //(bad)
+  255,211,                                //callq         *%rbx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,203,                                //dec           %ebx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  191,                                    //.byte         0xbf
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_tables_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
+  76,3,8,                                 //add           (%rax),%r9
+  77,133,192,                             //test          %r8,%r8
+  117,106,                                //jne           70b <_sk_load_tables_hsw+0x7f>
+  196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3
+  196,226,125,88,82,16,                   //vpbroadcastd  0x10(%rdx),%ymm2
+  197,237,219,203,                        //vpand         %ymm3,%ymm2,%ymm1
+  196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
+  72,139,72,8,                            //mov           0x8(%rax),%rcx
+  76,139,72,16,                           //mov           0x10(%rax),%r9
+  196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
+  196,226,53,146,4,137,                   //vgatherdps    %ymm9,(%rcx,%ymm1,4),%ymm0
+  197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
+  197,109,219,201,                        //vpand         %ymm1,%ymm2,%ymm9
+  196,65,45,118,210,                      //vpcmpeqd      %ymm10,%ymm10,%ymm10
+  196,130,45,146,12,137,                  //vgatherdps    %ymm10,(%r9,%ymm9,4),%ymm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
+  196,65,109,219,201,                     //vpand         %ymm9,%ymm2,%ymm9
+  196,162,61,146,20,136,                  //vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
+  197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  185,8,0,0,0,                            //mov           $0x8,%ecx
+  68,41,193,                              //sub           %r8d,%ecx
+  192,225,3,                              //shl           $0x3,%cl
+  73,199,194,255,255,255,255,             //mov           $0xffffffffffffffff,%r10
+  73,211,234,                             //shr           %cl,%r10
+  196,193,249,110,194,                    //vmovq         %r10,%xmm0
+  196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0
+  196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3
+  233,114,255,255,255,                    //jmpq          6a6 <_sk_load_tables_hsw+0x1a>
+};
+
+CODE const uint8_t sk_load_a8_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,42,                                 //jne           76e <_sk_load_a8_hsw+0x3a>
+  197,251,16,0,                           //vmovsd        (%rax),%xmm0
+  196,226,125,49,192,                     //vpmovzxbd     %xmm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,12,                   //vbroadcastss  0xc(%rdx),%ymm1
+  197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           776 <_sk_load_a8_hsw+0x42>
+  196,193,249,110,193,                    //vmovq         %r9,%xmm0
+  235,181,                                //jmp           748 <_sk_load_a8_hsw+0x14>
+};
+
+CODE const uint8_t sk_store_a8_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           7c6 <_sk_store_a8_hsw+0x33>
+  196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            7c2 <_sk_store_a8_hsw+0x2f>
+  196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
+  76,141,21,66,0,0,0,                     //lea           0x42(%rip),%r10        # 824 <_sk_store_a8_hsw+0x91>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
+  196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
+  196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
+  196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
+  196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
+  196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
+  196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
+  235,158,                                //jmp           7c2 <_sk_store_a8_hsw+0x2f>
+  247,255,                                //idiv          %edi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  239,                                    //out           %eax,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,231,                                //jmpq          *%rdi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  223,255,                                //(bad)
+  255,                                    //(bad)
+  255,215,                                //callq         *%rdi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,207,                                //dec           %edi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,199,                                //inc           %edi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_565_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,92,                                 //jne           8a6 <_sk_load_565_hsw+0x66>
+  196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
+  196,226,125,51,208,                     //vpmovzxwd     %xmm0,%ymm2
+  196,226,125,88,66,104,                  //vpbroadcastd  0x68(%rdx),%ymm0
+  197,253,219,194,                        //vpand         %ymm2,%ymm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,116,                  //vbroadcastss  0x74(%rdx),%ymm1
+  197,244,89,192,                         //vmulps        %ymm0,%ymm1,%ymm0
+  196,226,125,88,74,108,                  //vpbroadcastd  0x6c(%rdx),%ymm1
+  197,245,219,202,                        //vpand         %ymm2,%ymm1,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  196,226,125,24,90,120,                  //vbroadcastss  0x78(%rdx),%ymm3
+  197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
+  196,226,125,88,90,112,                  //vpbroadcastd  0x70(%rdx),%ymm3
+  197,229,219,210,                        //vpand         %ymm2,%ymm3,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  196,226,125,24,90,124,                  //vbroadcastss  0x7c(%rdx),%ymm3
+  197,228,89,210,                         //vmulps        %ymm2,%ymm3,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,146,                                //ja            850 <_sk_load_565_hsw+0x10>
+  76,141,13,75,0,0,0,                     //lea           0x4b(%rip),%r9        # 910 <_sk_load_565_hsw+0xd0>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
+  233,66,255,255,255,                     //jmpq          850 <_sk_load_565_hsw+0x10>
+  102,144,                                //xchg          %ax,%ax
+  242,255,                                //repnz         (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  234,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,226,                                //jmpq          *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  218,255,                                //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,202,                                //dec           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  190,                                    //.byte         0xbe
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_store_565_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,130,128,0,0,0,            //vbroadcastss  0x80(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  196,193,53,114,241,11,                  //vpslld        $0xb,%ymm9,%ymm9
+  196,98,125,24,146,132,0,0,0,            //vbroadcastss  0x84(%rdx),%ymm10
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,45,114,242,5,                   //vpslld        $0x5,%ymm10,%ymm10
+  196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
+  196,67,125,57,193,1,                    //vextracti128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           98e <_sk_store_565_hsw+0x62>
+  196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            98a <_sk_store_565_hsw+0x5e>
+  76,141,21,71,0,0,0,                     //lea           0x47(%rip),%r10        # 9ec <_sk_store_565_hsw+0xc0>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
+  196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
+  196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
+  196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
+  196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
+  196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
+  197,121,126,192,                        //vmovd         %xmm8,%eax
+  102,65,137,4,121,                       //mov           %ax,(%r9,%rdi,2)
+  235,161,                                //jmp           98a <_sk_store_565_hsw+0x5e>
+  15,31,0,                                //nopl          (%rax)
+  242,255,                                //repnz         (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  234,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,226,                                //jmpq          *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  218,255,                                //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,202,                                //dec           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,194,                                //inc           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_8888_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
+  76,3,8,                                 //add           (%rax),%r9
+  77,133,192,                             //test          %r8,%r8
+  117,85,                                 //jne           a72 <_sk_load_8888_hsw+0x6a>
+  196,193,126,111,25,                     //vmovdqu       (%r9),%ymm3
+  196,226,125,88,82,16,                   //vpbroadcastd  0x10(%rdx),%ymm2
+  197,237,219,195,                        //vpand         %ymm3,%ymm2,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,245,114,211,8,                      //vpsrld        $0x8,%ymm3,%ymm1
+  197,237,219,201,                        //vpand         %ymm1,%ymm2,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,181,114,211,16,                     //vpsrld        $0x10,%ymm3,%ymm9
+  196,193,109,219,209,                    //vpand         %ymm9,%ymm2,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,229,114,211,24,                     //vpsrld        $0x18,%ymm3,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  185,8,0,0,0,                            //mov           $0x8,%ecx
+  68,41,193,                              //sub           %r8d,%ecx
+  192,225,3,                              //shl           $0x3,%cl
+  72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
+  72,211,232,                             //shr           %cl,%rax
+  196,225,249,110,192,                    //vmovq         %rax,%xmm0
+  196,226,125,33,192,                     //vpmovsxbd     %xmm0,%ymm0
+  196,194,125,140,25,                     //vpmaskmovd    (%r9),%ymm0,%ymm3
+  235,138,                                //jmp           a22 <_sk_load_8888_hsw+0x1a>
+};
+
+CODE const uint8_t sk_store_8888_hsw[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,141,12,189,0,0,0,0,                  //lea           0x0(,%rdi,4),%r9
+  76,3,8,                                 //add           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,45,114,242,8,                   //vpslld        $0x8,%ymm10,%ymm10
+  196,65,45,235,201,                      //vpor          %ymm9,%ymm10,%ymm9
+  197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,45,114,242,16,                  //vpslld        $0x10,%ymm10,%ymm10
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,193,61,114,240,24,                  //vpslld        $0x18,%ymm8,%ymm8
+  196,65,45,235,192,                      //vpor          %ymm8,%ymm10,%ymm8
+  196,65,53,235,192,                      //vpor          %ymm8,%ymm9,%ymm8
+  77,133,192,                             //test          %r8,%r8
+  117,12,                                 //jne           b04 <_sk_store_8888_hsw+0x6c>
+  196,65,126,127,1,                       //vmovdqu       %ymm8,(%r9)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  185,8,0,0,0,                            //mov           $0x8,%ecx
+  68,41,193,                              //sub           %r8d,%ecx
+  192,225,3,                              //shl           $0x3,%cl
+  72,199,192,255,255,255,255,             //mov           $0xffffffffffffffff,%rax
+  72,211,232,                             //shr           %cl,%rax
+  196,97,249,110,200,                     //vmovq         %rax,%xmm9
+  196,66,125,33,201,                      //vpmovsxbd     %xmm9,%ymm9
+  196,66,53,142,1,                        //vpmaskmovd    %ymm8,%ymm9,(%r9)
+  235,211,                                //jmp           afd <_sk_store_8888_hsw+0x65>
+};
+
+CODE const uint8_t sk_load_f16_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,133,201,                             //test          %rcx,%rcx
+  117,97,                                 //jne           b95 <_sk_load_f16_hsw+0x6b>
+  197,249,16,12,248,                      //vmovupd       (%rax,%rdi,8),%xmm1
+  197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
+  197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
+  197,121,16,68,248,48,                   //vmovupd       0x30(%rax,%rdi,8),%xmm8
+  197,241,97,194,                         //vpunpcklwd    %xmm2,%xmm1,%xmm0
+  197,241,105,202,                        //vpunpckhwd    %xmm2,%xmm1,%xmm1
+  196,193,97,97,208,                      //vpunpcklwd    %xmm8,%xmm3,%xmm2
+  196,193,97,105,216,                     //vpunpckhwd    %xmm8,%xmm3,%xmm3
+  197,121,97,193,                         //vpunpcklwd    %xmm1,%xmm0,%xmm8
+  197,121,105,201,                        //vpunpckhwd    %xmm1,%xmm0,%xmm9
+  197,233,97,203,                         //vpunpcklwd    %xmm3,%xmm2,%xmm1
+  197,233,105,219,                        //vpunpckhwd    %xmm3,%xmm2,%xmm3
+  197,185,108,193,                        //vpunpcklqdq   %xmm1,%xmm8,%xmm0
+  196,226,125,19,192,                     //vcvtph2ps     %xmm0,%ymm0
+  197,185,109,201,                        //vpunpckhqdq   %xmm1,%xmm8,%xmm1
+  196,226,125,19,201,                     //vcvtph2ps     %xmm1,%ymm1
+  197,177,108,211,                        //vpunpcklqdq   %xmm3,%xmm9,%xmm2
+  196,226,125,19,210,                     //vcvtph2ps     %xmm2,%ymm2
+  197,177,109,219,                        //vpunpckhqdq   %xmm3,%xmm9,%xmm3
+  196,226,125,19,219,                     //vcvtph2ps     %xmm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,251,16,12,248,                      //vmovsd        (%rax,%rdi,8),%xmm1
+  196,65,57,87,192,                       //vxorpd        %xmm8,%xmm8,%xmm8
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  117,6,                                  //jne           bab <_sk_load_f16_hsw+0x81>
+  197,250,126,201,                        //vmovq         %xmm1,%xmm1
+  235,30,                                 //jmp           bc9 <_sk_load_f16_hsw+0x9f>
+  197,241,22,76,248,8,                    //vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,18,                                 //jb            bc9 <_sk_load_f16_hsw+0x9f>
+  197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  117,19,                                 //jne           bd6 <_sk_load_f16_hsw+0xac>
+  197,250,126,210,                        //vmovq         %xmm2,%xmm2
+  235,46,                                 //jmp           bf7 <_sk_load_f16_hsw+0xcd>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
+  233,117,255,255,255,                    //jmpq          b4b <_sk_load_f16_hsw+0x21>
+  197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,21,                                 //jb            bf7 <_sk_load_f16_hsw+0xcd>
+  197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  117,18,                                 //jne           c00 <_sk_load_f16_hsw+0xd6>
+  197,250,126,219,                        //vmovq         %xmm3,%xmm3
+  233,84,255,255,255,                     //jmpq          b4b <_sk_load_f16_hsw+0x21>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  233,75,255,255,255,                     //jmpq          b4b <_sk_load_f16_hsw+0x21>
+  197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  15,130,59,255,255,255,                  //jb            b4b <_sk_load_f16_hsw+0x21>
+  197,123,16,68,248,48,                   //vmovsd        0x30(%rax,%rdi,8),%xmm8
+  233,48,255,255,255,                     //jmpq          b4b <_sk_load_f16_hsw+0x21>
+};
+
+CODE const uint8_t sk_store_f16_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  196,195,125,29,192,4,                   //vcvtps2ph     $0x4,%ymm0,%xmm8
+  196,195,125,29,201,4,                   //vcvtps2ph     $0x4,%ymm1,%xmm9
+  196,195,125,29,210,4,                   //vcvtps2ph     $0x4,%ymm2,%xmm10
+  196,195,125,29,219,4,                   //vcvtps2ph     $0x4,%ymm3,%xmm11
+  196,65,57,97,225,                       //vpunpcklwd    %xmm9,%xmm8,%xmm12
+  196,65,57,105,193,                      //vpunpckhwd    %xmm9,%xmm8,%xmm8
+  196,65,41,97,203,                       //vpunpcklwd    %xmm11,%xmm10,%xmm9
+  196,65,41,105,235,                      //vpunpckhwd    %xmm11,%xmm10,%xmm13
+  196,65,25,98,217,                       //vpunpckldq    %xmm9,%xmm12,%xmm11
+  196,65,25,106,209,                      //vpunpckhdq    %xmm9,%xmm12,%xmm10
+  196,65,57,98,205,                       //vpunpckldq    %xmm13,%xmm8,%xmm9
+  196,65,57,106,197,                      //vpunpckhdq    %xmm13,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,27,                                 //jne           c80 <_sk_store_f16_hsw+0x65>
+  197,120,17,28,248,                      //vmovups       %xmm11,(%rax,%rdi,8)
+  197,120,17,84,248,16,                   //vmovups       %xmm10,0x10(%rax,%rdi,8)
+  197,120,17,76,248,32,                   //vmovups       %xmm9,0x20(%rax,%rdi,8)
+  197,122,127,68,248,48,                  //vmovdqu       %xmm8,0x30(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,121,214,28,248,                     //vmovq         %xmm11,(%rax,%rdi,8)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,241,                                //je            c7c <_sk_store_f16_hsw+0x61>
+  197,121,23,92,248,8,                    //vmovhpd       %xmm11,0x8(%rax,%rdi,8)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,229,                                //jb            c7c <_sk_store_f16_hsw+0x61>
+  197,121,214,84,248,16,                  //vmovq         %xmm10,0x10(%rax,%rdi,8)
+  116,221,                                //je            c7c <_sk_store_f16_hsw+0x61>
+  197,121,23,84,248,24,                   //vmovhpd       %xmm10,0x18(%rax,%rdi,8)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,209,                                //jb            c7c <_sk_store_f16_hsw+0x61>
+  197,121,214,76,248,32,                  //vmovq         %xmm9,0x20(%rax,%rdi,8)
+  116,201,                                //je            c7c <_sk_store_f16_hsw+0x61>
+  197,121,23,76,248,40,                   //vmovhpd       %xmm9,0x28(%rax,%rdi,8)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,189,                                //jb            c7c <_sk_store_f16_hsw+0x61>
+  197,121,214,68,248,48,                  //vmovq         %xmm8,0x30(%rax,%rdi,8)
+  235,181,                                //jmp           c7c <_sk_store_f16_hsw+0x61>
+};
+
+CODE const uint8_t sk_store_f32_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,0,                               //mov           (%rax),%r8
+  72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
+  197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
+  197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
+  197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
+  197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
+  196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
+  196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
+  196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
+  196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
+  72,133,201,                             //test          %rcx,%rcx
+  117,55,                                 //jne           d34 <_sk_store_f32_hsw+0x6d>
+  196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
+  196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
+  196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
+  196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,240,                                //je            d30 <_sk_store_f32_hsw+0x69>
+  196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,227,                                //jb            d30 <_sk_store_f32_hsw+0x69>
+  196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
+  116,218,                                //je            d30 <_sk_store_f32_hsw+0x69>
+  196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,205,                                //jb            d30 <_sk_store_f32_hsw+0x69>
+  196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  116,195,                                //je            d30 <_sk_store_f32_hsw+0x69>
+  196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,181,                                //jb            d30 <_sk_store_f32_hsw+0x69>
+  196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  235,171,                                //jmp           d30 <_sk_store_f32_hsw+0x69>
+};
+
+CODE const uint8_t sk_clamp_x_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,188,95,192,                         //vmaxps        %ymm0,%ymm8,%ymm0
+  196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
+  196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
+  196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,188,95,201,                         //vmaxps        %ymm1,%ymm8,%ymm1
+  196,98,125,88,0,                        //vpbroadcastd  (%rax),%ymm8
+  196,65,53,118,201,                      //vpcmpeqd      %ymm9,%ymm9,%ymm9
+  196,65,61,254,193,                      //vpaddd        %ymm9,%ymm8,%ymm8
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,98,61,172,200,                      //vfnmadd213ps  %ymm0,%ymm8,%ymm9
+  197,253,118,192,                        //vpcmpeqd      %ymm0,%ymm0,%ymm0
+  197,189,254,192,                        //vpaddd        %ymm0,%ymm8,%ymm0
+  197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,98,61,172,201,                      //vfnmadd213ps  %ymm1,%ymm8,%ymm9
+  197,245,118,201,                        //vpcmpeqd      %ymm1,%ymm1,%ymm1
+  197,189,254,201,                        //vpaddd        %ymm1,%ymm8,%ymm1
+  197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
+  196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
+  196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
+  196,226,125,24,192,                     //vbroadcastss  %xmm0,%ymm0
+  197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  196,66,125,172,194,                     //vfnmadd213ps  %ymm10,%ymm0,%ymm8
+  196,193,60,92,193,                      //vsubps        %ymm9,%ymm8,%ymm0
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
+  197,188,84,192,                         //vandps        %ymm0,%ymm8,%ymm0
+  196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
+  196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,66,125,24,200,                      //vbroadcastss  %xmm8,%ymm9
+  196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
+  196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
+  196,226,125,24,201,                     //vbroadcastss  %xmm1,%ymm1
+  197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  196,66,117,172,194,                     //vfnmadd213ps  %ymm10,%ymm1,%ymm8
+  196,193,60,92,201,                      //vsubps        %ymm9,%ymm8,%ymm1
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
+  197,188,84,201,                         //vandps        %ymm1,%ymm8,%ymm1
+  196,65,61,118,192,                      //vpcmpeqd      %ymm8,%ymm8,%ymm8
+  196,65,53,254,192,                      //vpaddd        %ymm8,%ymm9,%ymm8
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  196,98,125,24,64,16,                    //vbroadcastss  0x10(%rax),%ymm8
+  196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
+  196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
+  196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
+  196,98,125,24,88,12,                    //vbroadcastss  0xc(%rax),%ymm11
+  196,98,125,24,72,20,                    //vbroadcastss  0x14(%rax),%ymm9
+  196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
+  196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  197,124,41,201,                         //vmovaps       %ymm9,%ymm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,8,                        //vbroadcastss  (%rax),%ymm9
+  196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
+  196,98,125,24,88,24,                    //vbroadcastss  0x18(%rax),%ymm11
+  196,98,125,24,64,36,                    //vbroadcastss  0x24(%rax),%ymm8
+  196,66,109,184,195,                     //vfmadd231ps   %ymm11,%ymm2,%ymm8
+  196,66,117,184,194,                     //vfmadd231ps   %ymm10,%ymm1,%ymm8
+  196,66,125,184,193,                     //vfmadd231ps   %ymm9,%ymm0,%ymm8
+  196,98,125,24,80,4,                     //vbroadcastss  0x4(%rax),%ymm10
+  196,98,125,24,88,16,                    //vbroadcastss  0x10(%rax),%ymm11
+  196,98,125,24,96,28,                    //vbroadcastss  0x1c(%rax),%ymm12
+  196,98,125,24,72,40,                    //vbroadcastss  0x28(%rax),%ymm9
+  196,66,109,184,204,                     //vfmadd231ps   %ymm12,%ymm2,%ymm9
+  196,66,117,184,203,                     //vfmadd231ps   %ymm11,%ymm1,%ymm9
+  196,66,125,184,202,                     //vfmadd231ps   %ymm10,%ymm0,%ymm9
+  196,98,125,24,88,8,                     //vbroadcastss  0x8(%rax),%ymm11
+  196,98,125,24,96,20,                    //vbroadcastss  0x14(%rax),%ymm12
+  196,98,125,24,104,32,                   //vbroadcastss  0x20(%rax),%ymm13
+  196,98,125,24,80,44,                    //vbroadcastss  0x2c(%rax),%ymm10
+  196,66,109,184,213,                     //vfmadd231ps   %ymm13,%ymm2,%ymm10
+  196,66,117,184,212,                     //vfmadd231ps   %ymm12,%ymm1,%ymm10
+  196,66,125,184,211,                     //vfmadd231ps   %ymm11,%ymm0,%ymm10
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  197,124,41,201,                         //vmovaps       %ymm9,%ymm1
+  197,124,41,210,                         //vmovaps       %ymm10,%ymm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  196,66,117,184,209,                     //vfmadd231ps   %ymm9,%ymm1,%ymm10
+  196,66,125,184,208,                     //vfmadd231ps   %ymm8,%ymm0,%ymm10
+  196,98,125,24,64,12,                    //vbroadcastss  0xc(%rax),%ymm8
+  196,98,125,24,72,16,                    //vbroadcastss  0x10(%rax),%ymm9
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  196,66,117,184,217,                     //vfmadd231ps   %ymm9,%ymm1,%ymm11
+  196,66,125,184,216,                     //vfmadd231ps   %ymm8,%ymm0,%ymm11
+  196,98,125,24,64,24,                    //vbroadcastss  0x18(%rax),%ymm8
+  196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
+  196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
+  196,66,117,184,225,                     //vfmadd231ps   %ymm9,%ymm1,%ymm12
+  196,66,125,184,224,                     //vfmadd231ps   %ymm8,%ymm0,%ymm12
+  196,193,124,83,204,                     //vrcpps        %ymm12,%ymm1
+  197,172,89,193,                         //vmulps        %ymm1,%ymm10,%ymm0
+  197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_hsw[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,184,193,                     //vfmadd231ps   %ymm1,%ymm0,%ymm8
+  196,226,125,24,80,20,                   //vbroadcastss  0x14(%rax),%ymm2
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,184,202,                    //vfmadd231ps   %ymm2,%ymm0,%ymm1
+  196,226,125,24,88,24,                   //vbroadcastss  0x18(%rax),%ymm3
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  196,226,125,184,211,                    //vfmadd231ps   %ymm3,%ymm0,%ymm2
+  196,98,125,24,72,28,                    //vbroadcastss  0x1c(%rax),%ymm9
+  196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
+  196,194,125,184,217,                    //vfmadd231ps   %ymm9,%ymm0,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_start_pipeline_avx[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  86,                                     //push          %rsi
+  87,                                     //push          %rdi
+  83,                                     //push          %rbx
+  72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
+  197,120,41,188,36,144,0,0,0,            //vmovaps       %xmm15,0x90(%rsp)
+  197,120,41,180,36,128,0,0,0,            //vmovaps       %xmm14,0x80(%rsp)
+  197,120,41,108,36,112,                  //vmovaps       %xmm13,0x70(%rsp)
+  197,120,41,100,36,96,                   //vmovaps       %xmm12,0x60(%rsp)
+  197,120,41,92,36,80,                    //vmovaps       %xmm11,0x50(%rsp)
+  197,120,41,84,36,64,                    //vmovaps       %xmm10,0x40(%rsp)
+  197,120,41,76,36,48,                    //vmovaps       %xmm9,0x30(%rsp)
+  197,120,41,68,36,32,                    //vmovaps       %xmm8,0x20(%rsp)
+  197,248,41,124,36,16,                   //vmovaps       %xmm7,0x10(%rsp)
+  197,248,41,52,36,                       //vmovaps       %xmm6,(%rsp)
+  77,137,205,                             //mov           %r9,%r13
+  77,137,198,                             //mov           %r8,%r14
+  72,137,203,                             //mov           %rcx,%rbx
+  72,137,214,                             //mov           %rdx,%rsi
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,199,                             //mov           %rax,%r15
+  73,137,244,                             //mov           %rsi,%r12
+  72,141,67,8,                            //lea           0x8(%rbx),%rax
+  76,57,232,                              //cmp           %r13,%rax
+  118,5,                                  //jbe           75 <_sk_start_pipeline_avx+0x75>
+  72,137,223,                             //mov           %rbx,%rdi
+  235,65,                                 //jmp           b6 <_sk_start_pipeline_avx+0xb6>
+  185,0,0,0,0,                            //mov           $0x0,%ecx
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  72,141,123,8,                           //lea           0x8(%rbx),%rdi
+  72,131,195,16,                          //add           $0x10,%rbx
+  76,57,235,                              //cmp           %r13,%rbx
+  72,137,251,                             //mov           %rdi,%rbx
+  118,191,                                //jbe           75 <_sk_start_pipeline_avx+0x75>
+  76,137,233,                             //mov           %r13,%rcx
+  72,41,249,                              //sub           %rdi,%rcx
+  116,41,                                 //je            e7 <_sk_start_pipeline_avx+0xe7>
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  76,137,230,                             //mov           %r12,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,215,                             //callq         *%r15
+  76,137,232,                             //mov           %r13,%rax
+  197,248,40,52,36,                       //vmovaps       (%rsp),%xmm6
+  197,248,40,124,36,16,                   //vmovaps       0x10(%rsp),%xmm7
+  197,120,40,68,36,32,                    //vmovaps       0x20(%rsp),%xmm8
+  197,120,40,76,36,48,                    //vmovaps       0x30(%rsp),%xmm9
+  197,120,40,84,36,64,                    //vmovaps       0x40(%rsp),%xmm10
+  197,120,40,92,36,80,                    //vmovaps       0x50(%rsp),%xmm11
+  197,120,40,100,36,96,                   //vmovaps       0x60(%rsp),%xmm12
+  197,120,40,108,36,112,                  //vmovaps       0x70(%rsp),%xmm13
+  197,120,40,180,36,128,0,0,0,            //vmovaps       0x80(%rsp),%xmm14
+  197,120,40,188,36,144,0,0,0,            //vmovaps       0x90(%rsp),%xmm15
+  72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
+  91,                                     //pop           %rbx
+  95,                                     //pop           %rdi
+  94,                                     //pop           %rsi
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  197,248,119,                            //vzeroupper
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_avx[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,249,110,199,                        //vmovd         %edi,%xmm0
+  197,249,112,192,0,                      //vpshufd       $0x0,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,4,                    //vbroadcastss  0x4(%rdx),%ymm1
+  197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
+  197,252,88,66,20,                       //vaddps        0x14(%rdx),%ymm0,%ymm0
+  196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
+  196,226,125,24,18,                      //vbroadcastss  (%rdx),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  197,220,87,228,                         //vxorps        %ymm4,%ymm4,%ymm4
+  197,212,87,237,                         //vxorps        %ymm5,%ymm5,%ymm5
+  197,204,87,246,                         //vxorps        %ymm6,%ymm6,%ymm6
+  197,196,87,255,                         //vxorps        %ymm7,%ymm7,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  196,226,125,24,88,12,                   //vbroadcastss  0xc(%rax),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  197,228,87,219,                         //vxorps        %ymm3,%ymm3,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__avx[] = {
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,195,                          //vsubps        %ymm3,%ymm8,%ymm8
+  197,60,89,204,                          //vmulps        %ymm4,%ymm8,%ymm9
+  197,180,88,192,                         //vaddps        %ymm0,%ymm9,%ymm0
+  197,60,89,205,                          //vmulps        %ymm5,%ymm8,%ymm9
+  197,180,88,201,                         //vaddps        %ymm1,%ymm9,%ymm1
+  197,60,89,206,                          //vmulps        %ymm6,%ymm8,%ymm9
+  197,180,88,210,                         //vaddps        %ymm2,%ymm9,%ymm2
+  197,60,89,199,                          //vmulps        %ymm7,%ymm8,%ymm8
+  197,188,88,219,                         //vaddps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  197,60,92,199,                          //vsubps        %ymm7,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_avx[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,193,124,95,192,                     //vmaxps        %ymm8,%ymm0,%ymm0
+  196,193,116,95,200,                     //vmaxps        %ymm8,%ymm1,%ymm1
+  196,193,108,95,208,                     //vmaxps        %ymm8,%ymm2,%ymm2
+  196,193,100,95,216,                     //vmaxps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,124,93,192,                     //vminps        %ymm8,%ymm0,%ymm0
+  196,193,116,93,200,                     //vminps        %ymm8,%ymm1,%ymm1
+  196,193,108,93,208,                     //vminps        %ymm8,%ymm2,%ymm2
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_avx[] = {
+  196,98,125,24,2,                        //vbroadcastss  (%rdx),%ymm8
+  196,193,100,93,216,                     //vminps        %ymm8,%ymm3,%ymm3
+  197,252,93,195,                         //vminps        %ymm3,%ymm0,%ymm0
+  197,244,93,203,                         //vminps        %ymm3,%ymm1,%ymm1
+  197,236,93,211,                         //vminps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,0,                       //vbroadcastss  (%rax),%ymm0
+  196,226,125,24,72,4,                    //vbroadcastss  0x4(%rax),%ymm1
+  196,226,125,24,80,8,                    //vbroadcastss  0x8(%rax),%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_avx[] = {
+  197,124,40,192,                         //vmovaps       %ymm0,%ymm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,194,                         //vmovaps       %ymm2,%ymm0
+  197,124,41,194,                         //vmovaps       %ymm8,%ymm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_avx[] = {
+  197,124,40,195,                         //vmovaps       %ymm3,%ymm8
+  197,124,40,202,                         //vmovaps       %ymm2,%ymm9
+  197,124,40,209,                         //vmovaps       %ymm1,%ymm10
+  197,124,40,216,                         //vmovaps       %ymm0,%ymm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  197,124,41,220,                         //vmovaps       %ymm11,%ymm4
+  197,124,41,213,                         //vmovaps       %ymm10,%ymm5
+  197,124,41,206,                         //vmovaps       %ymm9,%ymm6
+  197,124,41,199,                         //vmovaps       %ymm8,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,224,                         //vmovaps       %ymm0,%ymm4
+  197,252,40,233,                         //vmovaps       %ymm1,%ymm5
+  197,252,40,242,                         //vmovaps       %ymm2,%ymm6
+  197,252,40,251,                         //vmovaps       %ymm3,%ymm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,40,196,                         //vmovaps       %ymm4,%ymm0
+  197,252,40,205,                         //vmovaps       %ymm5,%ymm1
+  197,252,40,214,                         //vmovaps       %ymm6,%ymm2
+  197,252,40,223,                         //vmovaps       %ymm7,%ymm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_avx[] = {
+  197,252,89,195,                         //vmulps        %ymm3,%ymm0,%ymm0
+  197,244,89,203,                         //vmulps        %ymm3,%ymm1,%ymm1
+  197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_avx[] = {
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  196,65,100,194,200,0,                   //vcmpeqps      %ymm8,%ymm3,%ymm9
+  196,98,125,24,18,                       //vbroadcastss  (%rdx),%ymm10
+  197,44,94,211,                          //vdivps        %ymm3,%ymm10,%ymm10
+  196,67,45,74,192,144,                   //vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_avx[] = {
+  196,98,125,24,66,64,                    //vbroadcastss  0x40(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  197,124,89,208,                         //vmulps        %ymm0,%ymm0,%ymm10
+  196,98,125,24,90,60,                    //vbroadcastss  0x3c(%rdx),%ymm11
+  196,98,125,24,98,56,                    //vbroadcastss  0x38(%rdx),%ymm12
+  197,36,89,232,                          //vmulps        %ymm0,%ymm11,%ymm13
+  196,65,20,88,236,                       //vaddps        %ymm12,%ymm13,%ymm13
+  196,98,125,24,114,52,                   //vbroadcastss  0x34(%rdx),%ymm14
+  196,65,44,89,213,                       //vmulps        %ymm13,%ymm10,%ymm10
+  196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
+  196,98,125,24,106,68,                   //vbroadcastss  0x44(%rdx),%ymm13
+  196,193,124,194,197,1,                  //vcmpltps      %ymm13,%ymm0,%ymm0
+  196,195,45,74,193,0,                    //vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
+  197,60,89,201,                          //vmulps        %ymm1,%ymm8,%ymm9
+  197,116,89,209,                         //vmulps        %ymm1,%ymm1,%ymm10
+  197,36,89,249,                          //vmulps        %ymm1,%ymm11,%ymm15
+  196,65,4,88,252,                        //vaddps        %ymm12,%ymm15,%ymm15
+  196,65,44,89,215,                       //vmulps        %ymm15,%ymm10,%ymm10
+  196,65,12,88,210,                       //vaddps        %ymm10,%ymm14,%ymm10
+  196,193,116,194,205,1,                  //vcmpltps      %ymm13,%ymm1,%ymm1
+  196,195,45,74,201,16,                   //vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  197,108,89,202,                         //vmulps        %ymm2,%ymm2,%ymm9
+  197,36,89,210,                          //vmulps        %ymm2,%ymm11,%ymm10
+  196,65,44,88,212,                       //vaddps        %ymm12,%ymm10,%ymm10
+  196,65,52,89,202,                       //vmulps        %ymm10,%ymm9,%ymm9
+  196,65,12,88,201,                       //vaddps        %ymm9,%ymm14,%ymm9
+  196,193,108,194,213,1,                  //vcmpltps      %ymm13,%ymm2,%ymm2
+  196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_avx[] = {
+  197,124,82,192,                         //vrsqrtps      %ymm0,%ymm8
+  196,65,124,83,200,                      //vrcpps        %ymm8,%ymm9
+  196,65,124,82,208,                      //vrsqrtps      %ymm8,%ymm10
+  196,98,125,24,66,72,                    //vbroadcastss  0x48(%rdx),%ymm8
+  197,60,89,216,                          //vmulps        %ymm0,%ymm8,%ymm11
+  196,98,125,24,34,                       //vbroadcastss  (%rdx),%ymm12
+  196,98,125,24,106,76,                   //vbroadcastss  0x4c(%rdx),%ymm13
+  196,98,125,24,114,80,                   //vbroadcastss  0x50(%rdx),%ymm14
+  196,98,125,24,122,84,                   //vbroadcastss  0x54(%rdx),%ymm15
+  196,65,52,89,206,                       //vmulps        %ymm14,%ymm9,%ymm9
+  196,65,52,88,207,                       //vaddps        %ymm15,%ymm9,%ymm9
+  196,65,44,89,213,                       //vmulps        %ymm13,%ymm10,%ymm10
+  196,65,44,88,201,                       //vaddps        %ymm9,%ymm10,%ymm9
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  196,98,125,24,82,88,                    //vbroadcastss  0x58(%rdx),%ymm10
+  196,193,124,194,194,1,                  //vcmpltps      %ymm10,%ymm0,%ymm0
+  196,195,53,74,195,0,                    //vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
+  197,124,82,201,                         //vrsqrtps      %ymm1,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,65,12,89,219,                       //vmulps        %ymm11,%ymm14,%ymm11
+  196,65,4,88,219,                        //vaddps        %ymm11,%ymm15,%ymm11
+  196,65,20,89,201,                       //vmulps        %ymm9,%ymm13,%ymm9
+  196,65,52,88,203,                       //vaddps        %ymm11,%ymm9,%ymm9
+  197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  196,193,116,194,202,1,                  //vcmpltps      %ymm10,%ymm1,%ymm1
+  196,195,53,74,203,16,                   //vblendvps     %ymm1,%ymm11,%ymm9,%ymm1
+  197,124,82,202,                         //vrsqrtps      %ymm2,%ymm9
+  196,65,124,83,217,                      //vrcpps        %ymm9,%ymm11
+  196,65,12,89,219,                       //vmulps        %ymm11,%ymm14,%ymm11
+  196,65,4,88,219,                        //vaddps        %ymm11,%ymm15,%ymm11
+  196,65,124,82,201,                      //vrsqrtps      %ymm9,%ymm9
+  196,65,20,89,201,                       //vmulps        %ymm9,%ymm13,%ymm9
+  196,65,52,88,203,                       //vaddps        %ymm11,%ymm9,%ymm9
+  196,65,28,93,201,                       //vminps        %ymm9,%ymm12,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,193,108,194,210,1,                  //vcmpltps      %ymm10,%ymm2,%ymm2
+  196,195,53,74,208,32,                   //vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_avx[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,65,                                 //jne           50f <_sk_scale_u8_avx+0x51>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
+  196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
+  196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
+  196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  197,188,89,219,                         //vmulps        %ymm3,%ymm8,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           517 <_sk_scale_u8_avx+0x59>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  235,158,                                //jmp           4d2 <_sk_scale_u8_avx+0x14>
+};
+
+CODE const uint8_t sk_lerp_1_float_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_avx[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,101,                                //jne           5e8 <_sk_lerp_u8_avx+0x75>
+  197,123,16,0,                           //vmovsd        (%rax),%xmm8
+  196,66,121,49,200,                      //vpmovzxbd     %xmm8,%xmm9
+  196,67,121,4,192,229,                   //vpermilps     $0xe5,%xmm8,%xmm8
+  196,66,121,49,192,                      //vpmovzxbd     %xmm8,%xmm8
+  196,67,53,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,12,                    //vbroadcastss  0xc(%rdx),%ymm9
+  196,65,60,89,193,                       //vmulps        %ymm9,%ymm8,%ymm8
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,193,116,89,200,                     //vmulps        %ymm8,%ymm1,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  196,193,108,89,208,                     //vmulps        %ymm8,%ymm2,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  197,228,92,223,                         //vsubps        %ymm7,%ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  197,228,88,223,                         //vaddps        %ymm7,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           5f0 <_sk_lerp_u8_avx+0x7d>
+  196,65,249,110,193,                     //vmovq         %r9,%xmm8
+  233,119,255,255,255,                    //jmpq          587 <_sk_lerp_u8_avx+0x14>
+};
+
+CODE const uint8_t sk_lerp_565_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  15,133,148,0,0,0,                       //jne           6b2 <_sk_lerp_565_avx+0xa2>
+  196,65,122,111,4,122,                   //vmovdqu       (%r10,%rdi,2),%xmm8
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  197,185,105,219,                        //vpunpckhwd    %xmm3,%xmm8,%xmm3
+  196,66,121,51,192,                      //vpmovzxwd     %xmm8,%xmm8
+  196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
+  196,98,125,24,66,104,                   //vbroadcastss  0x68(%rdx),%ymm8
+  197,60,84,195,                          //vandps        %ymm3,%ymm8,%ymm8
+  196,65,124,91,192,                      //vcvtdq2ps     %ymm8,%ymm8
+  196,98,125,24,74,116,                   //vbroadcastss  0x74(%rdx),%ymm9
+  196,65,52,89,192,                       //vmulps        %ymm8,%ymm9,%ymm8
+  196,98,125,24,74,108,                   //vbroadcastss  0x6c(%rdx),%ymm9
+  197,52,84,203,                          //vandps        %ymm3,%ymm9,%ymm9
+  196,65,124,91,201,                      //vcvtdq2ps     %ymm9,%ymm9
+  196,98,125,24,82,120,                   //vbroadcastss  0x78(%rdx),%ymm10
+  196,65,44,89,201,                       //vmulps        %ymm9,%ymm10,%ymm9
+  196,98,125,24,82,112,                   //vbroadcastss  0x70(%rdx),%ymm10
+  197,172,84,219,                         //vandps        %ymm3,%ymm10,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,82,124,                   //vbroadcastss  0x7c(%rdx),%ymm10
+  197,172,89,219,                         //vmulps        %ymm3,%ymm10,%ymm3
+  197,252,92,196,                         //vsubps        %ymm4,%ymm0,%ymm0
+  196,193,124,89,192,                     //vmulps        %ymm8,%ymm0,%ymm0
+  197,252,88,196,                         //vaddps        %ymm4,%ymm0,%ymm0
+  197,244,92,205,                         //vsubps        %ymm5,%ymm1,%ymm1
+  196,193,116,89,201,                     //vmulps        %ymm9,%ymm1,%ymm1
+  197,244,88,205,                         //vaddps        %ymm5,%ymm1,%ymm1
+  197,236,92,214,                         //vsubps        %ymm6,%ymm2,%ymm2
+  197,236,89,211,                         //vmulps        %ymm3,%ymm2,%ymm2
+  197,236,88,214,                         //vaddps        %ymm6,%ymm2,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  196,65,57,239,192,                      //vpxor         %xmm8,%xmm8,%xmm8
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  15,135,85,255,255,255,                  //ja            624 <_sk_lerp_565_avx+0x14>
+  76,141,13,74,0,0,0,                     //lea           0x4a(%rip),%r9        # 720 <_sk_lerp_565_avx+0x110>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  196,65,97,196,68,122,12,6,              //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
+  196,65,57,196,68,122,10,5,              //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,8,4,               //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,6,3,               //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,4,2,               //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,68,122,2,1,               //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
+  196,65,57,196,4,122,0,                  //vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
+  233,5,255,255,255,                      //jmpq          624 <_sk_lerp_565_avx+0x14>
+  144,                                    //nop
+  243,255,                                //repz          (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  235,255,                                //jmp           725 <_sk_lerp_565_avx+0x115>
+  255,                                    //(bad)
+  255,227,                                //jmpq          *%rbx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  219,255,                                //(bad)
+  255,                                    //(bad)
+  255,211,                                //callq         *%rbx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,203,                                //dec           %ebx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  191,                                    //.byte         0xbf
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_tables_avx[] = {
+  85,                                     //push          %rbp
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  83,                                     //push          %rbx
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,0,                               //mov           (%rax),%r8
+  72,133,201,                             //test          %rcx,%rcx
+  15,133,18,2,0,0,                        //jne           966 <_sk_load_tables_avx+0x22a>
+  196,65,124,16,4,184,                    //vmovups       (%r8,%rdi,4),%ymm8
+  196,98,125,24,74,16,                    //vbroadcastss  0x10(%rdx),%ymm9
+  196,193,52,84,192,                      //vandps        %ymm8,%ymm9,%ymm0
+  196,193,249,126,193,                    //vmovq         %xmm0,%r9
+  69,137,203,                             //mov           %r9d,%r11d
+  196,195,249,22,194,1,                   //vpextrq       $0x1,%xmm0,%r10
+  69,137,214,                             //mov           %r10d,%r14d
+  73,193,234,32,                          //shr           $0x20,%r10
+  73,193,233,32,                          //shr           $0x20,%r9
+  196,227,125,25,192,1,                   //vextractf128  $0x1,%ymm0,%xmm0
+  196,193,249,126,196,                    //vmovq         %xmm0,%r12
+  69,137,231,                             //mov           %r12d,%r15d
+  196,227,249,22,195,1,                   //vpextrq       $0x1,%xmm0,%rbx
+  65,137,221,                             //mov           %ebx,%r13d
+  72,193,235,32,                          //shr           $0x20,%rbx
+  73,193,236,32,                          //shr           $0x20,%r12
+  72,139,104,8,                           //mov           0x8(%rax),%rbp
+  76,139,64,16,                           //mov           0x10(%rax),%r8
+  196,161,122,16,68,189,0,                //vmovss        0x0(%rbp,%r15,4),%xmm0
+  196,163,121,33,68,165,0,16,             //vinsertps     $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
+  196,163,121,33,68,173,0,32,             //vinsertps     $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0
+  197,250,16,76,157,0,                    //vmovss        0x0(%rbp,%rbx,4),%xmm1
+  196,227,121,33,193,48,                  //vinsertps     $0x30,%xmm1,%xmm0,%xmm0
+  196,161,122,16,76,157,0,                //vmovss        0x0(%rbp,%r11,4),%xmm1
+  196,163,113,33,76,141,0,16,             //vinsertps     $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
+  196,163,113,33,76,181,0,32,             //vinsertps     $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1
+  196,161,122,16,92,149,0,                //vmovss        0x0(%rbp,%r10,4),%xmm3
+  196,227,113,33,203,48,                  //vinsertps     $0x30,%xmm3,%xmm1,%xmm1
+  196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
+  196,193,113,114,208,8,                  //vpsrld        $0x8,%xmm8,%xmm1
+  196,67,125,25,194,1,                    //vextractf128  $0x1,%ymm8,%xmm10
+  196,193,105,114,210,8,                  //vpsrld        $0x8,%xmm10,%xmm2
+  196,227,117,24,202,1,                   //vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  197,180,84,201,                         //vandps        %ymm1,%ymm9,%ymm1
+  196,193,249,126,201,                    //vmovq         %xmm1,%r9
+  69,137,203,                             //mov           %r9d,%r11d
+  196,195,249,22,202,1,                   //vpextrq       $0x1,%xmm1,%r10
+  69,137,214,                             //mov           %r10d,%r14d
+  73,193,234,32,                          //shr           $0x20,%r10
+  73,193,233,32,                          //shr           $0x20,%r9
+  196,227,125,25,201,1,                   //vextractf128  $0x1,%ymm1,%xmm1
+  196,225,249,126,205,                    //vmovq         %xmm1,%rbp
+  65,137,239,                             //mov           %ebp,%r15d
+  196,227,249,22,203,1,                   //vpextrq       $0x1,%xmm1,%rbx
+  65,137,220,                             //mov           %ebx,%r12d
+  72,193,235,32,                          //shr           $0x20,%rbx
+  72,193,237,32,                          //shr           $0x20,%rbp
+  196,129,122,16,12,184,                  //vmovss        (%r8,%r15,4),%xmm1
+  196,195,113,33,12,168,16,               //vinsertps     $0x10,(%r8,%rbp,4),%xmm1,%xmm1
+  196,129,122,16,20,160,                  //vmovss        (%r8,%r12,4),%xmm2
+  196,227,113,33,202,32,                  //vinsertps     $0x20,%xmm2,%xmm1,%xmm1
+  196,193,122,16,20,152,                  //vmovss        (%r8,%rbx,4),%xmm2
+  196,227,113,33,202,48,                  //vinsertps     $0x30,%xmm2,%xmm1,%xmm1
+  196,129,122,16,20,152,                  //vmovss        (%r8,%r11,4),%xmm2
+  196,131,105,33,20,136,16,               //vinsertps     $0x10,(%r8,%r9,4),%xmm2,%xmm2
+  196,129,122,16,28,176,                  //vmovss        (%r8,%r14,4),%xmm3
+  196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  196,129,122,16,28,144,                  //vmovss        (%r8,%r10,4),%xmm3
+  196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  196,193,105,114,208,16,                 //vpsrld        $0x10,%xmm8,%xmm2
+  196,193,97,114,210,16,                  //vpsrld        $0x10,%xmm10,%xmm3
+  196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
+  197,180,84,210,                         //vandps        %ymm2,%ymm9,%ymm2
+  196,193,249,126,208,                    //vmovq         %xmm2,%r8
+  69,137,194,                             //mov           %r8d,%r10d
+  196,195,249,22,209,1,                   //vpextrq       $0x1,%xmm2,%r9
+  69,137,203,                             //mov           %r9d,%r11d
+  73,193,233,32,                          //shr           $0x20,%r9
+  73,193,232,32,                          //shr           $0x20,%r8
+  196,227,125,25,210,1,                   //vextractf128  $0x1,%ymm2,%xmm2
+  196,225,249,126,213,                    //vmovq         %xmm2,%rbp
+  65,137,238,                             //mov           %ebp,%r14d
+  196,227,249,22,211,1,                   //vpextrq       $0x1,%xmm2,%rbx
+  65,137,223,                             //mov           %ebx,%r15d
+  72,193,235,32,                          //shr           $0x20,%rbx
+  72,193,237,32,                          //shr           $0x20,%rbp
+  196,161,122,16,20,176,                  //vmovss        (%rax,%r14,4),%xmm2
+  196,227,105,33,20,168,16,               //vinsertps     $0x10,(%rax,%rbp,4),%xmm2,%xmm2
+  196,161,122,16,28,184,                  //vmovss        (%rax,%r15,4),%xmm3
+  196,227,105,33,211,32,                  //vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  197,250,16,28,152,                      //vmovss        (%rax,%rbx,4),%xmm3
+  196,99,105,33,203,48,                   //vinsertps     $0x30,%xmm3,%xmm2,%xmm9
+  196,161,122,16,28,144,                  //vmovss        (%rax,%r10,4),%xmm3
+  196,163,97,33,28,128,16,                //vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
+  196,161,122,16,20,152,                  //vmovss        (%rax,%r11,4),%xmm2
+  196,227,97,33,210,32,                   //vinsertps     $0x20,%xmm2,%xmm3,%xmm2
+  196,161,122,16,28,136,                  //vmovss        (%rax,%r9,4),%xmm3
+  196,227,105,33,211,48,                  //vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  196,195,109,24,209,1,                   //vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
+  196,193,57,114,208,24,                  //vpsrld        $0x18,%xmm8,%xmm8
+  196,193,97,114,210,24,                  //vpsrld        $0x18,%xmm10,%xmm3
+  196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  91,                                     //pop           %rbx
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  93,                                     //pop           %rbp
+  255,224,                                //jmpq          *%rax
+  65,137,201,                             //mov           %ecx,%r9d
+  65,128,225,7,                           //and           $0x7,%r9b
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  65,254,201,                             //dec           %r9b
+  69,15,182,201,                          //movzbl        %r9b,%r9d
+  65,128,249,6,                           //cmp           $0x6,%r9b
+  15,135,215,253,255,255,                 //ja            75a <_sk_load_tables_avx+0x1e>
+  76,141,21,138,0,0,0,                    //lea           0x8a(%rip),%r10        # a14 <_sk_load_tables_avx+0x2d8>
+  79,99,12,138,                           //movslq        (%r10,%r9,4),%r9
+  77,1,209,                               //add           %r10,%r9
+  65,255,225,                             //jmpq          *%r9
+  196,193,121,110,68,184,24,              //vmovd         0x18(%r8,%rdi,4),%xmm0
+  197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  196,99,117,12,192,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm8
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,195,121,34,68,184,20,1,             //vpinsrd       $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
+  196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,195,121,34,68,184,16,0,             //vpinsrd       $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
+  196,99,61,24,192,1,                     //vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
+  196,195,57,34,68,184,12,3,              //vpinsrd       $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  196,195,57,34,68,184,8,2,               //vpinsrd       $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  196,195,57,34,68,184,4,1,               //vpinsrd       $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  196,195,57,34,4,184,0,                  //vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
+  196,99,61,12,192,15,                    //vblendps      $0xf,%ymm0,%ymm8,%ymm8
+  233,70,253,255,255,                     //jmpq          75a <_sk_load_tables_avx+0x1e>
+  238,                                    //out           %al,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,224,                                //jmpq          *%rax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,196,                                //inc           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+  128,255,255,                            //cmp           $0xff,%bh
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_a8_avx[] = {
+  73,137,200,                             //mov           %rcx,%r8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,1,248,                               //add           %rdi,%rax
+  77,133,192,                             //test          %r8,%r8
+  117,59,                                 //jne           a7b <_sk_load_a8_avx+0x4b>
+  197,251,16,0,                           //vmovsd        (%rax),%xmm0
+  196,226,121,49,200,                     //vpmovzxbd     %xmm0,%xmm1
+  196,227,121,4,192,229,                  //vpermilps     $0xe5,%xmm0,%xmm0
+  196,226,121,49,192,                     //vpmovzxbd     %xmm0,%xmm0
+  196,227,117,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,12,                   //vbroadcastss  0xc(%rdx),%ymm1
+  197,252,89,217,                         //vmulps        %ymm1,%ymm0,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,252,87,192,                         //vxorps        %ymm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  197,236,87,210,                         //vxorps        %ymm2,%ymm2,%ymm2
+  76,137,193,                             //mov           %r8,%rcx
+  255,224,                                //jmpq          *%rax
+  49,201,                                 //xor           %ecx,%ecx
+  77,137,194,                             //mov           %r8,%r10
+  69,49,201,                              //xor           %r9d,%r9d
+  68,15,182,24,                           //movzbl        (%rax),%r11d
+  72,255,192,                             //inc           %rax
+  73,211,227,                             //shl           %cl,%r11
+  77,9,217,                               //or            %r11,%r9
+  72,131,193,8,                           //add           $0x8,%rcx
+  73,255,202,                             //dec           %r10
+  117,234,                                //jne           a83 <_sk_load_a8_avx+0x53>
+  196,193,249,110,193,                    //vmovq         %r9,%xmm0
+  235,164,                                //jmp           a44 <_sk_load_a8_avx+0x14>
+};
+
+CODE const uint8_t sk_store_a8_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  196,65,57,103,192,                      //vpackuswb     %xmm8,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           ad3 <_sk_store_a8_avx+0x33>
+  196,65,123,17,4,57,                     //vmovsd        %xmm8,(%r9,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            acf <_sk_store_a8_avx+0x2f>
+  196,66,121,48,192,                      //vpmovzxbw     %xmm8,%xmm8
+  76,141,21,69,0,0,0,                     //lea           0x45(%rip),%r10        # b34 <_sk_store_a8_avx+0x94>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,20,68,57,6,12,               //vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
+  196,67,121,20,68,57,5,10,               //vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
+  196,67,121,20,68,57,4,8,                //vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
+  196,67,121,20,68,57,3,6,                //vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
+  196,67,121,20,68,57,2,4,                //vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
+  196,67,121,20,68,57,1,2,                //vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
+  196,67,121,20,4,57,0,                   //vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
+  235,158,                                //jmp           acf <_sk_store_a8_avx+0x2f>
+  15,31,0,                                //nopl          (%rax)
+  244,                                    //hlt
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  236,                                    //in            (%dx),%al
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,228,                                //jmpq          *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  220,255,                                //fdivr         %st,%st(7)
+  255,                                    //(bad)
+  255,212,                                //callq         *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,204,                                //dec           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,196,                                //inc           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_565_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,106,                                //jne           bc4 <_sk_load_565_avx+0x74>
+  196,193,122,111,4,122,                  //vmovdqu       (%r10,%rdi,2),%xmm0
+  197,241,239,201,                        //vpxor         %xmm1,%xmm1,%xmm1
+  197,249,105,201,                        //vpunpckhwd    %xmm1,%xmm0,%xmm1
+  196,226,121,51,192,                     //vpmovzxwd     %xmm0,%xmm0
+  196,227,125,24,209,1,                   //vinsertf128   $0x1,%xmm1,%ymm0,%ymm2
+  196,226,125,24,66,104,                  //vbroadcastss  0x68(%rdx),%ymm0
+  197,252,84,194,                         //vandps        %ymm2,%ymm0,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,226,125,24,74,116,                  //vbroadcastss  0x74(%rdx),%ymm1
+  197,244,89,192,                         //vmulps        %ymm0,%ymm1,%ymm0
+  196,226,125,24,74,108,                  //vbroadcastss  0x6c(%rdx),%ymm1
+  197,244,84,202,                         //vandps        %ymm2,%ymm1,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  196,226,125,24,90,120,                  //vbroadcastss  0x78(%rdx),%ymm3
+  197,228,89,201,                         //vmulps        %ymm1,%ymm3,%ymm1
+  196,226,125,24,90,112,                  //vbroadcastss  0x70(%rdx),%ymm3
+  197,228,84,210,                         //vandps        %ymm2,%ymm3,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  196,226,125,24,90,124,                  //vbroadcastss  0x7c(%rdx),%ymm3
+  197,228,89,210,                         //vmulps        %ymm2,%ymm3,%ymm2
+  196,226,125,24,26,                      //vbroadcastss  (%rdx),%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,132,                                //ja            b60 <_sk_load_565_avx+0x10>
+  76,141,13,73,0,0,0,                     //lea           0x49(%rip),%r9        # c2c <_sk_load_565_avx+0xdc>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  197,249,239,192,                        //vpxor         %xmm0,%xmm0,%xmm0
+  196,193,121,196,68,122,12,6,            //vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,10,5,            //vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,8,4,             //vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,6,3,             //vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,4,2,             //vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,68,122,2,1,             //vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
+  196,193,121,196,4,122,0,                //vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
+  233,52,255,255,255,                     //jmpq          b60 <_sk_load_565_avx+0x10>
+  244,                                    //hlt
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  236,                                    //in            (%dx),%al
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,228,                                //jmpq          *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  220,255,                                //fdivr         %st,%st(7)
+  255,                                    //(bad)
+  255,212,                                //callq         *%rsp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,204,                                //dec           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,192,                                //inc           %eax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_store_565_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,130,128,0,0,0,            //vbroadcastss  0x80(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  196,193,41,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm10
+  196,67,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm9
+  196,193,49,114,241,11,                  //vpslld        $0xb,%xmm9,%xmm9
+  196,67,45,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm9
+  196,98,125,24,146,132,0,0,0,            //vbroadcastss  0x84(%rdx),%ymm10
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,33,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm11
+  196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
+  196,193,41,114,242,5,                   //vpslld        $0x5,%xmm10,%xmm10
+  196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
+  197,60,89,194,                          //vmulps        %ymm2,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,66,57,43,193,                       //vpackusdw     %xmm9,%xmm8,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           cce <_sk_store_565_avx+0x86>
+  196,65,122,127,4,121,                   //vmovdqu       %xmm8,(%r9,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            cca <_sk_store_565_avx+0x82>
+  76,141,21,71,0,0,0,                     //lea           0x47(%rip),%r10        # d2c <_sk_store_565_avx+0xe4>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,121,21,68,121,12,6,              //vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
+  196,67,121,21,68,121,10,5,              //vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
+  196,67,121,21,68,121,8,4,               //vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
+  196,67,121,21,68,121,6,3,               //vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
+  196,67,121,21,68,121,4,2,               //vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
+  196,67,121,21,68,121,2,1,               //vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
+  197,121,126,192,                        //vmovd         %xmm8,%eax
+  102,65,137,4,121,                       //mov           %ax,(%r9,%rdi,2)
+  235,161,                                //jmp           cca <_sk_store_565_avx+0x82>
+  15,31,0,                                //nopl          (%rax)
+  242,255,                                //repnz         (bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  234,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,226,                                //jmpq          *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  218,255,                                //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,202,                                //dec           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,194,                                //inc           %edx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_8888_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,16,                              //mov           (%rax),%r10
+  72,133,201,                             //test          %rcx,%rcx
+  117,125,                                //jne           dcf <_sk_load_8888_avx+0x87>
+  196,65,124,16,12,186,                   //vmovups       (%r10,%rdi,4),%ymm9
+  196,98,125,24,90,16,                    //vbroadcastss  0x10(%rdx),%ymm11
+  196,193,36,84,193,                      //vandps        %ymm9,%ymm11,%ymm0
+  197,252,91,192,                         //vcvtdq2ps     %ymm0,%ymm0
+  196,98,125,24,66,12,                    //vbroadcastss  0xc(%rdx),%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  196,193,41,114,209,8,                   //vpsrld        $0x8,%xmm9,%xmm10
+  196,99,125,25,203,1,                    //vextractf128  $0x1,%ymm9,%xmm3
+  197,241,114,211,8,                      //vpsrld        $0x8,%xmm3,%xmm1
+  196,227,45,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
+  197,164,84,201,                         //vandps        %ymm1,%ymm11,%ymm1
+  197,252,91,201,                         //vcvtdq2ps     %ymm1,%ymm1
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  196,193,41,114,209,16,                  //vpsrld        $0x10,%xmm9,%xmm10
+  197,233,114,211,16,                     //vpsrld        $0x10,%xmm3,%xmm2
+  196,227,45,24,210,1,                    //vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
+  197,164,84,210,                         //vandps        %ymm2,%ymm11,%ymm2
+  197,252,91,210,                         //vcvtdq2ps     %ymm2,%ymm2
+  197,188,89,210,                         //vmulps        %ymm2,%ymm8,%ymm2
+  196,193,49,114,209,24,                  //vpsrld        $0x18,%xmm9,%xmm9
+  197,225,114,211,24,                     //vpsrld        $0x18,%xmm3,%xmm3
+  196,227,53,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
+  197,252,91,219,                         //vcvtdq2ps     %ymm3,%ymm3
+  196,193,100,89,216,                     //vmulps        %ymm8,%ymm3,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  65,137,200,                             //mov           %ecx,%r8d
+  65,128,224,7,                           //and           $0x7,%r8b
+  196,65,52,87,201,                       //vxorps        %ymm9,%ymm9,%ymm9
+  65,254,200,                             //dec           %r8b
+  69,15,182,192,                          //movzbl        %r8b,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  15,135,108,255,255,255,                 //ja            d58 <_sk_load_8888_avx+0x10>
+  76,141,13,137,0,0,0,                    //lea           0x89(%rip),%r9        # e7c <_sk_load_8888_avx+0x134>
+  75,99,4,129,                            //movslq        (%r9,%r8,4),%rax
+  76,1,200,                               //add           %r9,%rax
+  255,224,                                //jmpq          *%rax
+  196,193,121,110,68,186,24,              //vmovd         0x18(%r10,%rdi,4),%xmm0
+  197,249,112,192,68,                     //vpshufd       $0x44,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,244,87,201,                         //vxorps        %ymm1,%ymm1,%ymm1
+  196,99,117,12,200,64,                   //vblendps      $0x40,%ymm0,%ymm1,%ymm9
+  196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
+  196,195,121,34,68,186,20,1,             //vpinsrd       $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
+  196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
+  196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
+  196,195,121,34,68,186,16,0,             //vpinsrd       $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
+  196,99,53,24,200,1,                     //vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
+  196,195,49,34,68,186,12,3,              //vpinsrd       $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  196,195,49,34,68,186,8,2,               //vpinsrd       $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  196,195,49,34,68,186,4,1,               //vpinsrd       $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  196,195,49,34,4,186,0,                  //vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
+  196,99,53,12,200,15,                    //vblendps      $0xf,%ymm0,%ymm9,%ymm9
+  233,220,254,255,255,                    //jmpq          d58 <_sk_load_8888_avx+0x10>
+  238,                                    //out           %al,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,224,                                //jmpq          *%rax
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,210,                                //callq         *%rdx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,196,                                //inc           %esp
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,176,255,255,255,156,                //pushq         -0x63000001(%rax)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+  128,255,255,                            //cmp           $0xff,%bh
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_store_8888_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,8,                               //mov           (%rax),%r9
+  196,98,125,24,66,8,                     //vbroadcastss  0x8(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,65,125,91,201,                      //vcvtps2dq     %ymm9,%ymm9
+  197,60,89,209,                          //vmulps        %ymm1,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,33,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm11
+  196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
+  196,193,41,114,242,8,                   //vpslld        $0x8,%xmm10,%xmm10
+  196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  196,65,45,86,201,                       //vorpd         %ymm9,%ymm10,%ymm9
+  197,60,89,210,                          //vmulps        %ymm2,%ymm8,%ymm10
+  196,65,125,91,210,                      //vcvtps2dq     %ymm10,%ymm10
+  196,193,33,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm11
+  196,67,125,25,210,1,                    //vextractf128  $0x1,%ymm10,%xmm10
+  196,193,41,114,242,16,                  //vpslld        $0x10,%xmm10,%xmm10
+  196,67,37,24,210,1,                     //vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,65,125,91,192,                      //vcvtps2dq     %ymm8,%ymm8
+  196,193,33,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm11
+  196,67,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm8
+  196,193,57,114,240,24,                  //vpslld        $0x18,%xmm8,%xmm8
+  196,67,37,24,192,1,                     //vinsertf128   $0x1,%xmm8,%ymm11,%ymm8
+  196,65,45,86,192,                       //vorpd         %ymm8,%ymm10,%ymm8
+  196,65,53,86,192,                       //vorpd         %ymm8,%ymm9,%ymm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,10,                                 //jne           f2d <_sk_store_8888_avx+0x95>
+  196,65,124,17,4,185,                    //vmovups       %ymm8,(%r9,%rdi,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  137,200,                                //mov           %ecx,%eax
+  36,7,                                   //and           $0x7,%al
+  254,200,                                //dec           %al
+  68,15,182,192,                          //movzbl        %al,%r8d
+  65,128,248,6,                           //cmp           $0x6,%r8b
+  119,236,                                //ja            f29 <_sk_store_8888_avx+0x91>
+  76,141,21,84,0,0,0,                     //lea           0x54(%rip),%r10        # f98 <_sk_store_8888_avx+0x100>
+  75,99,4,130,                            //movslq        (%r10,%r8,4),%rax
+  76,1,208,                               //add           %r10,%rax
+  255,224,                                //jmpq          *%rax
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,67,121,22,76,185,24,2,              //vpextrd       $0x2,%xmm9,0x18(%r9,%rdi,4)
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,67,121,22,76,185,20,1,              //vpextrd       $0x1,%xmm9,0x14(%r9,%rdi,4)
+  196,67,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm9
+  196,65,121,126,76,185,16,               //vmovd         %xmm9,0x10(%r9,%rdi,4)
+  196,67,121,22,68,185,12,3,              //vpextrd       $0x3,%xmm8,0xc(%r9,%rdi,4)
+  196,67,121,22,68,185,8,2,               //vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
+  196,67,121,22,68,185,4,1,               //vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
+  196,65,121,126,4,185,                   //vmovd         %xmm8,(%r9,%rdi,4)
+  235,147,                                //jmp           f29 <_sk_store_8888_avx+0x91>
+  102,144,                                //xchg          %ax,%ax
+  246,255,                                //idiv          %bh
+  255,                                    //(bad)
+  255,                                    //(bad)
+  238,                                    //out           %al,(%dx)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,230,                                //jmpq          *%rsi
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //(bad)
+  222,255,                                //fdivrp        %st,%st(7)
+  255,                                    //(bad)
+  255,209,                                //callq         *%rcx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,195,                                //inc           %ebx
+  255,                                    //(bad)
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+  181,255,                                //mov           $0xff,%ch
+  255,                                    //(bad)
+  255,                                    //.byte         0xff
+};
+
+CODE const uint8_t sk_load_f16_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,133,201,                             //test          %rcx,%rcx
+  15,133,240,0,0,0,                       //jne           10b2 <_sk_load_f16_avx+0xfe>
+  197,249,16,12,248,                      //vmovupd       (%rax,%rdi,8),%xmm1
+  197,249,16,84,248,16,                   //vmovupd       0x10(%rax,%rdi,8),%xmm2
+  197,249,16,92,248,32,                   //vmovupd       0x20(%rax,%rdi,8),%xmm3
+  197,121,16,68,248,48,                   //vmovupd       0x30(%rax,%rdi,8),%xmm8
+  197,241,97,194,                         //vpunpcklwd    %xmm2,%xmm1,%xmm0
+  197,241,105,202,                        //vpunpckhwd    %xmm2,%xmm1,%xmm1
+  196,193,97,97,208,                      //vpunpcklwd    %xmm8,%xmm3,%xmm2
+  196,193,97,105,216,                     //vpunpckhwd    %xmm8,%xmm3,%xmm3
+  197,121,97,193,                         //vpunpcklwd    %xmm1,%xmm0,%xmm8
+  197,249,105,193,                        //vpunpckhwd    %xmm1,%xmm0,%xmm0
+  197,233,97,203,                         //vpunpcklwd    %xmm3,%xmm2,%xmm1
+  197,105,105,203,                        //vpunpckhwd    %xmm3,%xmm2,%xmm9
+  197,249,110,90,100,                     //vmovd         0x64(%rdx),%xmm3
+  197,249,112,219,0,                      //vpshufd       $0x0,%xmm3,%xmm3
+  196,193,97,101,208,                     //vpcmpgtw      %xmm8,%xmm3,%xmm2
+  196,65,105,223,192,                     //vpandn        %xmm8,%xmm2,%xmm8
+  197,225,101,208,                        //vpcmpgtw      %xmm0,%xmm3,%xmm2
+  197,233,223,192,                        //vpandn        %xmm0,%xmm2,%xmm0
+  197,225,101,209,                        //vpcmpgtw      %xmm1,%xmm3,%xmm2
+  197,233,223,201,                        //vpandn        %xmm1,%xmm2,%xmm1
+  196,193,97,101,209,                     //vpcmpgtw      %xmm9,%xmm3,%xmm2
+  196,193,105,223,209,                    //vpandn        %xmm9,%xmm2,%xmm2
+  196,66,121,51,208,                      //vpmovzxwd     %xmm8,%xmm10
+  196,98,121,51,201,                      //vpmovzxwd     %xmm1,%xmm9
+  197,225,239,219,                        //vpxor         %xmm3,%xmm3,%xmm3
+  197,57,105,195,                         //vpunpckhwd    %xmm3,%xmm8,%xmm8
+  197,241,105,203,                        //vpunpckhwd    %xmm3,%xmm1,%xmm1
+  196,98,121,51,216,                      //vpmovzxwd     %xmm0,%xmm11
+  196,98,121,51,226,                      //vpmovzxwd     %xmm2,%xmm12
+  197,121,105,235,                        //vpunpckhwd    %xmm3,%xmm0,%xmm13
+  197,105,105,243,                        //vpunpckhwd    %xmm3,%xmm2,%xmm14
+  196,193,121,114,242,13,                 //vpslld        $0xd,%xmm10,%xmm0
+  196,193,105,114,241,13,                 //vpslld        $0xd,%xmm9,%xmm2
+  196,227,125,24,194,1,                   //vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
+  196,98,125,24,74,92,                    //vbroadcastss  0x5c(%rdx),%ymm9
+  197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
+  196,193,105,114,240,13,                 //vpslld        $0xd,%xmm8,%xmm2
+  197,241,114,241,13,                     //vpslld        $0xd,%xmm1,%xmm1
+  196,227,109,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
+  197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
+  196,193,105,114,243,13,                 //vpslld        $0xd,%xmm11,%xmm2
+  196,193,97,114,244,13,                  //vpslld        $0xd,%xmm12,%xmm3
+  196,227,109,24,211,1,                   //vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
+  197,180,89,210,                         //vmulps        %ymm2,%ymm9,%ymm2
+  196,193,57,114,245,13,                  //vpslld        $0xd,%xmm13,%xmm8
+  196,193,97,114,246,13,                  //vpslld        $0xd,%xmm14,%xmm3
+  196,227,61,24,219,1,                    //vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
+  197,180,89,219,                         //vmulps        %ymm3,%ymm9,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,251,16,12,248,                      //vmovsd        (%rax,%rdi,8),%xmm1
+  196,65,57,87,192,                       //vxorpd        %xmm8,%xmm8,%xmm8
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  117,6,                                  //jne           10c8 <_sk_load_f16_avx+0x114>
+  197,250,126,201,                        //vmovq         %xmm1,%xmm1
+  235,30,                                 //jmp           10e6 <_sk_load_f16_avx+0x132>
+  197,241,22,76,248,8,                    //vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,18,                                 //jb            10e6 <_sk_load_f16_avx+0x132>
+  197,251,16,84,248,16,                   //vmovsd        0x10(%rax,%rdi,8),%xmm2
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  117,19,                                 //jne           10f3 <_sk_load_f16_avx+0x13f>
+  197,250,126,210,                        //vmovq         %xmm2,%xmm2
+  235,46,                                 //jmp           1114 <_sk_load_f16_avx+0x160>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  197,233,87,210,                         //vxorpd        %xmm2,%xmm2,%xmm2
+  233,230,254,255,255,                    //jmpq          fd9 <_sk_load_f16_avx+0x25>
+  197,233,22,84,248,24,                   //vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,21,                                 //jb            1114 <_sk_load_f16_avx+0x160>
+  197,251,16,92,248,32,                   //vmovsd        0x20(%rax,%rdi,8),%xmm3
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  117,18,                                 //jne           111d <_sk_load_f16_avx+0x169>
+  197,250,126,219,                        //vmovq         %xmm3,%xmm3
+  233,197,254,255,255,                    //jmpq          fd9 <_sk_load_f16_avx+0x25>
+  197,225,87,219,                         //vxorpd        %xmm3,%xmm3,%xmm3
+  233,188,254,255,255,                    //jmpq          fd9 <_sk_load_f16_avx+0x25>
+  197,225,22,92,248,40,                   //vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  15,130,172,254,255,255,                 //jb            fd9 <_sk_load_f16_avx+0x25>
+  197,123,16,68,248,48,                   //vmovsd        0x30(%rax,%rdi,8),%xmm8
+  233,161,254,255,255,                    //jmpq          fd9 <_sk_load_f16_avx+0x25>
+};
+
+CODE const uint8_t sk_store_f16_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  196,98,125,24,66,96,                    //vbroadcastss  0x60(%rdx),%ymm8
+  197,60,89,200,                          //vmulps        %ymm0,%ymm8,%ymm9
+  196,67,125,25,202,1,                    //vextractf128  $0x1,%ymm9,%xmm10
+  196,193,41,114,210,13,                  //vpsrld        $0xd,%xmm10,%xmm10
+  196,193,49,114,209,13,                  //vpsrld        $0xd,%xmm9,%xmm9
+  197,60,89,217,                          //vmulps        %ymm1,%ymm8,%ymm11
+  196,67,125,25,220,1,                    //vextractf128  $0x1,%ymm11,%xmm12
+  196,193,25,114,212,13,                  //vpsrld        $0xd,%xmm12,%xmm12
+  196,193,33,114,211,13,                  //vpsrld        $0xd,%xmm11,%xmm11
+  197,60,89,234,                          //vmulps        %ymm2,%ymm8,%ymm13
+  196,67,125,25,238,1,                    //vextractf128  $0x1,%ymm13,%xmm14
+  196,193,9,114,214,13,                   //vpsrld        $0xd,%xmm14,%xmm14
+  196,193,17,114,213,13,                  //vpsrld        $0xd,%xmm13,%xmm13
+  197,60,89,195,                          //vmulps        %ymm3,%ymm8,%ymm8
+  196,67,125,25,199,1,                    //vextractf128  $0x1,%ymm8,%xmm15
+  196,193,1,114,215,13,                   //vpsrld        $0xd,%xmm15,%xmm15
+  196,193,57,114,208,13,                  //vpsrld        $0xd,%xmm8,%xmm8
+  196,193,33,115,251,2,                   //vpslldq       $0x2,%xmm11,%xmm11
+  196,65,33,235,201,                      //vpor          %xmm9,%xmm11,%xmm9
+  196,193,33,115,252,2,                   //vpslldq       $0x2,%xmm12,%xmm11
+  196,65,33,235,226,                      //vpor          %xmm10,%xmm11,%xmm12
+  196,193,57,115,248,2,                   //vpslldq       $0x2,%xmm8,%xmm8
+  196,65,57,235,197,                      //vpor          %xmm13,%xmm8,%xmm8
+  196,193,41,115,255,2,                   //vpslldq       $0x2,%xmm15,%xmm10
+  196,65,41,235,238,                      //vpor          %xmm14,%xmm10,%xmm13
+  196,65,49,98,216,                       //vpunpckldq    %xmm8,%xmm9,%xmm11
+  196,65,49,106,208,                      //vpunpckhdq    %xmm8,%xmm9,%xmm10
+  196,65,25,98,205,                       //vpunpckldq    %xmm13,%xmm12,%xmm9
+  196,65,25,106,197,                      //vpunpckhdq    %xmm13,%xmm12,%xmm8
+  72,133,201,                             //test          %rcx,%rcx
+  117,27,                                 //jne           11fb <_sk_store_f16_avx+0xc3>
+  197,120,17,28,248,                      //vmovups       %xmm11,(%rax,%rdi,8)
+  197,120,17,84,248,16,                   //vmovups       %xmm10,0x10(%rax,%rdi,8)
+  197,120,17,76,248,32,                   //vmovups       %xmm9,0x20(%rax,%rdi,8)
+  197,122,127,68,248,48,                  //vmovdqu       %xmm8,0x30(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  197,121,214,28,248,                     //vmovq         %xmm11,(%rax,%rdi,8)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,241,                                //je            11f7 <_sk_store_f16_avx+0xbf>
+  197,121,23,92,248,8,                    //vmovhpd       %xmm11,0x8(%rax,%rdi,8)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,229,                                //jb            11f7 <_sk_store_f16_avx+0xbf>
+  197,121,214,84,248,16,                  //vmovq         %xmm10,0x10(%rax,%rdi,8)
+  116,221,                                //je            11f7 <_sk_store_f16_avx+0xbf>
+  197,121,23,84,248,24,                   //vmovhpd       %xmm10,0x18(%rax,%rdi,8)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,209,                                //jb            11f7 <_sk_store_f16_avx+0xbf>
+  197,121,214,76,248,32,                  //vmovq         %xmm9,0x20(%rax,%rdi,8)
+  116,201,                                //je            11f7 <_sk_store_f16_avx+0xbf>
+  197,121,23,76,248,40,                   //vmovhpd       %xmm9,0x28(%rax,%rdi,8)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,189,                                //jb            11f7 <_sk_store_f16_avx+0xbf>
+  197,121,214,68,248,48,                  //vmovq         %xmm8,0x30(%rax,%rdi,8)
+  235,181,                                //jmp           11f7 <_sk_store_f16_avx+0xbf>
+};
+
+CODE const uint8_t sk_store_f32_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  76,139,0,                               //mov           (%rax),%r8
+  72,141,4,189,0,0,0,0,                   //lea           0x0(,%rdi,4),%rax
+  197,124,20,193,                         //vunpcklps     %ymm1,%ymm0,%ymm8
+  197,124,21,217,                         //vunpckhps     %ymm1,%ymm0,%ymm11
+  197,108,20,203,                         //vunpcklps     %ymm3,%ymm2,%ymm9
+  197,108,21,227,                         //vunpckhps     %ymm3,%ymm2,%ymm12
+  196,65,61,20,209,                       //vunpcklpd     %ymm9,%ymm8,%ymm10
+  196,65,61,21,201,                       //vunpckhpd     %ymm9,%ymm8,%ymm9
+  196,65,37,20,196,                       //vunpcklpd     %ymm12,%ymm11,%ymm8
+  196,65,37,21,220,                       //vunpckhpd     %ymm12,%ymm11,%ymm11
+  72,133,201,                             //test          %rcx,%rcx
+  117,55,                                 //jne           12af <_sk_store_f32_avx+0x6d>
+  196,67,45,24,225,1,                     //vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
+  196,67,61,24,235,1,                     //vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
+  196,67,45,6,201,49,                     //vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
+  196,67,61,6,195,49,                     //vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
+  196,65,125,17,36,128,                   //vmovupd       %ymm12,(%r8,%rax,4)
+  196,65,125,17,108,128,32,               //vmovupd       %ymm13,0x20(%r8,%rax,4)
+  196,65,125,17,76,128,64,                //vmovupd       %ymm9,0x40(%r8,%rax,4)
+  196,65,125,17,68,128,96,                //vmovupd       %ymm8,0x60(%r8,%rax,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+  196,65,121,17,20,128,                   //vmovupd       %xmm10,(%r8,%rax,4)
+  72,131,249,1,                           //cmp           $0x1,%rcx
+  116,240,                                //je            12ab <_sk_store_f32_avx+0x69>
+  196,65,121,17,76,128,16,                //vmovupd       %xmm9,0x10(%r8,%rax,4)
+  72,131,249,3,                           //cmp           $0x3,%rcx
+  114,227,                                //jb            12ab <_sk_store_f32_avx+0x69>
+  196,65,121,17,68,128,32,                //vmovupd       %xmm8,0x20(%r8,%rax,4)
+  116,218,                                //je            12ab <_sk_store_f32_avx+0x69>
+  196,65,121,17,92,128,48,                //vmovupd       %xmm11,0x30(%r8,%rax,4)
+  72,131,249,5,                           //cmp           $0x5,%rcx
+  114,205,                                //jb            12ab <_sk_store_f32_avx+0x69>
+  196,67,125,25,84,128,64,1,              //vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
+  116,195,                                //je            12ab <_sk_store_f32_avx+0x69>
+  196,67,125,25,76,128,80,1,              //vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
+  72,131,249,7,                           //cmp           $0x7,%rcx
+  114,181,                                //jb            12ab <_sk_store_f32_avx+0x69>
+  196,67,125,25,68,128,96,1,              //vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
+  235,171,                                //jmp           12ab <_sk_store_f32_avx+0x69>
+};
+
+CODE const uint8_t sk_clamp_x_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,95,200,                          //vmaxps        %ymm0,%ymm8,%ymm9
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
+  197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,95,201,                          //vmaxps        %ymm1,%ymm8,%ymm9
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
+  197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,124,94,200,                      //vdivps        %ymm8,%ymm0,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
+  196,65,124,92,201,                      //vsubps        %ymm9,%ymm0,%ymm9
+  196,99,125,25,192,1,                    //vextractf128  $0x1,%ymm8,%xmm0
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
+  197,180,93,192,                         //vminps        %ymm0,%ymm9,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,65,116,94,200,                      //vdivps        %ymm8,%ymm1,%ymm9
+  196,67,125,8,201,1,                     //vroundps      $0x1,%ymm9,%ymm9
+  196,65,52,89,200,                       //vmulps        %ymm8,%ymm9,%ymm9
+  196,65,116,92,201,                      //vsubps        %ymm9,%ymm1,%ymm9
+  196,99,125,25,193,1,                    //vextractf128  $0x1,%ymm8,%xmm1
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
+  196,65,57,254,194,                      //vpaddd        %xmm10,%xmm8,%xmm8
+  196,227,61,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
+  197,180,93,201,                         //vminps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
+  196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
+  196,65,124,92,209,                      //vsubps        %ymm9,%ymm0,%ymm10
+  196,193,58,88,192,                      //vaddss        %xmm8,%xmm8,%xmm0
+  196,227,121,4,192,0,                    //vpermilps     $0x0,%xmm0,%xmm0
+  196,227,125,24,192,1,                   //vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  197,44,94,192,                          //vdivps        %ymm0,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  197,188,89,192,                         //vmulps        %ymm0,%ymm8,%ymm0
+  197,172,92,192,                         //vsubps        %ymm0,%ymm10,%ymm0
+  196,193,124,92,193,                     //vsubps        %ymm9,%ymm0,%ymm0
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,192,                          //vsubps        %ymm0,%ymm8,%ymm8
+  197,60,84,192,                          //vandps        %ymm0,%ymm8,%ymm8
+  196,99,125,25,200,1,                    //vextractf128  $0x1,%ymm9,%xmm0
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,121,254,194,                    //vpaddd        %xmm10,%xmm0,%xmm0
+  196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
+  196,227,53,24,192,1,                    //vinsertf128   $0x1,%xmm0,%ymm9,%ymm0
+  197,188,93,192,                         //vminps        %ymm0,%ymm8,%ymm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,122,16,0,                           //vmovss        (%rax),%xmm8
+  196,65,121,112,200,0,                   //vpshufd       $0x0,%xmm8,%xmm9
+  196,67,53,24,201,1,                     //vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
+  196,65,116,92,209,                      //vsubps        %ymm9,%ymm1,%ymm10
+  196,193,58,88,200,                      //vaddss        %xmm8,%xmm8,%xmm1
+  196,227,121,4,201,0,                    //vpermilps     $0x0,%xmm1,%xmm1
+  196,227,117,24,201,1,                   //vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
+  197,44,94,193,                          //vdivps        %ymm1,%ymm10,%ymm8
+  196,67,125,8,192,1,                     //vroundps      $0x1,%ymm8,%ymm8
+  197,188,89,201,                         //vmulps        %ymm1,%ymm8,%ymm1
+  197,172,92,201,                         //vsubps        %ymm1,%ymm10,%ymm1
+  196,193,116,92,201,                     //vsubps        %ymm9,%ymm1,%ymm1
+  196,65,60,87,192,                       //vxorps        %ymm8,%ymm8,%ymm8
+  197,60,92,193,                          //vsubps        %ymm1,%ymm8,%ymm8
+  197,60,84,193,                          //vandps        %ymm1,%ymm8,%ymm8
+  196,99,125,25,201,1,                    //vextractf128  $0x1,%ymm9,%xmm1
+  196,65,41,118,210,                      //vpcmpeqd      %xmm10,%xmm10,%xmm10
+  196,193,113,254,202,                    //vpaddd        %xmm10,%xmm1,%xmm1
+  196,65,49,254,202,                      //vpaddd        %xmm10,%xmm9,%xmm9
+  196,227,53,24,201,1,                    //vinsertf128   $0x1,%xmm1,%ymm9,%ymm1
+  197,188,93,201,                         //vminps        %ymm1,%ymm8,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,8,                     //vbroadcastss  0x8(%rax),%ymm9
+  196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
+  197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
+  196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,12,                    //vbroadcastss  0xc(%rax),%ymm10
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  197,172,89,201,                         //vmulps        %ymm1,%ymm10,%ymm1
+  196,193,116,88,203,                     //vaddps        %ymm11,%ymm1,%ymm1
+  197,180,89,192,                         //vmulps        %ymm0,%ymm9,%ymm0
+  197,252,88,201,                         //vaddps        %ymm1,%ymm0,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
+  196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
+  196,98,125,24,88,36,                    //vbroadcastss  0x24(%rax),%ymm11
+  197,44,89,210,                          //vmulps        %ymm2,%ymm10,%ymm10
+  196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
+  197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
+  196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
+  196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
+  196,98,125,24,96,40,                    //vbroadcastss  0x28(%rax),%ymm12
+  197,36,89,218,                          //vmulps        %ymm2,%ymm11,%ymm11
+  196,65,36,88,220,                       //vaddps        %ymm12,%ymm11,%ymm11
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
+  197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
+  196,98,125,24,104,44,                   //vbroadcastss  0x2c(%rax),%ymm13
+  197,156,89,210,                         //vmulps        %ymm2,%ymm12,%ymm2
+  196,193,108,88,213,                     //vaddps        %ymm13,%ymm2,%ymm2
+  197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
+  197,244,88,202,                         //vaddps        %ymm2,%ymm1,%ymm1
+  197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
+  197,252,88,209,                         //vaddps        %ymm1,%ymm0,%ymm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  197,124,41,201,                         //vmovaps       %ymm9,%ymm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,98,125,24,0,                        //vbroadcastss  (%rax),%ymm8
+  196,98,125,24,72,4,                     //vbroadcastss  0x4(%rax),%ymm9
+  196,98,125,24,80,8,                     //vbroadcastss  0x8(%rax),%ymm10
+  197,52,89,201,                          //vmulps        %ymm1,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  197,60,89,192,                          //vmulps        %ymm0,%ymm8,%ymm8
+  196,65,60,88,193,                       //vaddps        %ymm9,%ymm8,%ymm8
+  196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
+  196,98,125,24,80,16,                    //vbroadcastss  0x10(%rax),%ymm10
+  196,98,125,24,88,20,                    //vbroadcastss  0x14(%rax),%ymm11
+  197,44,89,209,                          //vmulps        %ymm1,%ymm10,%ymm10
+  196,65,44,88,211,                       //vaddps        %ymm11,%ymm10,%ymm10
+  197,52,89,200,                          //vmulps        %ymm0,%ymm9,%ymm9
+  196,65,52,88,202,                       //vaddps        %ymm10,%ymm9,%ymm9
+  196,98,125,24,80,24,                    //vbroadcastss  0x18(%rax),%ymm10
+  196,98,125,24,88,28,                    //vbroadcastss  0x1c(%rax),%ymm11
+  196,98,125,24,96,32,                    //vbroadcastss  0x20(%rax),%ymm12
+  197,164,89,201,                         //vmulps        %ymm1,%ymm11,%ymm1
+  196,193,116,88,204,                     //vaddps        %ymm12,%ymm1,%ymm1
+  197,172,89,192,                         //vmulps        %ymm0,%ymm10,%ymm0
+  197,252,88,193,                         //vaddps        %ymm1,%ymm0,%ymm0
+  197,252,83,200,                         //vrcpps        %ymm0,%ymm1
+  197,188,89,193,                         //vmulps        %ymm1,%ymm8,%ymm0
+  197,180,89,201,                         //vmulps        %ymm1,%ymm9,%ymm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_avx[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  196,226,125,24,72,16,                   //vbroadcastss  0x10(%rax),%ymm1
+  196,226,125,24,16,                      //vbroadcastss  (%rax),%ymm2
+  197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
+  197,108,88,193,                         //vaddps        %ymm1,%ymm2,%ymm8
+  196,226,125,24,72,20,                   //vbroadcastss  0x14(%rax),%ymm1
+  196,226,125,24,80,4,                    //vbroadcastss  0x4(%rax),%ymm2
+  197,244,89,200,                         //vmulps        %ymm0,%ymm1,%ymm1
+  197,236,88,201,                         //vaddps        %ymm1,%ymm2,%ymm1
+  196,226,125,24,80,24,                   //vbroadcastss  0x18(%rax),%ymm2
+  196,226,125,24,88,8,                    //vbroadcastss  0x8(%rax),%ymm3
+  197,236,89,208,                         //vmulps        %ymm0,%ymm2,%ymm2
+  197,228,88,210,                         //vaddps        %ymm2,%ymm3,%ymm2
+  196,226,125,24,88,28,                   //vbroadcastss  0x1c(%rax),%ymm3
+  196,98,125,24,72,12,                    //vbroadcastss  0xc(%rax),%ymm9
+  197,228,89,192,                         //vmulps        %ymm0,%ymm3,%ymm0
+  197,180,88,216,                         //vaddps        %ymm0,%ymm9,%ymm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  197,124,41,192,                         //vmovaps       %ymm8,%ymm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_start_pipeline_sse41[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  86,                                     //push          %rsi
+  87,                                     //push          %rdi
+  83,                                     //push          %rbx
+  72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
+  68,15,41,188,36,144,0,0,0,              //movaps        %xmm15,0x90(%rsp)
+  68,15,41,180,36,128,0,0,0,              //movaps        %xmm14,0x80(%rsp)
+  68,15,41,108,36,112,                    //movaps        %xmm13,0x70(%rsp)
+  68,15,41,100,36,96,                     //movaps        %xmm12,0x60(%rsp)
+  68,15,41,92,36,80,                      //movaps        %xmm11,0x50(%rsp)
+  68,15,41,84,36,64,                      //movaps        %xmm10,0x40(%rsp)
+  68,15,41,76,36,48,                      //movaps        %xmm9,0x30(%rsp)
+  68,15,41,68,36,32,                      //movaps        %xmm8,0x20(%rsp)
+  15,41,124,36,16,                        //movaps        %xmm7,0x10(%rsp)
+  15,41,52,36,                            //movaps        %xmm6,(%rsp)
+  77,137,207,                             //mov           %r9,%r15
+  77,137,198,                             //mov           %r8,%r14
+  72,137,203,                             //mov           %rcx,%rbx
+  72,137,214,                             //mov           %rdx,%rsi
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,196,                             //mov           %rax,%r12
+  73,137,245,                             //mov           %rsi,%r13
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  76,57,248,                              //cmp           %r15,%rax
+  118,5,                                  //jbe           73 <_sk_start_pipeline_sse41+0x73>
+  72,137,216,                             //mov           %rbx,%rax
+  235,52,                                 //jmp           a7 <_sk_start_pipeline_sse41+0xa7>
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,238,                             //mov           %r13,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,212,                             //callq         *%r12
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  72,131,195,8,                           //add           $0x8,%rbx
+  76,57,251,                              //cmp           %r15,%rbx
+  72,137,195,                             //mov           %rax,%rbx
+  118,204,                                //jbe           73 <_sk_start_pipeline_sse41+0x73>
+  15,40,52,36,                            //movaps        (%rsp),%xmm6
+  15,40,124,36,16,                        //movaps        0x10(%rsp),%xmm7
+  68,15,40,68,36,32,                      //movaps        0x20(%rsp),%xmm8
+  68,15,40,76,36,48,                      //movaps        0x30(%rsp),%xmm9
+  68,15,40,84,36,64,                      //movaps        0x40(%rsp),%xmm10
+  68,15,40,92,36,80,                      //movaps        0x50(%rsp),%xmm11
+  68,15,40,100,36,96,                     //movaps        0x60(%rsp),%xmm12
+  68,15,40,108,36,112,                    //movaps        0x70(%rsp),%xmm13
+  68,15,40,180,36,128,0,0,0,              //movaps        0x80(%rsp),%xmm14
+  68,15,40,188,36,144,0,0,0,              //movaps        0x90(%rsp),%xmm15
+  72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
+  91,                                     //pop           %rbx
+  95,                                     //pop           %rdi
+  94,                                     //pop           %rsi
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_sse41[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  102,15,110,199,                         //movd          %edi,%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,18,                           //movss         (%rdx),%xmm2
+  243,15,16,90,4,                         //movss         0x4(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,16,66,20,                            //movups        0x14(%rdx),%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  102,15,110,8,                           //movd          (%rax),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,16,24,                               //movups        (%rax),%xmm3
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__sse41[] = {
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,195,                           //subps         %xmm3,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,204,                           //mulps         %xmm4,%xmm9
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,205,                           //mulps         %xmm5,%xmm9
+  65,15,88,201,                           //addps         %xmm9,%xmm1
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,206,                           //mulps         %xmm6,%xmm9
+  65,15,88,209,                           //addps         %xmm9,%xmm2
+  68,15,89,199,                           //mulps         %xmm7,%xmm8
+  65,15,88,216,                           //addps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,199,                           //subps         %xmm7,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_sse41[] = {
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  65,15,95,192,                           //maxps         %xmm8,%xmm0
+  65,15,95,200,                           //maxps         %xmm8,%xmm1
+  65,15,95,208,                           //maxps         %xmm8,%xmm2
+  65,15,95,216,                           //maxps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,192,                           //minps         %xmm8,%xmm0
+  65,15,93,200,                           //minps         %xmm8,%xmm1
+  65,15,93,208,                           //minps         %xmm8,%xmm2
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_sse41[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  15,93,195,                              //minps         %xmm3,%xmm0
+  15,93,203,                              //minps         %xmm3,%xmm1
+  15,93,211,                              //minps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_sse41[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,194,                              //movaps        %xmm2,%xmm0
+  65,15,40,208,                           //movaps        %xmm8,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_sse41[] = {
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  68,15,40,216,                           //movaps        %xmm0,%xmm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  65,15,40,227,                           //movaps        %xmm11,%xmm4
+  65,15,40,234,                           //movaps        %xmm10,%xmm5
+  65,15,40,241,                           //movaps        %xmm9,%xmm6
+  65,15,40,248,                           //movaps        %xmm8,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,224,                              //movaps        %xmm0,%xmm4
+  15,40,233,                              //movaps        %xmm1,%xmm5
+  15,40,242,                              //movaps        %xmm2,%xmm6
+  15,40,251,                              //movaps        %xmm3,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_sse41[] = {
+  15,89,195,                              //mulps         %xmm3,%xmm0
+  15,89,203,                              //mulps         %xmm3,%xmm1
+  15,89,211,                              //mulps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_sse41[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  69,15,87,201,                           //xorps         %xmm9,%xmm9
+  243,68,15,16,18,                        //movss         (%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  68,15,94,211,                           //divps         %xmm3,%xmm10
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  65,15,194,193,0,                        //cmpeqps       %xmm9,%xmm0
+  102,69,15,56,20,209,                    //blendvps      %xmm0,%xmm9,%xmm10
+  69,15,89,194,                           //mulps         %xmm10,%xmm8
+  65,15,89,202,                           //mulps         %xmm10,%xmm1
+  65,15,89,210,                           //mulps         %xmm10,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_sse41[] = {
+  68,15,40,194,                           //movaps        %xmm2,%xmm8
+  243,68,15,16,90,64,                     //movss         0x40(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,40,211,                           //movaps        %xmm11,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  68,15,40,240,                           //movaps        %xmm0,%xmm14
+  69,15,89,246,                           //mulps         %xmm14,%xmm14
+  243,15,16,82,60,                        //movss         0x3c(%rdx),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  243,68,15,16,98,52,                     //movss         0x34(%rdx),%xmm12
+  243,68,15,16,106,56,                    //movss         0x38(%rdx),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  69,15,88,205,                           //addps         %xmm13,%xmm9
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,89,206,                           //mulps         %xmm14,%xmm9
+  69,15,88,204,                           //addps         %xmm12,%xmm9
+  243,68,15,16,114,68,                    //movss         0x44(%rdx),%xmm14
+  69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
+  65,15,194,198,1,                        //cmpltps       %xmm14,%xmm0
+  102,69,15,56,20,202,                    //blendvps      %xmm0,%xmm10,%xmm9
+  69,15,40,251,                           //movaps        %xmm11,%xmm15
+  68,15,89,249,                           //mulps         %xmm1,%xmm15
+  15,40,193,                              //movaps        %xmm1,%xmm0
+  15,89,192,                              //mulps         %xmm0,%xmm0
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  69,15,88,213,                           //addps         %xmm13,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  69,15,88,212,                           //addps         %xmm12,%xmm10
+  65,15,194,206,1,                        //cmpltps       %xmm14,%xmm1
+  15,40,193,                              //movaps        %xmm1,%xmm0
+  102,69,15,56,20,215,                    //blendvps      %xmm0,%xmm15,%xmm10
+  69,15,89,216,                           //mulps         %xmm8,%xmm11
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  15,89,192,                              //mulps         %xmm0,%xmm0
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  65,15,88,213,                           //addps         %xmm13,%xmm2
+  15,89,208,                              //mulps         %xmm0,%xmm2
+  65,15,88,212,                           //addps         %xmm12,%xmm2
+  69,15,194,198,1,                        //cmpltps       %xmm14,%xmm8
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  102,65,15,56,20,211,                    //blendvps      %xmm0,%xmm11,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,193,                           //movaps        %xmm9,%xmm0
+  65,15,40,202,                           //movaps        %xmm10,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_sse41[] = {
+  72,131,236,24,                          //sub           $0x18,%rsp
+  15,41,60,36,                            //movaps        %xmm7,(%rsp)
+  15,40,254,                              //movaps        %xmm6,%xmm7
+  15,40,245,                              //movaps        %xmm5,%xmm6
+  15,40,236,                              //movaps        %xmm4,%xmm5
+  15,40,227,                              //movaps        %xmm3,%xmm4
+  68,15,40,194,                           //movaps        %xmm2,%xmm8
+  15,40,217,                              //movaps        %xmm1,%xmm3
+  15,82,208,                              //rsqrtps       %xmm0,%xmm2
+  68,15,83,202,                           //rcpps         %xmm2,%xmm9
+  68,15,82,210,                           //rsqrtps       %xmm2,%xmm10
+  243,15,16,18,                           //movss         (%rdx),%xmm2
+  243,68,15,16,90,72,                     //movss         0x48(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  65,15,40,203,                           //movaps        %xmm11,%xmm1
+  15,89,200,                              //mulps         %xmm0,%xmm1
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  243,68,15,16,98,76,                     //movss         0x4c(%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  243,68,15,16,106,80,                    //movss         0x50(%rdx),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  243,68,15,16,114,84,                    //movss         0x54(%rdx),%xmm14
+  69,15,198,246,0,                        //shufps        $0x0,%xmm14,%xmm14
+  69,15,89,205,                           //mulps         %xmm13,%xmm9
+  69,15,88,206,                           //addps         %xmm14,%xmm9
+  69,15,89,212,                           //mulps         %xmm12,%xmm10
+  69,15,88,209,                           //addps         %xmm9,%xmm10
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  69,15,93,202,                           //minps         %xmm10,%xmm9
+  243,68,15,16,122,88,                    //movss         0x58(%rdx),%xmm15
+  69,15,198,255,0,                        //shufps        $0x0,%xmm15,%xmm15
+  65,15,194,199,1,                        //cmpltps       %xmm15,%xmm0
+  102,68,15,56,20,201,                    //blendvps      %xmm0,%xmm1,%xmm9
+  15,82,195,                              //rsqrtps       %xmm3,%xmm0
+  15,83,200,                              //rcpps         %xmm0,%xmm1
+  15,82,192,                              //rsqrtps       %xmm0,%xmm0
+  65,15,89,205,                           //mulps         %xmm13,%xmm1
+  65,15,88,206,                           //addps         %xmm14,%xmm1
+  65,15,89,196,                           //mulps         %xmm12,%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,93,208,                           //minps         %xmm0,%xmm10
+  65,15,40,203,                           //movaps        %xmm11,%xmm1
+  15,89,203,                              //mulps         %xmm3,%xmm1
+  65,15,194,223,1,                        //cmpltps       %xmm15,%xmm3
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  102,68,15,56,20,209,                    //blendvps      %xmm0,%xmm1,%xmm10
+  65,15,82,192,                           //rsqrtps       %xmm8,%xmm0
+  15,83,200,                              //rcpps         %xmm0,%xmm1
+  65,15,89,205,                           //mulps         %xmm13,%xmm1
+  65,15,88,206,                           //addps         %xmm14,%xmm1
+  15,82,192,                              //rsqrtps       %xmm0,%xmm0
+  65,15,89,196,                           //mulps         %xmm12,%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  15,93,208,                              //minps         %xmm0,%xmm2
+  69,15,89,216,                           //mulps         %xmm8,%xmm11
+  69,15,194,199,1,                        //cmpltps       %xmm15,%xmm8
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  102,65,15,56,20,211,                    //blendvps      %xmm0,%xmm11,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,193,                           //movaps        %xmm9,%xmm0
+  65,15,40,202,                           //movaps        %xmm10,%xmm1
+  15,40,220,                              //movaps        %xmm4,%xmm3
+  15,40,229,                              //movaps        %xmm5,%xmm4
+  15,40,238,                              //movaps        %xmm6,%xmm5
+  15,40,247,                              //movaps        %xmm7,%xmm6
+  15,40,60,36,                            //movaps        (%rsp),%xmm7
+  72,131,196,24,                          //add           $0x18,%rsp
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_1_float_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,49,4,56,                   //pmovzxbd      (%rax,%rdi,1),%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_565_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,51,4,120,                  //pmovzxwd      (%rax,%rdi,2),%xmm8
+  102,15,110,90,104,                      //movd          0x68(%rdx),%xmm3
+  102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
+  102,65,15,219,216,                      //pand          %xmm8,%xmm3
+  68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,68,15,16,82,116,                    //movss         0x74(%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  102,68,15,110,74,108,                   //movd          0x6c(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,201,                           //cvtdq2ps      %xmm9,%xmm9
+  243,68,15,16,90,120,                    //movss         0x78(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  102,68,15,110,74,112,                   //movd          0x70(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,193,                           //cvtdq2ps      %xmm9,%xmm8
+  243,68,15,16,74,124,                    //movss         0x7c(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,194,                           //mulps         %xmm10,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,203,                           //mulps         %xmm11,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_tables_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,8,                               //mov           (%rax),%rcx
+  76,139,64,8,                            //mov           0x8(%rax),%r8
+  243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,65,15,111,200,                      //movdqa        %xmm8,%xmm1
+  102,15,114,209,8,                       //psrld         $0x8,%xmm1
+  102,15,219,200,                         //pand          %xmm0,%xmm1
+  102,65,15,111,208,                      //movdqa        %xmm8,%xmm2
+  102,15,114,210,16,                      //psrld         $0x10,%xmm2
+  102,15,219,208,                         //pand          %xmm0,%xmm2
+  102,65,15,219,192,                      //pand          %xmm8,%xmm0
+  102,72,15,58,22,193,1,                  //pextrq        $0x1,%xmm0,%rcx
+  65,137,201,                             //mov           %ecx,%r9d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,73,15,126,194,                      //movq          %xmm0,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
+  102,67,15,58,33,4,144,16,               //insertps      $0x10,(%r8,%r10,4),%xmm0
+  102,67,15,58,33,4,136,32,               //insertps      $0x20,(%r8,%r9,4),%xmm0
+  102,65,15,58,33,4,136,48,               //insertps      $0x30,(%r8,%rcx,4),%xmm0
+  72,139,72,16,                           //mov           0x10(%rax),%rcx
+  102,73,15,58,22,200,1,                  //pextrq        $0x1,%xmm1,%r8
+  69,137,193,                             //mov           %r8d,%r9d
+  73,193,232,32,                          //shr           $0x20,%r8
+  102,73,15,126,202,                      //movq          %xmm1,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,66,15,16,12,153,                    //movss         (%rcx,%r11,4),%xmm1
+  102,66,15,58,33,12,145,16,              //insertps      $0x10,(%rcx,%r10,4),%xmm1
+  243,66,15,16,28,137,                    //movss         (%rcx,%r9,4),%xmm3
+  102,15,58,33,203,32,                    //insertps      $0x20,%xmm3,%xmm1
+  243,66,15,16,28,129,                    //movss         (%rcx,%r8,4),%xmm3
+  102,15,58,33,203,48,                    //insertps      $0x30,%xmm3,%xmm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  102,72,15,58,22,209,1,                  //pextrq        $0x1,%xmm2,%rcx
+  65,137,200,                             //mov           %ecx,%r8d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,73,15,126,209,                      //movq          %xmm2,%r9
+  69,137,202,                             //mov           %r9d,%r10d
+  73,193,233,32,                          //shr           $0x20,%r9
+  243,66,15,16,20,144,                    //movss         (%rax,%r10,4),%xmm2
+  102,66,15,58,33,20,136,16,              //insertps      $0x10,(%rax,%r9,4),%xmm2
+  243,66,15,16,28,128,                    //movss         (%rax,%r8,4),%xmm3
+  102,15,58,33,211,32,                    //insertps      $0x20,%xmm3,%xmm2
+  243,15,16,28,136,                       //movss         (%rax,%rcx,4),%xmm3
+  102,15,58,33,211,48,                    //insertps      $0x30,%xmm3,%xmm2
+  102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_a8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,15,56,49,4,56,                      //pmovzxbd      (%rax,%rdi,1),%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_a8_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
+  102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
+  102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_565_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,56,51,12,120,                 //pmovzxwd      (%rax,%rdi,2),%xmm9
+  102,15,110,66,104,                      //movd          0x68(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,65,15,219,193,                      //pand          %xmm9,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,15,16,66,116,                       //movss         0x74(%rdx),%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  102,15,110,74,108,                      //movd          0x6c(%rdx),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  102,65,15,219,201,                      //pand          %xmm9,%xmm1
+  68,15,91,193,                           //cvtdq2ps      %xmm1,%xmm8
+  243,15,16,74,120,                       //movss         0x78(%rdx),%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  102,15,110,82,112,                      //movd          0x70(%rdx),%xmm2
+  102,15,112,210,0,                       //pshufd        $0x0,%xmm2,%xmm2
+  102,65,15,219,209,                      //pand          %xmm9,%xmm2
+  68,15,91,194,                           //cvtdq2ps      %xmm2,%xmm8
+  243,15,16,82,124,                       //movss         0x7c(%rdx),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_565_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,130,128,0,0,0,             //movss         0x80(%rdx),%xmm8
+  243,68,15,16,138,132,0,0,0,             //movss         0x84(%rdx),%xmm9
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,11,                   //pslld         $0xb,%xmm10
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,5,                    //pslld         $0x5,%xmm9
+  102,69,15,235,202,                      //por           %xmm10,%xmm9
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,69,15,86,193,                       //orpd          %xmm9,%xmm8
+  102,69,15,56,43,192,                    //packusdw      %xmm8,%xmm8
+  102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_8888_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,15,114,209,8,                       //psrld         $0x8,%xmm1
+  102,15,219,200,                         //pand          %xmm0,%xmm1
+  102,15,111,211,                         //movdqa        %xmm3,%xmm2
+  102,15,114,210,16,                      //psrld         $0x10,%xmm2
+  102,15,219,208,                         //pand          %xmm0,%xmm2
+  102,15,219,195,                         //pand          %xmm3,%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,68,15,16,66,12,                     //movss         0xc(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  102,15,114,211,24,                      //psrld         $0x18,%xmm3
+  15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_8888_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,202,                           //mulps         %xmm2,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
+  102,69,15,235,193,                      //por           %xmm9,%xmm8
+  102,69,15,235,194,                      //por           %xmm10,%xmm8
+  243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_f16_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
+  243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
+  102,15,111,208,                         //movdqa        %xmm0,%xmm2
+  102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
+  102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
+  102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
+  102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
+  102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
+  102,15,110,66,100,                      //movd          0x64(%rdx),%xmm0
+  102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
+  102,65,15,223,200,                      //pandn         %xmm8,%xmm1
+  102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
+  102,15,223,218,                         //pandn         %xmm2,%xmm3
+  102,15,56,51,193,                       //pmovzxwd      %xmm1,%xmm0
+  102,15,114,240,13,                      //pslld         $0xd,%xmm0
+  102,15,110,82,92,                       //movd          0x5c(%rdx),%xmm2
+  102,68,15,112,194,0,                    //pshufd        $0x0,%xmm2,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  102,69,15,239,201,                      //pxor          %xmm9,%xmm9
+  102,65,15,105,201,                      //punpckhwd     %xmm9,%xmm1
+  102,15,114,241,13,                      //pslld         $0xd,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  102,15,56,51,211,                       //pmovzxwd      %xmm3,%xmm2
+  102,15,114,242,13,                      //pslld         $0xd,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  102,65,15,105,217,                      //punpckhwd     %xmm9,%xmm3
+  102,15,114,243,13,                      //pslld         $0xd,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f16_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,66,96,                    //movd          0x60(%rdx),%xmm8
+  102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
+  102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
+  102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
+  102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
+  102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
+  102,69,15,235,195,                      //por           %xmm11,%xmm8
+  102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
+  102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
+  243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
+  102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
+  243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f32_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,137,249,                             //mov           %rdi,%rcx
+  72,193,225,4,                           //shl           $0x4,%rcx
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  68,15,20,201,                           //unpcklps      %xmm1,%xmm9
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,40,218,                           //movaps        %xmm2,%xmm11
+  68,15,20,219,                           //unpcklps      %xmm3,%xmm11
+  68,15,21,193,                           //unpckhps      %xmm1,%xmm8
+  68,15,21,211,                           //unpckhps      %xmm3,%xmm10
+  69,15,40,225,                           //movaps        %xmm9,%xmm12
+  102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
+  102,69,15,21,203,                       //unpckhpd      %xmm11,%xmm9
+  69,15,40,216,                           //movaps        %xmm8,%xmm11
+  102,69,15,20,218,                       //unpcklpd      %xmm10,%xmm11
+  102,69,15,21,194,                       //unpckhpd      %xmm10,%xmm8
+  102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
+  102,68,15,17,76,8,16,                   //movupd        %xmm9,0x10(%rax,%rcx,1)
+  102,68,15,17,92,8,32,                   //movupd        %xmm11,0x20(%rax,%rcx,1)
+  102,68,15,17,68,8,48,                   //movupd        %xmm8,0x30(%rax,%rcx,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_x_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,192,                           //maxps         %xmm0,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
+  102,65,15,254,193,                      //paddd         %xmm9,%xmm0
+  68,15,93,192,                           //minps         %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,193,                           //maxps         %xmm1,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
+  102,65,15,254,201,                      //paddd         %xmm9,%xmm1
+  68,15,93,193,                           //minps         %xmm1,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,200,                           //movaps        %xmm8,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,92,193,                           //subps         %xmm9,%xmm0
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,193,                           //minps         %xmm9,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  102,69,15,58,8,201,1,                   //roundps       $0x1,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,92,201,                           //subps         %xmm9,%xmm1
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,201,                           //minps         %xmm9,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  65,15,92,193,                           //subps         %xmm9,%xmm0
+  243,69,15,88,192,                       //addss         %xmm8,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,208,                           //movaps        %xmm0,%xmm10
+  69,15,94,208,                           //divps         %xmm8,%xmm10
+  102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,194,                           //subps         %xmm10,%xmm0
+  65,15,92,193,                           //subps         %xmm9,%xmm0
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,92,192,                           //subps         %xmm0,%xmm8
+  65,15,84,192,                           //andps         %xmm8,%xmm0
+  102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
+  102,69,15,254,193,                      //paddd         %xmm9,%xmm8
+  65,15,93,192,                           //minps         %xmm8,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  65,15,92,201,                           //subps         %xmm9,%xmm1
+  243,69,15,88,192,                       //addss         %xmm8,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  69,15,94,208,                           //divps         %xmm8,%xmm10
+  102,69,15,58,8,210,1,                   //roundps       $0x1,%xmm10,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,202,                           //subps         %xmm10,%xmm1
+  65,15,92,201,                           //subps         %xmm9,%xmm1
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,92,193,                           //subps         %xmm1,%xmm8
+  65,15,84,200,                           //andps         %xmm8,%xmm1
+  102,69,15,118,192,                      //pcmpeqd       %xmm8,%xmm8
+  102,69,15,254,193,                      //paddd         %xmm9,%xmm8
+  65,15,93,200,                           //minps         %xmm8,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_sse41[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_sse41[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  68,15,89,226,                           //mulps         %xmm2,%xmm12
+  69,15,88,229,                           //addps         %xmm13,%xmm12
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,210,                           //movaps        %xmm10,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_sse41[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,217,                           //mulps         %xmm1,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,83,202,                           //rcpps         %xmm10,%xmm1
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_sse41[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  68,15,16,8,                             //movups        (%rax),%xmm9
+  15,16,88,16,                            //movups        0x10(%rax),%xmm3
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  68,15,89,192,                           //mulps         %xmm0,%xmm8
+  68,15,88,193,                           //addps         %xmm1,%xmm8
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  65,15,40,209,                           //movaps        %xmm9,%xmm2
+  15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
+  15,89,200,                              //mulps         %xmm0,%xmm1
+  15,88,202,                              //addps         %xmm2,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  69,15,40,209,                           //movaps        %xmm9,%xmm10
+  69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
+  15,89,208,                              //mulps         %xmm0,%xmm2
+  65,15,88,210,                           //addps         %xmm10,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  65,15,88,217,                           //addps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_start_pipeline_sse2[] = {
+  65,87,                                  //push          %r15
+  65,86,                                  //push          %r14
+  65,85,                                  //push          %r13
+  65,84,                                  //push          %r12
+  86,                                     //push          %rsi
+  87,                                     //push          %rdi
+  83,                                     //push          %rbx
+  72,129,236,160,0,0,0,                   //sub           $0xa0,%rsp
+  68,15,41,188,36,144,0,0,0,              //movaps        %xmm15,0x90(%rsp)
+  68,15,41,180,36,128,0,0,0,              //movaps        %xmm14,0x80(%rsp)
+  68,15,41,108,36,112,                    //movaps        %xmm13,0x70(%rsp)
+  68,15,41,100,36,96,                     //movaps        %xmm12,0x60(%rsp)
+  68,15,41,92,36,80,                      //movaps        %xmm11,0x50(%rsp)
+  68,15,41,84,36,64,                      //movaps        %xmm10,0x40(%rsp)
+  68,15,41,76,36,48,                      //movaps        %xmm9,0x30(%rsp)
+  68,15,41,68,36,32,                      //movaps        %xmm8,0x20(%rsp)
+  15,41,124,36,16,                        //movaps        %xmm7,0x10(%rsp)
+  15,41,52,36,                            //movaps        %xmm6,(%rsp)
+  77,137,207,                             //mov           %r9,%r15
+  77,137,198,                             //mov           %r8,%r14
+  72,137,203,                             //mov           %rcx,%rbx
+  72,137,214,                             //mov           %rdx,%rsi
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  73,137,196,                             //mov           %rax,%r12
+  73,137,245,                             //mov           %rsi,%r13
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  76,57,248,                              //cmp           %r15,%rax
+  118,5,                                  //jbe           73 <_sk_start_pipeline_sse2+0x73>
+  72,137,216,                             //mov           %rbx,%rax
+  235,52,                                 //jmp           a7 <_sk_start_pipeline_sse2+0xa7>
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  72,137,223,                             //mov           %rbx,%rdi
+  76,137,238,                             //mov           %r13,%rsi
+  76,137,242,                             //mov           %r14,%rdx
+  65,255,212,                             //callq         *%r12
+  72,141,67,4,                            //lea           0x4(%rbx),%rax
+  72,131,195,8,                           //add           $0x8,%rbx
+  76,57,251,                              //cmp           %r15,%rbx
+  72,137,195,                             //mov           %rax,%rbx
+  118,204,                                //jbe           73 <_sk_start_pipeline_sse2+0x73>
+  15,40,52,36,                            //movaps        (%rsp),%xmm6
+  15,40,124,36,16,                        //movaps        0x10(%rsp),%xmm7
+  68,15,40,68,36,32,                      //movaps        0x20(%rsp),%xmm8
+  68,15,40,76,36,48,                      //movaps        0x30(%rsp),%xmm9
+  68,15,40,84,36,64,                      //movaps        0x40(%rsp),%xmm10
+  68,15,40,92,36,80,                      //movaps        0x50(%rsp),%xmm11
+  68,15,40,100,36,96,                     //movaps        0x60(%rsp),%xmm12
+  68,15,40,108,36,112,                    //movaps        0x70(%rsp),%xmm13
+  68,15,40,180,36,128,0,0,0,              //movaps        0x80(%rsp),%xmm14
+  68,15,40,188,36,144,0,0,0,              //movaps        0x90(%rsp),%xmm15
+  72,129,196,160,0,0,0,                   //add           $0xa0,%rsp
+  91,                                     //pop           %rbx
+  95,                                     //pop           %rdi
+  94,                                     //pop           %rsi
+  65,92,                                  //pop           %r12
+  65,93,                                  //pop           %r13
+  65,94,                                  //pop           %r14
+  65,95,                                  //pop           %r15
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_just_return_sse2[] = {
+  195,                                    //retq
+};
+
+CODE const uint8_t sk_seed_shader_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  102,15,110,199,                         //movd          %edi,%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,18,                           //movss         (%rdx),%xmm2
+  243,15,16,90,4,                         //movss         0x4(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,16,66,20,                            //movups        0x14(%rdx),%xmm0
+  15,88,193,                              //addps         %xmm1,%xmm0
+  102,15,110,8,                           //movd          (%rax),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  15,88,203,                              //addps         %xmm3,%xmm1
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  15,87,228,                              //xorps         %xmm4,%xmm4
+  15,87,237,                              //xorps         %xmm5,%xmm5
+  15,87,246,                              //xorps         %xmm6,%xmm6
+  15,87,255,                              //xorps         %xmm7,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_constant_color_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,16,24,                               //movups        (%rax),%xmm3
+  15,40,195,                              //movaps        %xmm3,%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clear_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  15,87,201,                              //xorps         %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  15,87,219,                              //xorps         %xmm3,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_plus__sse2[] = {
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_srcover_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,195,                           //subps         %xmm3,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,204,                           //mulps         %xmm4,%xmm9
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,205,                           //mulps         %xmm5,%xmm9
+  65,15,88,201,                           //addps         %xmm9,%xmm1
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,206,                           //mulps         %xmm6,%xmm9
+  65,15,88,209,                           //addps         %xmm9,%xmm2
+  68,15,89,199,                           //mulps         %xmm7,%xmm8
+  65,15,88,216,                           //addps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_dstover_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,92,199,                           //subps         %xmm7,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_0_sse2[] = {
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  65,15,95,192,                           //maxps         %xmm8,%xmm0
+  65,15,95,200,                           //maxps         %xmm8,%xmm1
+  65,15,95,208,                           //maxps         %xmm8,%xmm2
+  65,15,95,216,                           //maxps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_1_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,192,                           //minps         %xmm8,%xmm0
+  65,15,93,200,                           //minps         %xmm8,%xmm1
+  65,15,93,208,                           //minps         %xmm8,%xmm2
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_a_sse2[] = {
+  243,68,15,16,2,                         //movss         (%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,93,216,                           //minps         %xmm8,%xmm3
+  15,93,195,                              //minps         %xmm3,%xmm0
+  15,93,203,                              //minps         %xmm3,%xmm1
+  15,93,211,                              //minps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_set_rgb_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,15,16,80,8,                         //movss         0x8(%rax),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_rb_sse2[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,194,                              //movaps        %xmm2,%xmm0
+  65,15,40,208,                           //movaps        %xmm8,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_swap_sse2[] = {
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  68,15,40,202,                           //movaps        %xmm2,%xmm9
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  68,15,40,216,                           //movaps        %xmm0,%xmm11
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  65,15,40,227,                           //movaps        %xmm11,%xmm4
+  65,15,40,234,                           //movaps        %xmm10,%xmm5
+  65,15,40,241,                           //movaps        %xmm9,%xmm6
+  65,15,40,248,                           //movaps        %xmm8,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_src_dst_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,224,                              //movaps        %xmm0,%xmm4
+  15,40,233,                              //movaps        %xmm1,%xmm5
+  15,40,242,                              //movaps        %xmm2,%xmm6
+  15,40,251,                              //movaps        %xmm3,%xmm7
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_move_dst_src_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,196,                              //movaps        %xmm4,%xmm0
+  15,40,205,                              //movaps        %xmm5,%xmm1
+  15,40,214,                              //movaps        %xmm6,%xmm2
+  15,40,223,                              //movaps        %xmm7,%xmm3
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_premul_sse2[] = {
+  15,89,195,                              //mulps         %xmm3,%xmm0
+  15,89,203,                              //mulps         %xmm3,%xmm1
+  15,89,211,                              //mulps         %xmm3,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_unpremul_sse2[] = {
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,194,195,0,                        //cmpeqps       %xmm3,%xmm8
+  243,68,15,16,10,                        //movss         (%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,94,203,                           //divps         %xmm3,%xmm9
+  69,15,85,193,                           //andnps        %xmm9,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_from_srgb_sse2[] = {
+  243,68,15,16,66,64,                     //movss         0x40(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,232,                           //movaps        %xmm8,%xmm13
+  68,15,89,232,                           //mulps         %xmm0,%xmm13
+  68,15,40,224,                           //movaps        %xmm0,%xmm12
+  69,15,89,228,                           //mulps         %xmm12,%xmm12
+  243,68,15,16,74,60,                     //movss         0x3c(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,82,52,                     //movss         0x34(%rdx),%xmm10
+  243,68,15,16,90,56,                     //movss         0x38(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,40,241,                           //movaps        %xmm9,%xmm14
+  68,15,89,240,                           //mulps         %xmm0,%xmm14
+  69,15,88,243,                           //addps         %xmm11,%xmm14
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  69,15,89,244,                           //mulps         %xmm12,%xmm14
+  69,15,88,242,                           //addps         %xmm10,%xmm14
+  243,68,15,16,98,68,                     //movss         0x44(%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  65,15,194,196,1,                        //cmpltps       %xmm12,%xmm0
+  68,15,84,232,                           //andps         %xmm0,%xmm13
+  65,15,85,198,                           //andnps        %xmm14,%xmm0
+  65,15,86,197,                           //orps          %xmm13,%xmm0
+  69,15,40,232,                           //movaps        %xmm8,%xmm13
+  68,15,89,233,                           //mulps         %xmm1,%xmm13
+  68,15,40,241,                           //movaps        %xmm1,%xmm14
+  69,15,89,246,                           //mulps         %xmm14,%xmm14
+  69,15,40,249,                           //movaps        %xmm9,%xmm15
+  68,15,89,249,                           //mulps         %xmm1,%xmm15
+  69,15,88,251,                           //addps         %xmm11,%xmm15
+  69,15,89,254,                           //mulps         %xmm14,%xmm15
+  69,15,88,250,                           //addps         %xmm10,%xmm15
+  65,15,194,204,1,                        //cmpltps       %xmm12,%xmm1
+  68,15,84,233,                           //andps         %xmm1,%xmm13
+  65,15,85,207,                           //andnps        %xmm15,%xmm1
+  65,15,86,205,                           //orps          %xmm13,%xmm1
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  68,15,40,234,                           //movaps        %xmm2,%xmm13
+  69,15,89,237,                           //mulps         %xmm13,%xmm13
+  68,15,89,202,                           //mulps         %xmm2,%xmm9
+  69,15,88,203,                           //addps         %xmm11,%xmm9
+  69,15,89,205,                           //mulps         %xmm13,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  65,15,194,212,1,                        //cmpltps       %xmm12,%xmm2
+  68,15,84,194,                           //andps         %xmm2,%xmm8
+  65,15,85,209,                           //andnps        %xmm9,%xmm2
+  65,15,86,208,                           //orps          %xmm8,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_to_srgb_sse2[] = {
+  72,131,236,40,                          //sub           $0x28,%rsp
+  15,41,124,36,16,                        //movaps        %xmm7,0x10(%rsp)
+  15,41,52,36,                            //movaps        %xmm6,(%rsp)
+  15,40,245,                              //movaps        %xmm5,%xmm6
+  15,40,236,                              //movaps        %xmm4,%xmm5
+  15,40,227,                              //movaps        %xmm3,%xmm4
+  68,15,82,192,                           //rsqrtps       %xmm0,%xmm8
+  69,15,83,232,                           //rcpps         %xmm8,%xmm13
+  69,15,82,248,                           //rsqrtps       %xmm8,%xmm15
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,68,15,16,66,72,                     //movss         0x48(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,240,                           //movaps        %xmm8,%xmm14
+  68,15,89,240,                           //mulps         %xmm0,%xmm14
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  243,68,15,16,82,76,                     //movss         0x4c(%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,90,80,                     //movss         0x50(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,98,84,                     //movss         0x54(%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,89,235,                           //mulps         %xmm11,%xmm13
+  69,15,88,236,                           //addps         %xmm12,%xmm13
+  69,15,89,250,                           //mulps         %xmm10,%xmm15
+  69,15,88,253,                           //addps         %xmm13,%xmm15
+  68,15,40,203,                           //movaps        %xmm3,%xmm9
+  69,15,93,207,                           //minps         %xmm15,%xmm9
+  243,68,15,16,106,88,                    //movss         0x58(%rdx),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  65,15,194,197,1,                        //cmpltps       %xmm13,%xmm0
+  68,15,84,240,                           //andps         %xmm0,%xmm14
+  65,15,85,193,                           //andnps        %xmm9,%xmm0
+  65,15,86,198,                           //orps          %xmm14,%xmm0
+  68,15,82,201,                           //rsqrtps       %xmm1,%xmm9
+  69,15,83,241,                           //rcpps         %xmm9,%xmm14
+  69,15,82,201,                           //rsqrtps       %xmm9,%xmm9
+  69,15,89,243,                           //mulps         %xmm11,%xmm14
+  69,15,88,244,                           //addps         %xmm12,%xmm14
+  69,15,89,202,                           //mulps         %xmm10,%xmm9
+  69,15,88,206,                           //addps         %xmm14,%xmm9
+  68,15,40,243,                           //movaps        %xmm3,%xmm14
+  69,15,93,241,                           //minps         %xmm9,%xmm14
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  65,15,194,205,1,                        //cmpltps       %xmm13,%xmm1
+  68,15,84,201,                           //andps         %xmm1,%xmm9
+  65,15,85,206,                           //andnps        %xmm14,%xmm1
+  65,15,86,201,                           //orps          %xmm9,%xmm1
+  68,15,82,202,                           //rsqrtps       %xmm2,%xmm9
+  69,15,83,241,                           //rcpps         %xmm9,%xmm14
+  69,15,89,243,                           //mulps         %xmm11,%xmm14
+  69,15,88,244,                           //addps         %xmm12,%xmm14
+  65,15,82,249,                           //rsqrtps       %xmm9,%xmm7
+  65,15,89,250,                           //mulps         %xmm10,%xmm7
+  65,15,88,254,                           //addps         %xmm14,%xmm7
+  15,93,223,                              //minps         %xmm7,%xmm3
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  65,15,194,213,1,                        //cmpltps       %xmm13,%xmm2
+  68,15,84,194,                           //andps         %xmm2,%xmm8
+  15,85,211,                              //andnps        %xmm3,%xmm2
+  65,15,86,208,                           //orps          %xmm8,%xmm2
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,40,220,                              //movaps        %xmm4,%xmm3
+  15,40,229,                              //movaps        %xmm5,%xmm4
+  15,40,238,                              //movaps        %xmm6,%xmm5
+  15,40,52,36,                            //movaps        (%rsp),%xmm6
+  15,40,124,36,16,                        //movaps        0x10(%rsp),%xmm7
+  72,131,196,40,                          //add           $0x28,%rsp
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_1_float_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_scale_u8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
+  102,69,15,239,201,                      //pxor          %xmm9,%xmm9
+  102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
+  102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_1_float_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_u8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,4,56,                     //movd          (%rax,%rdi,1),%xmm8
+  102,69,15,239,201,                      //pxor          %xmm9,%xmm9
+  102,69,15,96,193,                       //punpcklbw     %xmm9,%xmm8
+  102,69,15,97,193,                       //punpcklwd     %xmm9,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,68,15,16,74,12,                     //movss         0xc(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,92,223,                              //subps         %xmm7,%xmm3
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  15,88,223,                              //addps         %xmm7,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_lerp_565_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,126,4,120,                    //movq          (%rax,%rdi,2),%xmm8
+  102,15,239,219,                         //pxor          %xmm3,%xmm3
+  102,68,15,97,195,                       //punpcklwd     %xmm3,%xmm8
+  102,15,110,90,104,                      //movd          0x68(%rdx),%xmm3
+  102,15,112,219,0,                       //pshufd        $0x0,%xmm3,%xmm3
+  102,65,15,219,216,                      //pand          %xmm8,%xmm3
+  68,15,91,203,                           //cvtdq2ps      %xmm3,%xmm9
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,68,15,16,82,116,                    //movss         0x74(%rdx),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  102,68,15,110,74,108,                   //movd          0x6c(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,201,                           //cvtdq2ps      %xmm9,%xmm9
+  243,68,15,16,90,120,                    //movss         0x78(%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  102,68,15,110,74,112,                   //movd          0x70(%rdx),%xmm9
+  102,69,15,112,201,0,                    //pshufd        $0x0,%xmm9,%xmm9
+  102,69,15,219,200,                      //pand          %xmm8,%xmm9
+  69,15,91,193,                           //cvtdq2ps      %xmm9,%xmm8
+  243,68,15,16,74,124,                    //movss         0x7c(%rdx),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  15,92,196,                              //subps         %xmm4,%xmm0
+  65,15,89,194,                           //mulps         %xmm10,%xmm0
+  15,88,196,                              //addps         %xmm4,%xmm0
+  15,92,205,                              //subps         %xmm5,%xmm1
+  65,15,89,203,                           //mulps         %xmm11,%xmm1
+  15,88,205,                              //addps         %xmm5,%xmm1
+  15,92,214,                              //subps         %xmm6,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  15,88,214,                              //addps         %xmm6,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_tables_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,8,                               //mov           (%rax),%rcx
+  76,139,64,8,                            //mov           0x8(%rax),%r8
+  243,68,15,111,4,185,                    //movdqu        (%rcx,%rdi,4),%xmm8
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
+  102,65,15,114,209,8,                    //psrld         $0x8,%xmm9
+  102,68,15,219,200,                      //pand          %xmm0,%xmm9
+  102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
+  102,65,15,114,210,16,                   //psrld         $0x10,%xmm10
+  102,68,15,219,208,                      //pand          %xmm0,%xmm10
+  102,65,15,219,192,                      //pand          %xmm8,%xmm0
+  102,15,112,216,78,                      //pshufd        $0x4e,%xmm0,%xmm3
+  102,72,15,126,217,                      //movq          %xmm3,%rcx
+  65,137,201,                             //mov           %ecx,%r9d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,73,15,126,194,                      //movq          %xmm0,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,67,15,16,28,144,                    //movss         (%r8,%r10,4),%xmm3
+  243,65,15,16,4,136,                     //movss         (%r8,%rcx,4),%xmm0
+  15,20,216,                              //unpcklps      %xmm0,%xmm3
+  243,67,15,16,4,152,                     //movss         (%r8,%r11,4),%xmm0
+  243,67,15,16,12,136,                    //movss         (%r8,%r9,4),%xmm1
+  15,20,193,                              //unpcklps      %xmm1,%xmm0
+  15,20,195,                              //unpcklps      %xmm3,%xmm0
+  72,139,72,16,                           //mov           0x10(%rax),%rcx
+  102,65,15,112,201,78,                   //pshufd        $0x4e,%xmm9,%xmm1
+  102,73,15,126,200,                      //movq          %xmm1,%r8
+  69,137,193,                             //mov           %r8d,%r9d
+  73,193,232,32,                          //shr           $0x20,%r8
+  102,77,15,126,202,                      //movq          %xmm9,%r10
+  69,137,211,                             //mov           %r10d,%r11d
+  73,193,234,32,                          //shr           $0x20,%r10
+  243,66,15,16,28,145,                    //movss         (%rcx,%r10,4),%xmm3
+  243,66,15,16,12,129,                    //movss         (%rcx,%r8,4),%xmm1
+  15,20,217,                              //unpcklps      %xmm1,%xmm3
+  243,66,15,16,12,153,                    //movss         (%rcx,%r11,4),%xmm1
+  243,66,15,16,20,137,                    //movss         (%rcx,%r9,4),%xmm2
+  15,20,202,                              //unpcklps      %xmm2,%xmm1
+  15,20,203,                              //unpcklps      %xmm3,%xmm1
+  72,139,64,24,                           //mov           0x18(%rax),%rax
+  102,65,15,112,210,78,                   //pshufd        $0x4e,%xmm10,%xmm2
+  102,72,15,126,209,                      //movq          %xmm2,%rcx
+  65,137,200,                             //mov           %ecx,%r8d
+  72,193,233,32,                          //shr           $0x20,%rcx
+  102,77,15,126,209,                      //movq          %xmm10,%r9
+  69,137,202,                             //mov           %r9d,%r10d
+  73,193,233,32,                          //shr           $0x20,%r9
+  243,70,15,16,12,136,                    //movss         (%rax,%r9,4),%xmm9
+  243,15,16,20,136,                       //movss         (%rax,%rcx,4),%xmm2
+  68,15,20,202,                           //unpcklps      %xmm2,%xmm9
+  243,66,15,16,20,144,                    //movss         (%rax,%r10,4),%xmm2
+  243,66,15,16,28,128,                    //movss         (%rax,%r8,4),%xmm3
+  15,20,211,                              //unpcklps      %xmm3,%xmm2
+  65,15,20,209,                           //unpcklps      %xmm9,%xmm2
+  102,65,15,114,208,24,                   //psrld         $0x18,%xmm8
+  69,15,91,192,                           //cvtdq2ps      %xmm8,%xmm8
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_a8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,15,110,4,56,                        //movd          (%rax,%rdi,1),%xmm0
+  102,15,239,201,                         //pxor          %xmm1,%xmm1
+  102,15,96,193,                          //punpcklbw     %xmm1,%xmm0
+  102,15,97,193,                          //punpcklwd     %xmm1,%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,15,16,90,12,                        //movss         0xc(%rdx),%xmm3
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  15,87,192,                              //xorps         %xmm0,%xmm0
+  102,15,239,201,                         //pxor          %xmm1,%xmm1
+  15,87,210,                              //xorps         %xmm2,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_a8_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
+  102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
+  102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
+  102,69,15,103,192,                      //packuswb      %xmm8,%xmm8
+  102,68,15,126,4,56,                     //movd          %xmm8,(%rax,%rdi,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_565_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,126,12,120,                   //movq          (%rax,%rdi,2),%xmm9
+  102,15,239,192,                         //pxor          %xmm0,%xmm0
+  102,68,15,97,200,                       //punpcklwd     %xmm0,%xmm9
+  102,15,110,66,104,                      //movd          0x68(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,65,15,219,193,                      //pand          %xmm9,%xmm0
+  15,91,200,                              //cvtdq2ps      %xmm0,%xmm1
+  243,15,16,26,                           //movss         (%rdx),%xmm3
+  243,15,16,66,116,                       //movss         0x74(%rdx),%xmm0
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  102,15,110,74,108,                      //movd          0x6c(%rdx),%xmm1
+  102,15,112,201,0,                       //pshufd        $0x0,%xmm1,%xmm1
+  102,65,15,219,201,                      //pand          %xmm9,%xmm1
+  68,15,91,193,                           //cvtdq2ps      %xmm1,%xmm8
+  243,15,16,74,120,                       //movss         0x78(%rdx),%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  102,15,110,82,112,                      //movd          0x70(%rdx),%xmm2
+  102,15,112,210,0,                       //pshufd        $0x0,%xmm2,%xmm2
+  102,65,15,219,209,                      //pand          %xmm9,%xmm2
+  68,15,91,194,                           //cvtdq2ps      %xmm2,%xmm8
+  243,15,16,82,124,                       //movss         0x7c(%rdx),%xmm2
+  15,198,210,0,                           //shufps        $0x0,%xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  15,198,219,0,                           //shufps        $0x0,%xmm3,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_565_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,130,128,0,0,0,             //movss         0x80(%rdx),%xmm8
+  243,68,15,16,138,132,0,0,0,             //movss         0x84(%rdx),%xmm9
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,208,                           //mulps         %xmm0,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,11,                   //pslld         $0xb,%xmm10
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,5,                    //pslld         $0x5,%xmm9
+  102,69,15,235,202,                      //por           %xmm10,%xmm9
+  68,15,89,194,                           //mulps         %xmm2,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,69,15,86,193,                       //orpd          %xmm9,%xmm8
+  102,65,15,114,240,16,                   //pslld         $0x10,%xmm8
+  102,65,15,114,224,16,                   //psrad         $0x10,%xmm8
+  102,69,15,107,192,                      //packssdw      %xmm8,%xmm8
+  102,68,15,214,4,120,                    //movq          %xmm8,(%rax,%rdi,2)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_8888_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,28,184,                      //movdqu        (%rax,%rdi,4),%xmm3
+  102,15,110,66,16,                       //movd          0x10(%rdx),%xmm0
+  102,15,112,192,0,                       //pshufd        $0x0,%xmm0,%xmm0
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,15,114,209,8,                       //psrld         $0x8,%xmm1
+  102,15,219,200,                         //pand          %xmm0,%xmm1
+  102,15,111,211,                         //movdqa        %xmm3,%xmm2
+  102,15,114,210,16,                      //psrld         $0x10,%xmm2
+  102,15,219,208,                         //pand          %xmm0,%xmm2
+  102,15,219,195,                         //pand          %xmm3,%xmm0
+  15,91,192,                              //cvtdq2ps      %xmm0,%xmm0
+  243,68,15,16,66,12,                     //movss         0xc(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  15,91,201,                              //cvtdq2ps      %xmm1,%xmm1
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  15,91,210,                              //cvtdq2ps      %xmm2,%xmm2
+  65,15,89,208,                           //mulps         %xmm8,%xmm2
+  102,15,114,211,24,                      //psrld         $0x18,%xmm3
+  15,91,219,                              //cvtdq2ps      %xmm3,%xmm3
+  65,15,89,216,                           //mulps         %xmm8,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_8888_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,68,15,16,66,8,                      //movss         0x8(%rdx),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  69,15,40,208,                           //movaps        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,69,15,91,210,                       //cvtps2dq      %xmm10,%xmm10
+  102,65,15,114,242,8,                    //pslld         $0x8,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  69,15,40,200,                           //movaps        %xmm8,%xmm9
+  68,15,89,202,                           //mulps         %xmm2,%xmm9
+  102,69,15,91,201,                       //cvtps2dq      %xmm9,%xmm9
+  102,65,15,114,241,16,                   //pslld         $0x10,%xmm9
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,69,15,91,192,                       //cvtps2dq      %xmm8,%xmm8
+  102,65,15,114,240,24,                   //pslld         $0x18,%xmm8
+  102,69,15,235,193,                      //por           %xmm9,%xmm8
+  102,69,15,235,194,                      //por           %xmm10,%xmm8
+  243,68,15,127,4,184,                    //movdqu        %xmm8,(%rax,%rdi,4)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_load_f16_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  243,15,111,4,248,                       //movdqu        (%rax,%rdi,8),%xmm0
+  243,15,111,76,248,16,                   //movdqu        0x10(%rax,%rdi,8),%xmm1
+  102,15,111,208,                         //movdqa        %xmm0,%xmm2
+  102,15,97,209,                          //punpcklwd     %xmm1,%xmm2
+  102,15,105,193,                         //punpckhwd     %xmm1,%xmm0
+  102,68,15,111,194,                      //movdqa        %xmm2,%xmm8
+  102,68,15,97,192,                       //punpcklwd     %xmm0,%xmm8
+  102,15,105,208,                         //punpckhwd     %xmm0,%xmm2
+  102,15,110,66,100,                      //movd          0x64(%rdx),%xmm0
+  102,15,112,216,0,                       //pshufd        $0x0,%xmm0,%xmm3
+  102,15,111,203,                         //movdqa        %xmm3,%xmm1
+  102,65,15,101,200,                      //pcmpgtw       %xmm8,%xmm1
+  102,65,15,223,200,                      //pandn         %xmm8,%xmm1
+  102,15,101,218,                         //pcmpgtw       %xmm2,%xmm3
+  102,15,223,218,                         //pandn         %xmm2,%xmm3
+  102,69,15,239,192,                      //pxor          %xmm8,%xmm8
+  102,15,111,193,                         //movdqa        %xmm1,%xmm0
+  102,65,15,97,192,                       //punpcklwd     %xmm8,%xmm0
+  102,15,114,240,13,                      //pslld         $0xd,%xmm0
+  102,15,110,82,92,                       //movd          0x5c(%rdx),%xmm2
+  102,68,15,112,202,0,                    //pshufd        $0x0,%xmm2,%xmm9
+  65,15,89,193,                           //mulps         %xmm9,%xmm0
+  102,65,15,105,200,                      //punpckhwd     %xmm8,%xmm1
+  102,15,114,241,13,                      //pslld         $0xd,%xmm1
+  65,15,89,201,                           //mulps         %xmm9,%xmm1
+  102,15,111,211,                         //movdqa        %xmm3,%xmm2
+  102,65,15,97,208,                       //punpcklwd     %xmm8,%xmm2
+  102,15,114,242,13,                      //pslld         $0xd,%xmm2
+  65,15,89,209,                           //mulps         %xmm9,%xmm2
+  102,65,15,105,216,                      //punpckhwd     %xmm8,%xmm3
+  102,15,114,243,13,                      //pslld         $0xd,%xmm3
+  65,15,89,217,                           //mulps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f16_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  102,68,15,110,66,96,                    //movd          0x60(%rdx),%xmm8
+  102,69,15,112,192,0,                    //pshufd        $0x0,%xmm8,%xmm8
+  102,69,15,111,200,                      //movdqa        %xmm8,%xmm9
+  68,15,89,200,                           //mulps         %xmm0,%xmm9
+  102,65,15,114,209,13,                   //psrld         $0xd,%xmm9
+  102,69,15,111,208,                      //movdqa        %xmm8,%xmm10
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  102,65,15,114,210,13,                   //psrld         $0xd,%xmm10
+  102,69,15,111,216,                      //movdqa        %xmm8,%xmm11
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  102,65,15,114,211,13,                   //psrld         $0xd,%xmm11
+  68,15,89,195,                           //mulps         %xmm3,%xmm8
+  102,65,15,114,208,13,                   //psrld         $0xd,%xmm8
+  102,65,15,115,250,2,                    //pslldq        $0x2,%xmm10
+  102,69,15,235,209,                      //por           %xmm9,%xmm10
+  102,65,15,115,248,2,                    //pslldq        $0x2,%xmm8
+  102,69,15,235,195,                      //por           %xmm11,%xmm8
+  102,69,15,111,202,                      //movdqa        %xmm10,%xmm9
+  102,69,15,98,200,                       //punpckldq     %xmm8,%xmm9
+  243,68,15,127,12,248,                   //movdqu        %xmm9,(%rax,%rdi,8)
+  102,69,15,106,208,                      //punpckhdq     %xmm8,%xmm10
+  243,68,15,127,84,248,16,                //movdqu        %xmm10,0x10(%rax,%rdi,8)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_store_f32_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  72,139,0,                               //mov           (%rax),%rax
+  72,137,249,                             //mov           %rdi,%rcx
+  72,193,225,4,                           //shl           $0x4,%rcx
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  68,15,20,201,                           //unpcklps      %xmm1,%xmm9
+  68,15,40,210,                           //movaps        %xmm2,%xmm10
+  68,15,40,218,                           //movaps        %xmm2,%xmm11
+  68,15,20,219,                           //unpcklps      %xmm3,%xmm11
+  68,15,21,193,                           //unpckhps      %xmm1,%xmm8
+  68,15,21,211,                           //unpckhps      %xmm3,%xmm10
+  69,15,40,225,                           //movaps        %xmm9,%xmm12
+  102,69,15,20,227,                       //unpcklpd      %xmm11,%xmm12
+  102,69,15,21,203,                       //unpckhpd      %xmm11,%xmm9
+  69,15,40,216,                           //movaps        %xmm8,%xmm11
+  102,69,15,20,218,                       //unpcklpd      %xmm10,%xmm11
+  102,69,15,21,194,                       //unpckhpd      %xmm10,%xmm8
+  102,68,15,17,36,8,                      //movupd        %xmm12,(%rax,%rcx,1)
+  102,68,15,17,76,8,16,                   //movupd        %xmm9,0x10(%rax,%rcx,1)
+  102,68,15,17,92,8,32,                   //movupd        %xmm11,0x20(%rax,%rcx,1)
+  102,68,15,17,68,8,48,                   //movupd        %xmm8,0x30(%rax,%rcx,1)
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_x_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,192,                           //maxps         %xmm0,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,192,                         //pcmpeqd       %xmm0,%xmm0
+  102,65,15,254,193,                      //paddd         %xmm9,%xmm0
+  68,15,93,192,                           //minps         %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_clamp_y_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  69,15,87,192,                           //xorps         %xmm8,%xmm8
+  68,15,95,193,                           //maxps         %xmm1,%xmm8
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  102,15,118,201,                         //pcmpeqd       %xmm1,%xmm1
+  102,65,15,254,201,                      //paddd         %xmm9,%xmm1
+  68,15,93,193,                           //minps         %xmm1,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,200,                           //movaps        %xmm8,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_x_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,200,                           //movaps        %xmm0,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
+  69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
+  69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
+  243,68,15,16,26,                        //movss         (%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,84,217,                           //andps         %xmm9,%xmm11
+  69,15,92,211,                           //subps         %xmm11,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,194,                           //subps         %xmm10,%xmm0
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,193,                           //minps         %xmm9,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_repeat_y_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,0,                         //movss         (%rax),%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  69,15,94,200,                           //divps         %xmm8,%xmm9
+  243,69,15,91,209,                       //cvttps2dq     %xmm9,%xmm10
+  69,15,91,210,                           //cvtdq2ps      %xmm10,%xmm10
+  69,15,194,202,1,                        //cmpltps       %xmm10,%xmm9
+  243,68,15,16,26,                        //movss         (%rdx),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,84,217,                           //andps         %xmm9,%xmm11
+  69,15,92,211,                           //subps         %xmm11,%xmm10
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  65,15,92,202,                           //subps         %xmm10,%xmm1
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,201,                           //minps         %xmm9,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_x_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,40,193,                           //movaps        %xmm9,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,92,192,                           //subps         %xmm8,%xmm0
+  243,69,15,88,201,                       //addss         %xmm9,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,40,208,                           //movaps        %xmm0,%xmm10
+  69,15,94,209,                           //divps         %xmm9,%xmm10
+  243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
+  69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
+  69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
+  243,68,15,16,34,                        //movss         (%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,84,226,                           //andps         %xmm10,%xmm12
+  69,15,87,210,                           //xorps         %xmm10,%xmm10
+  69,15,92,220,                           //subps         %xmm12,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  65,15,92,195,                           //subps         %xmm11,%xmm0
+  65,15,92,192,                           //subps         %xmm8,%xmm0
+  68,15,92,208,                           //subps         %xmm0,%xmm10
+  65,15,84,194,                           //andps         %xmm10,%xmm0
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,193,                           //minps         %xmm9,%xmm0
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_mirror_y_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,68,15,16,8,                         //movss         (%rax),%xmm9
+  69,15,40,193,                           //movaps        %xmm9,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,92,200,                           //subps         %xmm8,%xmm1
+  243,69,15,88,201,                       //addss         %xmm9,%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  68,15,40,209,                           //movaps        %xmm1,%xmm10
+  69,15,94,209,                           //divps         %xmm9,%xmm10
+  243,69,15,91,218,                       //cvttps2dq     %xmm10,%xmm11
+  69,15,91,219,                           //cvtdq2ps      %xmm11,%xmm11
+  69,15,194,211,1,                        //cmpltps       %xmm11,%xmm10
+  243,68,15,16,34,                        //movss         (%rdx),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  69,15,84,226,                           //andps         %xmm10,%xmm12
+  69,15,87,210,                           //xorps         %xmm10,%xmm10
+  69,15,92,220,                           //subps         %xmm12,%xmm11
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  65,15,92,203,                           //subps         %xmm11,%xmm1
+  65,15,92,200,                           //subps         %xmm8,%xmm1
+  68,15,92,209,                           //subps         %xmm1,%xmm10
+  65,15,84,202,                           //andps         %xmm10,%xmm1
+  102,69,15,118,201,                      //pcmpeqd       %xmm9,%xmm9
+  102,69,15,254,200,                      //paddd         %xmm8,%xmm9
+  65,15,93,201,                           //minps         %xmm9,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_2x3_sse2[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,16,                     //movss         0x10(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_3x4_sse2[] = {
+  68,15,40,201,                           //movaps        %xmm1,%xmm9
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,15,16,72,4,                         //movss         0x4(%rax),%xmm1
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  243,68,15,16,80,12,                     //movss         0xc(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,24,                     //movss         0x18(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,36,                     //movss         0x24(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,194,                           //addps         %xmm10,%xmm0
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,40,                     //movss         0x28(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,218,                           //mulps         %xmm2,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,209,                           //mulps         %xmm9,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,89,200,                           //mulps         %xmm8,%xmm1
+  65,15,88,202,                           //addps         %xmm10,%xmm1
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  243,68,15,16,104,44,                    //movss         0x2c(%rax),%xmm13
+  69,15,198,237,0,                        //shufps        $0x0,%xmm13,%xmm13
+  68,15,89,226,                           //mulps         %xmm2,%xmm12
+  69,15,88,229,                           //addps         %xmm13,%xmm12
+  69,15,89,217,                           //mulps         %xmm9,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,210,                           //movaps        %xmm10,%xmm2
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_matrix_perspective_sse2[] = {
+  68,15,40,192,                           //movaps        %xmm0,%xmm8
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  243,15,16,0,                            //movss         (%rax),%xmm0
+  243,68,15,16,72,4,                      //movss         0x4(%rax),%xmm9
+  15,198,192,0,                           //shufps        $0x0,%xmm0,%xmm0
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,8,                      //movss         0x8(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  65,15,89,192,                           //mulps         %xmm8,%xmm0
+  65,15,88,193,                           //addps         %xmm9,%xmm0
+  243,68,15,16,72,12,                     //movss         0xc(%rax),%xmm9
+  69,15,198,201,0,                        //shufps        $0x0,%xmm9,%xmm9
+  243,68,15,16,80,16,                     //movss         0x10(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,20,                     //movss         0x14(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  68,15,89,209,                           //mulps         %xmm1,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  69,15,89,200,                           //mulps         %xmm8,%xmm9
+  69,15,88,202,                           //addps         %xmm10,%xmm9
+  243,68,15,16,80,24,                     //movss         0x18(%rax),%xmm10
+  69,15,198,210,0,                        //shufps        $0x0,%xmm10,%xmm10
+  243,68,15,16,88,28,                     //movss         0x1c(%rax),%xmm11
+  69,15,198,219,0,                        //shufps        $0x0,%xmm11,%xmm11
+  243,68,15,16,96,32,                     //movss         0x20(%rax),%xmm12
+  69,15,198,228,0,                        //shufps        $0x0,%xmm12,%xmm12
+  68,15,89,217,                           //mulps         %xmm1,%xmm11
+  69,15,88,220,                           //addps         %xmm12,%xmm11
+  69,15,89,208,                           //mulps         %xmm8,%xmm10
+  69,15,88,211,                           //addps         %xmm11,%xmm10
+  65,15,83,202,                           //rcpps         %xmm10,%xmm1
+  15,89,193,                              //mulps         %xmm1,%xmm0
+  68,15,89,201,                           //mulps         %xmm1,%xmm9
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  255,224,                                //jmpq          *%rax
+};
+
+CODE const uint8_t sk_linear_gradient_2stops_sse2[] = {
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  68,15,16,8,                             //movups        (%rax),%xmm9
+  15,16,88,16,                            //movups        0x10(%rax),%xmm3
+  68,15,40,195,                           //movaps        %xmm3,%xmm8
+  69,15,198,192,0,                        //shufps        $0x0,%xmm8,%xmm8
+  65,15,40,201,                           //movaps        %xmm9,%xmm1
+  15,198,201,0,                           //shufps        $0x0,%xmm1,%xmm1
+  68,15,89,192,                           //mulps         %xmm0,%xmm8
+  68,15,88,193,                           //addps         %xmm1,%xmm8
+  15,40,203,                              //movaps        %xmm3,%xmm1
+  15,198,201,85,                          //shufps        $0x55,%xmm1,%xmm1
+  65,15,40,209,                           //movaps        %xmm9,%xmm2
+  15,198,210,85,                          //shufps        $0x55,%xmm2,%xmm2
+  15,89,200,                              //mulps         %xmm0,%xmm1
+  15,88,202,                              //addps         %xmm2,%xmm1
+  15,40,211,                              //movaps        %xmm3,%xmm2
+  15,198,210,170,                         //shufps        $0xaa,%xmm2,%xmm2
+  69,15,40,209,                           //movaps        %xmm9,%xmm10
+  69,15,198,210,170,                      //shufps        $0xaa,%xmm10,%xmm10
+  15,89,208,                              //mulps         %xmm0,%xmm2
+  65,15,88,210,                           //addps         %xmm10,%xmm2
+  15,198,219,255,                         //shufps        $0xff,%xmm3,%xmm3
+  69,15,198,201,255,                      //shufps        $0xff,%xmm9,%xmm9
+  15,89,216,                              //mulps         %xmm0,%xmm3
+  65,15,88,217,                           //addps         %xmm9,%xmm3
+  72,173,                                 //lods          %ds:(%rsi),%rax
+  65,15,40,192,                           //movaps        %xmm8,%xmm0
+  255,224,                                //jmpq          *%rax
+};
+#endif
diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S
deleted file mode 100644 (file)
index 6afcfca..0000000
+++ /dev/null
@@ -1,4892 +0,0 @@
-; Copyright 2017 Google Inc.
-;
-; Use of this source code is governed by a BSD-style license that can be
-; found in the LICENSE file.
-
-; This file is generated semi-automatically with this command:
-;   $ src/jumper/build_stages.py
-
-_text SEGMENT
-
-PUBLIC _sk_start_pipeline_hsw
-_sk_start_pipeline_hsw LABEL PROC
-  DB  65,87                               ; push          %r15
-  DB  65,86                               ; push          %r14
-  DB  65,85                               ; push          %r13
-  DB  65,84                               ; push          %r12
-  DB  86                                  ; push          %rsi
-  DB  87                                  ; push          %rdi
-  DB  83                                  ; push          %rbx
-  DB  72,129,236,160,0,0,0                ; sub           $0xa0,%rsp
-  DB  197,120,41,188,36,144,0,0,0         ; vmovaps       %xmm15,0x90(%rsp)
-  DB  197,120,41,180,36,128,0,0,0         ; vmovaps       %xmm14,0x80(%rsp)
-  DB  197,120,41,108,36,112               ; vmovaps       %xmm13,0x70(%rsp)
-  DB  197,120,41,100,36,96                ; vmovaps       %xmm12,0x60(%rsp)
-  DB  197,120,41,92,36,80                 ; vmovaps       %xmm11,0x50(%rsp)
-  DB  197,120,41,84,36,64                 ; vmovaps       %xmm10,0x40(%rsp)
-  DB  197,120,41,76,36,48                 ; vmovaps       %xmm9,0x30(%rsp)
-  DB  197,120,41,68,36,32                 ; vmovaps       %xmm8,0x20(%rsp)
-  DB  197,248,41,124,36,16                ; vmovaps       %xmm7,0x10(%rsp)
-  DB  197,248,41,52,36                    ; vmovaps       %xmm6,(%rsp)
-  DB  77,137,205                          ; mov           %r9,%r13
-  DB  77,137,198                          ; mov           %r8,%r14
-  DB  72,137,203                          ; mov           %rcx,%rbx
-  DB  72,137,214                          ; mov           %rdx,%rsi
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  73,137,199                          ; mov           %rax,%r15
-  DB  73,137,244                          ; mov           %rsi,%r12
-  DB  72,141,67,8                         ; lea           0x8(%rbx),%rax
-  DB  76,57,232                           ; cmp           %r13,%rax
-  DB  118,5                               ; jbe           75 <_sk_start_pipeline_hsw+0x75>
-  DB  72,137,223                          ; mov           %rbx,%rdi
-  DB  235,65                              ; jmp           b6 <_sk_start_pipeline_hsw+0xb6>
-  DB  185,0,0,0,0                         ; mov           $0x0,%ecx
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  197,220,87,228                      ; vxorps        %ymm4,%ymm4,%ymm4
-  DB  197,212,87,237                      ; vxorps        %ymm5,%ymm5,%ymm5
-  DB  197,204,87,246                      ; vxorps        %ymm6,%ymm6,%ymm6
-  DB  197,196,87,255                      ; vxorps        %ymm7,%ymm7,%ymm7
-  DB  72,137,223                          ; mov           %rbx,%rdi
-  DB  76,137,230                          ; mov           %r12,%rsi
-  DB  76,137,242                          ; mov           %r14,%rdx
-  DB  65,255,215                          ; callq         *%r15
-  DB  72,141,123,8                        ; lea           0x8(%rbx),%rdi
-  DB  72,131,195,16                       ; add           $0x10,%rbx
-  DB  76,57,235                           ; cmp           %r13,%rbx
-  DB  72,137,251                          ; mov           %rdi,%rbx
-  DB  118,191                             ; jbe           75 <_sk_start_pipeline_hsw+0x75>
-  DB  76,137,233                          ; mov           %r13,%rcx
-  DB  72,41,249                           ; sub           %rdi,%rcx
-  DB  116,41                              ; je            e7 <_sk_start_pipeline_hsw+0xe7>
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  197,220,87,228                      ; vxorps        %ymm4,%ymm4,%ymm4
-  DB  197,212,87,237                      ; vxorps        %ymm5,%ymm5,%ymm5
-  DB  197,204,87,246                      ; vxorps        %ymm6,%ymm6,%ymm6
-  DB  197,196,87,255                      ; vxorps        %ymm7,%ymm7,%ymm7
-  DB  76,137,230                          ; mov           %r12,%rsi
-  DB  76,137,242                          ; mov           %r14,%rdx
-  DB  65,255,215                          ; callq         *%r15
-  DB  76,137,232                          ; mov           %r13,%rax
-  DB  197,248,40,52,36                    ; vmovaps       (%rsp),%xmm6
-  DB  197,248,40,124,36,16                ; vmovaps       0x10(%rsp),%xmm7
-  DB  197,120,40,68,36,32                 ; vmovaps       0x20(%rsp),%xmm8
-  DB  197,120,40,76,36,48                 ; vmovaps       0x30(%rsp),%xmm9
-  DB  197,120,40,84,36,64                 ; vmovaps       0x40(%rsp),%xmm10
-  DB  197,120,40,92,36,80                 ; vmovaps       0x50(%rsp),%xmm11
-  DB  197,120,40,100,36,96                ; vmovaps       0x60(%rsp),%xmm12
-  DB  197,120,40,108,36,112               ; vmovaps       0x70(%rsp),%xmm13
-  DB  197,120,40,180,36,128,0,0,0         ; vmovaps       0x80(%rsp),%xmm14
-  DB  197,120,40,188,36,144,0,0,0         ; vmovaps       0x90(%rsp),%xmm15
-  DB  72,129,196,160,0,0,0                ; add           $0xa0,%rsp
-  DB  91                                  ; pop           %rbx
-  DB  95                                  ; pop           %rdi
-  DB  94                                  ; pop           %rsi
-  DB  65,92                               ; pop           %r12
-  DB  65,93                               ; pop           %r13
-  DB  65,94                               ; pop           %r14
-  DB  65,95                               ; pop           %r15
-  DB  197,248,119                         ; vzeroupper
-  DB  195                                 ; retq
-
-PUBLIC _sk_just_return_hsw
-_sk_just_return_hsw LABEL PROC
-  DB  195                                 ; retq
-
-PUBLIC _sk_seed_shader_hsw
-_sk_seed_shader_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,249,110,199                     ; vmovd         %edi,%xmm0
-  DB  196,226,125,24,192                  ; vbroadcastss  %xmm0,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,226,125,24,74,4                 ; vbroadcastss  0x4(%rdx),%ymm1
-  DB  197,252,88,193                      ; vaddps        %ymm1,%ymm0,%ymm0
-  DB  197,252,88,66,20                    ; vaddps        0x14(%rdx),%ymm0,%ymm0
-  DB  196,226,125,24,16                   ; vbroadcastss  (%rax),%ymm2
-  DB  197,252,91,210                      ; vcvtdq2ps     %ymm2,%ymm2
-  DB  197,236,88,201                      ; vaddps        %ymm1,%ymm2,%ymm1
-  DB  196,226,125,24,18                   ; vbroadcastss  (%rdx),%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  197,220,87,228                      ; vxorps        %ymm4,%ymm4,%ymm4
-  DB  197,212,87,237                      ; vxorps        %ymm5,%ymm5,%ymm5
-  DB  197,204,87,246                      ; vxorps        %ymm6,%ymm6,%ymm6
-  DB  197,196,87,255                      ; vxorps        %ymm7,%ymm7,%ymm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_constant_color_hsw
-_sk_constant_color_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,226,125,24,0                    ; vbroadcastss  (%rax),%ymm0
-  DB  196,226,125,24,72,4                 ; vbroadcastss  0x4(%rax),%ymm1
-  DB  196,226,125,24,80,8                 ; vbroadcastss  0x8(%rax),%ymm2
-  DB  196,226,125,24,88,12                ; vbroadcastss  0xc(%rax),%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clear_hsw
-_sk_clear_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_plus__hsw
-_sk_plus__hsw LABEL PROC
-  DB  197,252,88,196                      ; vaddps        %ymm4,%ymm0,%ymm0
-  DB  197,244,88,205                      ; vaddps        %ymm5,%ymm1,%ymm1
-  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
-  DB  197,228,88,223                      ; vaddps        %ymm7,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_srcover_hsw
-_sk_srcover_hsw LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  197,60,92,195                       ; vsubps        %ymm3,%ymm8,%ymm8
-  DB  196,194,93,184,192                  ; vfmadd231ps   %ymm8,%ymm4,%ymm0
-  DB  196,194,85,184,200                  ; vfmadd231ps   %ymm8,%ymm5,%ymm1
-  DB  196,194,77,184,208                  ; vfmadd231ps   %ymm8,%ymm6,%ymm2
-  DB  196,194,69,184,216                  ; vfmadd231ps   %ymm8,%ymm7,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_dstover_hsw
-_sk_dstover_hsw LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  197,60,92,199                       ; vsubps        %ymm7,%ymm8,%ymm8
-  DB  196,226,61,168,196                  ; vfmadd213ps   %ymm4,%ymm8,%ymm0
-  DB  196,226,61,168,205                  ; vfmadd213ps   %ymm5,%ymm8,%ymm1
-  DB  196,226,61,168,214                  ; vfmadd213ps   %ymm6,%ymm8,%ymm2
-  DB  196,226,61,168,223                  ; vfmadd213ps   %ymm7,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_0_hsw
-_sk_clamp_0_hsw LABEL PROC
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,193,124,95,192                  ; vmaxps        %ymm8,%ymm0,%ymm0
-  DB  196,193,116,95,200                  ; vmaxps        %ymm8,%ymm1,%ymm1
-  DB  196,193,108,95,208                  ; vmaxps        %ymm8,%ymm2,%ymm2
-  DB  196,193,100,95,216                  ; vmaxps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_1_hsw
-_sk_clamp_1_hsw LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  196,193,124,93,192                  ; vminps        %ymm8,%ymm0,%ymm0
-  DB  196,193,116,93,200                  ; vminps        %ymm8,%ymm1,%ymm1
-  DB  196,193,108,93,208                  ; vminps        %ymm8,%ymm2,%ymm2
-  DB  196,193,100,93,216                  ; vminps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_a_hsw
-_sk_clamp_a_hsw LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  196,193,100,93,216                  ; vminps        %ymm8,%ymm3,%ymm3
-  DB  197,252,93,195                      ; vminps        %ymm3,%ymm0,%ymm0
-  DB  197,244,93,203                      ; vminps        %ymm3,%ymm1,%ymm1
-  DB  197,236,93,211                      ; vminps        %ymm3,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_set_rgb_hsw
-_sk_set_rgb_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,226,125,24,0                    ; vbroadcastss  (%rax),%ymm0
-  DB  196,226,125,24,72,4                 ; vbroadcastss  0x4(%rax),%ymm1
-  DB  196,226,125,24,80,8                 ; vbroadcastss  0x8(%rax),%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_rb_hsw
-_sk_swap_rb_hsw LABEL PROC
-  DB  197,124,40,192                      ; vmovaps       %ymm0,%ymm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,194                      ; vmovaps       %ymm2,%ymm0
-  DB  197,124,41,194                      ; vmovaps       %ymm8,%ymm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_hsw
-_sk_swap_hsw LABEL PROC
-  DB  197,124,40,195                      ; vmovaps       %ymm3,%ymm8
-  DB  197,124,40,202                      ; vmovaps       %ymm2,%ymm9
-  DB  197,124,40,209                      ; vmovaps       %ymm1,%ymm10
-  DB  197,124,40,216                      ; vmovaps       %ymm0,%ymm11
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,196                      ; vmovaps       %ymm4,%ymm0
-  DB  197,252,40,205                      ; vmovaps       %ymm5,%ymm1
-  DB  197,252,40,214                      ; vmovaps       %ymm6,%ymm2
-  DB  197,252,40,223                      ; vmovaps       %ymm7,%ymm3
-  DB  197,124,41,220                      ; vmovaps       %ymm11,%ymm4
-  DB  197,124,41,213                      ; vmovaps       %ymm10,%ymm5
-  DB  197,124,41,206                      ; vmovaps       %ymm9,%ymm6
-  DB  197,124,41,199                      ; vmovaps       %ymm8,%ymm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_src_dst_hsw
-_sk_move_src_dst_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,224                      ; vmovaps       %ymm0,%ymm4
-  DB  197,252,40,233                      ; vmovaps       %ymm1,%ymm5
-  DB  197,252,40,242                      ; vmovaps       %ymm2,%ymm6
-  DB  197,252,40,251                      ; vmovaps       %ymm3,%ymm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_dst_src_hsw
-_sk_move_dst_src_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,196                      ; vmovaps       %ymm4,%ymm0
-  DB  197,252,40,205                      ; vmovaps       %ymm5,%ymm1
-  DB  197,252,40,214                      ; vmovaps       %ymm6,%ymm2
-  DB  197,252,40,223                      ; vmovaps       %ymm7,%ymm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_premul_hsw
-_sk_premul_hsw LABEL PROC
-  DB  197,252,89,195                      ; vmulps        %ymm3,%ymm0,%ymm0
-  DB  197,244,89,203                      ; vmulps        %ymm3,%ymm1,%ymm1
-  DB  197,236,89,211                      ; vmulps        %ymm3,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_unpremul_hsw
-_sk_unpremul_hsw LABEL PROC
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,65,100,194,200,0                ; vcmpeqps      %ymm8,%ymm3,%ymm9
-  DB  196,98,125,24,18                    ; vbroadcastss  (%rdx),%ymm10
-  DB  197,44,94,211                       ; vdivps        %ymm3,%ymm10,%ymm10
-  DB  196,67,45,74,192,144                ; vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_from_srgb_hsw
-_sk_from_srgb_hsw LABEL PROC
-  DB  196,98,125,24,66,64                 ; vbroadcastss  0x40(%rdx),%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  197,124,89,208                      ; vmulps        %ymm0,%ymm0,%ymm10
-  DB  196,98,125,24,90,60                 ; vbroadcastss  0x3c(%rdx),%ymm11
-  DB  196,98,125,24,98,56                 ; vbroadcastss  0x38(%rdx),%ymm12
-  DB  196,65,124,40,235                   ; vmovaps       %ymm11,%ymm13
-  DB  196,66,125,168,236                  ; vfmadd213ps   %ymm12,%ymm0,%ymm13
-  DB  196,98,125,24,114,52                ; vbroadcastss  0x34(%rdx),%ymm14
-  DB  196,66,45,168,238                   ; vfmadd213ps   %ymm14,%ymm10,%ymm13
-  DB  196,98,125,24,82,68                 ; vbroadcastss  0x44(%rdx),%ymm10
-  DB  196,193,124,194,194,1               ; vcmpltps      %ymm10,%ymm0,%ymm0
-  DB  196,195,21,74,193,0                 ; vblendvps     %ymm0,%ymm9,%ymm13,%ymm0
-  DB  197,60,89,201                       ; vmulps        %ymm1,%ymm8,%ymm9
-  DB  197,116,89,233                      ; vmulps        %ymm1,%ymm1,%ymm13
-  DB  196,65,124,40,251                   ; vmovaps       %ymm11,%ymm15
-  DB  196,66,117,168,252                  ; vfmadd213ps   %ymm12,%ymm1,%ymm15
-  DB  196,66,21,168,254                   ; vfmadd213ps   %ymm14,%ymm13,%ymm15
-  DB  196,193,116,194,202,1               ; vcmpltps      %ymm10,%ymm1,%ymm1
-  DB  196,195,5,74,201,16                 ; vblendvps     %ymm1,%ymm9,%ymm15,%ymm1
-  DB  197,60,89,194                       ; vmulps        %ymm2,%ymm8,%ymm8
-  DB  197,108,89,202                      ; vmulps        %ymm2,%ymm2,%ymm9
-  DB  196,66,109,168,220                  ; vfmadd213ps   %ymm12,%ymm2,%ymm11
-  DB  196,66,53,168,222                   ; vfmadd213ps   %ymm14,%ymm9,%ymm11
-  DB  196,193,108,194,210,1               ; vcmpltps      %ymm10,%ymm2,%ymm2
-  DB  196,195,37,74,208,32                ; vblendvps     %ymm2,%ymm8,%ymm11,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_to_srgb_hsw
-_sk_to_srgb_hsw LABEL PROC
-  DB  197,124,82,192                      ; vrsqrtps      %ymm0,%ymm8
-  DB  196,65,124,83,200                   ; vrcpps        %ymm8,%ymm9
-  DB  196,65,124,82,208                   ; vrsqrtps      %ymm8,%ymm10
-  DB  196,98,125,24,66,72                 ; vbroadcastss  0x48(%rdx),%ymm8
-  DB  197,60,89,216                       ; vmulps        %ymm0,%ymm8,%ymm11
-  DB  196,98,125,24,34                    ; vbroadcastss  (%rdx),%ymm12
-  DB  196,98,125,24,106,76                ; vbroadcastss  0x4c(%rdx),%ymm13
-  DB  196,98,125,24,114,80                ; vbroadcastss  0x50(%rdx),%ymm14
-  DB  196,98,125,24,122,84                ; vbroadcastss  0x54(%rdx),%ymm15
-  DB  196,66,13,168,207                   ; vfmadd213ps   %ymm15,%ymm14,%ymm9
-  DB  196,66,21,184,202                   ; vfmadd231ps   %ymm10,%ymm13,%ymm9
-  DB  196,65,28,93,201                    ; vminps        %ymm9,%ymm12,%ymm9
-  DB  196,98,125,24,82,88                 ; vbroadcastss  0x58(%rdx),%ymm10
-  DB  196,193,124,194,194,1               ; vcmpltps      %ymm10,%ymm0,%ymm0
-  DB  196,195,53,74,195,0                 ; vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
-  DB  197,124,82,201                      ; vrsqrtps      %ymm1,%ymm9
-  DB  196,65,124,83,217                   ; vrcpps        %ymm9,%ymm11
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,66,13,168,223                   ; vfmadd213ps   %ymm15,%ymm14,%ymm11
-  DB  196,66,21,184,217                   ; vfmadd231ps   %ymm9,%ymm13,%ymm11
-  DB  197,60,89,201                       ; vmulps        %ymm1,%ymm8,%ymm9
-  DB  196,65,28,93,219                    ; vminps        %ymm11,%ymm12,%ymm11
-  DB  196,193,116,194,202,1               ; vcmpltps      %ymm10,%ymm1,%ymm1
-  DB  196,195,37,74,201,16                ; vblendvps     %ymm1,%ymm9,%ymm11,%ymm1
-  DB  197,124,82,202                      ; vrsqrtps      %ymm2,%ymm9
-  DB  196,65,124,83,217                   ; vrcpps        %ymm9,%ymm11
-  DB  196,66,13,168,223                   ; vfmadd213ps   %ymm15,%ymm14,%ymm11
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,66,21,184,217                   ; vfmadd231ps   %ymm9,%ymm13,%ymm11
-  DB  196,65,28,93,203                    ; vminps        %ymm11,%ymm12,%ymm9
-  DB  197,60,89,194                       ; vmulps        %ymm2,%ymm8,%ymm8
-  DB  196,193,108,194,210,1               ; vcmpltps      %ymm10,%ymm2,%ymm2
-  DB  196,195,53,74,208,32                ; vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_1_float_hsw
-_sk_scale_1_float_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  197,188,89,219                      ; vmulps        %ymm3,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_u8_hsw
-_sk_scale_u8_hsw LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,1,248                            ; add           %rdi,%rax
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,48                              ; jne           4b1 <_sk_scale_u8_hsw+0x40>
-  DB  197,123,16,0                        ; vmovsd        (%rax),%xmm8
-  DB  196,66,125,49,192                   ; vpmovzxbd     %xmm8,%ymm8
-  DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
-  DB  196,98,125,24,74,12                 ; vbroadcastss  0xc(%rdx),%ymm9
-  DB  196,65,60,89,193                    ; vmulps        %ymm9,%ymm8,%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  197,188,89,219                      ; vmulps        %ymm3,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  49,201                              ; xor           %ecx,%ecx
-  DB  77,137,194                          ; mov           %r8,%r10
-  DB  69,49,201                           ; xor           %r9d,%r9d
-  DB  68,15,182,24                        ; movzbl        (%rax),%r11d
-  DB  72,255,192                          ; inc           %rax
-  DB  73,211,227                          ; shl           %cl,%r11
-  DB  77,9,217                            ; or            %r11,%r9
-  DB  72,131,193,8                        ; add           $0x8,%rcx
-  DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           4b9 <_sk_scale_u8_hsw+0x48>
-  DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  235,175                             ; jmp           485 <_sk_scale_u8_hsw+0x14>
-
-PUBLIC _sk_lerp_1_float_hsw
-_sk_lerp_1_float_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  197,252,92,196                      ; vsubps        %ymm4,%ymm0,%ymm0
-  DB  196,226,61,168,196                  ; vfmadd213ps   %ymm4,%ymm8,%ymm0
-  DB  197,244,92,205                      ; vsubps        %ymm5,%ymm1,%ymm1
-  DB  196,226,61,168,205                  ; vfmadd213ps   %ymm5,%ymm8,%ymm1
-  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
-  DB  196,226,61,168,214                  ; vfmadd213ps   %ymm6,%ymm8,%ymm2
-  DB  197,228,92,223                      ; vsubps        %ymm7,%ymm3,%ymm3
-  DB  196,226,61,168,223                  ; vfmadd213ps   %ymm7,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_u8_hsw
-_sk_lerp_u8_hsw LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,1,248                            ; add           %rdi,%rax
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,68                              ; jne           559 <_sk_lerp_u8_hsw+0x54>
-  DB  197,123,16,0                        ; vmovsd        (%rax),%xmm8
-  DB  196,66,125,49,192                   ; vpmovzxbd     %xmm8,%ymm8
-  DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
-  DB  196,98,125,24,74,12                 ; vbroadcastss  0xc(%rdx),%ymm9
-  DB  196,65,60,89,193                    ; vmulps        %ymm9,%ymm8,%ymm8
-  DB  197,252,92,196                      ; vsubps        %ymm4,%ymm0,%ymm0
-  DB  196,226,61,168,196                  ; vfmadd213ps   %ymm4,%ymm8,%ymm0
-  DB  197,244,92,205                      ; vsubps        %ymm5,%ymm1,%ymm1
-  DB  196,226,61,168,205                  ; vfmadd213ps   %ymm5,%ymm8,%ymm1
-  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
-  DB  196,226,61,168,214                  ; vfmadd213ps   %ymm6,%ymm8,%ymm2
-  DB  197,228,92,223                      ; vsubps        %ymm7,%ymm3,%ymm3
-  DB  196,226,61,168,223                  ; vfmadd213ps   %ymm7,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  49,201                              ; xor           %ecx,%ecx
-  DB  77,137,194                          ; mov           %r8,%r10
-  DB  69,49,201                           ; xor           %r9d,%r9d
-  DB  68,15,182,24                        ; movzbl        (%rax),%r11d
-  DB  72,255,192                          ; inc           %rax
-  DB  73,211,227                          ; shl           %cl,%r11
-  DB  77,9,217                            ; or            %r11,%r9
-  DB  72,131,193,8                        ; add           $0x8,%rcx
-  DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           561 <_sk_lerp_u8_hsw+0x5c>
-  DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  235,155                             ; jmp           519 <_sk_lerp_u8_hsw+0x14>
-
-PUBLIC _sk_lerp_565_hsw
-_sk_lerp_565_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,16                           ; mov           (%rax),%r10
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,123                             ; jne           603 <_sk_lerp_565_hsw+0x85>
-  DB  196,193,122,111,28,122              ; vmovdqu       (%r10,%rdi,2),%xmm3
-  DB  196,226,125,51,219                  ; vpmovzxwd     %xmm3,%ymm3
-  DB  196,98,125,88,66,104                ; vpbroadcastd  0x68(%rdx),%ymm8
-  DB  197,61,219,195                      ; vpand         %ymm3,%ymm8,%ymm8
-  DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
-  DB  196,98,125,24,74,116                ; vbroadcastss  0x74(%rdx),%ymm9
-  DB  196,65,52,89,192                    ; vmulps        %ymm8,%ymm9,%ymm8
-  DB  196,98,125,88,74,108                ; vpbroadcastd  0x6c(%rdx),%ymm9
-  DB  197,53,219,203                      ; vpand         %ymm3,%ymm9,%ymm9
-  DB  196,65,124,91,201                   ; vcvtdq2ps     %ymm9,%ymm9
-  DB  196,98,125,24,82,120                ; vbroadcastss  0x78(%rdx),%ymm10
-  DB  196,65,44,89,201                    ; vmulps        %ymm9,%ymm10,%ymm9
-  DB  196,98,125,88,82,112                ; vpbroadcastd  0x70(%rdx),%ymm10
-  DB  197,173,219,219                     ; vpand         %ymm3,%ymm10,%ymm3
-  DB  197,252,91,219                      ; vcvtdq2ps     %ymm3,%ymm3
-  DB  196,98,125,24,82,124                ; vbroadcastss  0x7c(%rdx),%ymm10
-  DB  197,172,89,219                      ; vmulps        %ymm3,%ymm10,%ymm3
-  DB  197,252,92,196                      ; vsubps        %ymm4,%ymm0,%ymm0
-  DB  196,226,61,168,196                  ; vfmadd213ps   %ymm4,%ymm8,%ymm0
-  DB  197,244,92,205                      ; vsubps        %ymm5,%ymm1,%ymm1
-  DB  196,226,53,168,205                  ; vfmadd213ps   %ymm5,%ymm9,%ymm1
-  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
-  DB  196,226,101,168,214                 ; vfmadd213ps   %ymm6,%ymm3,%ymm2
-  DB  196,226,125,24,26                   ; vbroadcastss  (%rdx),%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  65,137,200                          ; mov           %ecx,%r8d
-  DB  65,128,224,7                        ; and           $0x7,%r8b
-  DB  197,225,239,219                     ; vpxor         %xmm3,%xmm3,%xmm3
-  DB  65,254,200                          ; dec           %r8b
-  DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,111,255,255,255              ; ja            58e <_sk_lerp_565_hsw+0x10>
-  DB  76,141,13,74,0,0,0                  ; lea           0x4a(%rip),%r9        # 670 <_sk_lerp_565_hsw+0xf2>
-  DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
-  DB  76,1,200                            ; add           %r9,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,225,239,219                     ; vpxor         %xmm3,%xmm3,%xmm3
-  DB  196,193,97,196,92,122,12,6          ; vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm3
-  DB  196,193,97,196,92,122,10,5          ; vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm3,%xmm3
-  DB  196,193,97,196,92,122,8,4           ; vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm3,%xmm3
-  DB  196,193,97,196,92,122,6,3           ; vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm3,%xmm3
-  DB  196,193,97,196,92,122,4,2           ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm3,%xmm3
-  DB  196,193,97,196,92,122,2,1           ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm3,%xmm3
-  DB  196,193,97,196,28,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm3,%xmm3
-  DB  233,31,255,255,255                  ; jmpq          58e <_sk_lerp_565_hsw+0x10>
-  DB  144                                 ; nop
-  DB  243,255                             ; repz          (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  235,255                             ; jmp           675 <_sk_lerp_565_hsw+0xf7>
-  DB  255                                 ; (bad)
-  DB  255,227                             ; jmpq          *%rbx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  219,255                             ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,211                             ; callq         *%rbx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,203                             ; dec           %ebx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  191                                 ; .byte         0xbf
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_tables_hsw
-_sk_load_tables_hsw LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
-  DB  76,3,8                              ; add           (%rax),%r9
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,106                             ; jne           70b <_sk_load_tables_hsw+0x7f>
-  DB  196,193,126,111,25                  ; vmovdqu       (%r9),%ymm3
-  DB  196,226,125,88,82,16                ; vpbroadcastd  0x10(%rdx),%ymm2
-  DB  197,237,219,203                     ; vpand         %ymm3,%ymm2,%ymm1
-  DB  196,65,61,118,192                   ; vpcmpeqd      %ymm8,%ymm8,%ymm8
-  DB  72,139,72,8                         ; mov           0x8(%rax),%rcx
-  DB  76,139,72,16                        ; mov           0x10(%rax),%r9
-  DB  196,65,53,118,201                   ; vpcmpeqd      %ymm9,%ymm9,%ymm9
-  DB  196,226,53,146,4,137                ; vgatherdps    %ymm9,(%rcx,%ymm1,4),%ymm0
-  DB  197,245,114,211,8                   ; vpsrld        $0x8,%ymm3,%ymm1
-  DB  197,109,219,201                     ; vpand         %ymm1,%ymm2,%ymm9
-  DB  196,65,45,118,210                   ; vpcmpeqd      %ymm10,%ymm10,%ymm10
-  DB  196,130,45,146,12,137               ; vgatherdps    %ymm10,(%r9,%ymm9,4),%ymm1
-  DB  72,139,64,24                        ; mov           0x18(%rax),%rax
-  DB  197,181,114,211,16                  ; vpsrld        $0x10,%ymm3,%ymm9
-  DB  196,65,109,219,201                  ; vpand         %ymm9,%ymm2,%ymm9
-  DB  196,162,61,146,20,136               ; vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
-  DB  197,229,114,211,24                  ; vpsrld        $0x18,%ymm3,%ymm3
-  DB  197,252,91,219                      ; vcvtdq2ps     %ymm3,%ymm3
-  DB  196,98,125,24,66,12                 ; vbroadcastss  0xc(%rdx),%ymm8
-  DB  196,193,100,89,216                  ; vmulps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  185,8,0,0,0                         ; mov           $0x8,%ecx
-  DB  68,41,193                           ; sub           %r8d,%ecx
-  DB  192,225,3                           ; shl           $0x3,%cl
-  DB  73,199,194,255,255,255,255          ; mov           $0xffffffffffffffff,%r10
-  DB  73,211,234                          ; shr           %cl,%r10
-  DB  196,193,249,110,194                 ; vmovq         %r10,%xmm0
-  DB  196,226,125,33,192                  ; vpmovsxbd     %xmm0,%ymm0
-  DB  196,194,125,140,25                  ; vpmaskmovd    (%r9),%ymm0,%ymm3
-  DB  233,114,255,255,255                 ; jmpq          6a6 <_sk_load_tables_hsw+0x1a>
-
-PUBLIC _sk_load_a8_hsw
-_sk_load_a8_hsw LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,1,248                            ; add           %rdi,%rax
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,42                              ; jne           76e <_sk_load_a8_hsw+0x3a>
-  DB  197,251,16,0                        ; vmovsd        (%rax),%xmm0
-  DB  196,226,125,49,192                  ; vpmovzxbd     %xmm0,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,226,125,24,74,12                ; vbroadcastss  0xc(%rdx),%ymm1
-  DB  197,252,89,217                      ; vmulps        %ymm1,%ymm0,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  49,201                              ; xor           %ecx,%ecx
-  DB  77,137,194                          ; mov           %r8,%r10
-  DB  69,49,201                           ; xor           %r9d,%r9d
-  DB  68,15,182,24                        ; movzbl        (%rax),%r11d
-  DB  72,255,192                          ; inc           %rax
-  DB  73,211,227                          ; shl           %cl,%r11
-  DB  77,9,217                            ; or            %r11,%r9
-  DB  72,131,193,8                        ; add           $0x8,%rcx
-  DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           776 <_sk_load_a8_hsw+0x42>
-  DB  196,193,249,110,193                 ; vmovq         %r9,%xmm0
-  DB  235,181                             ; jmp           748 <_sk_load_a8_hsw+0x14>
-
-PUBLIC _sk_store_a8_hsw
-_sk_store_a8_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,8                            ; mov           (%rax),%r9
-  DB  196,98,125,24,66,8                  ; vbroadcastss  0x8(%rdx),%ymm8
-  DB  197,60,89,195                       ; vmulps        %ymm3,%ymm8,%ymm8
-  DB  196,65,125,91,192                   ; vcvtps2dq     %ymm8,%ymm8
-  DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
-  DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
-  DB  196,65,57,103,192                   ; vpackuswb     %xmm8,%xmm8,%xmm8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           7c6 <_sk_store_a8_hsw+0x33>
-  DB  196,65,123,17,4,57                  ; vmovsd        %xmm8,(%r9,%rdi,1)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  137,200                             ; mov           %ecx,%eax
-  DB  36,7                                ; and           $0x7,%al
-  DB  254,200                             ; dec           %al
-  DB  68,15,182,192                       ; movzbl        %al,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            7c2 <_sk_store_a8_hsw+0x2f>
-  DB  196,66,121,48,192                   ; vpmovzxbw     %xmm8,%xmm8
-  DB  76,141,21,66,0,0,0                  ; lea           0x42(%rip),%r10        # 824 <_sk_store_a8_hsw+0x91>
-  DB  75,99,4,130                         ; movslq        (%r10,%r8,4),%rax
-  DB  76,1,208                            ; add           %r10,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,67,121,20,68,57,6,12            ; vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,5,10            ; vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,4,8             ; vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,3,6             ; vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,2,4             ; vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,1,2             ; vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
-  DB  196,67,121,20,4,57,0                ; vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  DB  235,158                             ; jmp           7c2 <_sk_store_a8_hsw+0x2f>
-  DB  247,255                             ; idiv          %edi
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  239                                 ; out           %eax,(%dx)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,231                             ; jmpq          *%rdi
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  223,255                             ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,215                             ; callq         *%rdi
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,207                             ; dec           %edi
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,199                             ; inc           %edi
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_565_hsw
-_sk_load_565_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,16                           ; mov           (%rax),%r10
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,92                              ; jne           8a6 <_sk_load_565_hsw+0x66>
-  DB  196,193,122,111,4,122               ; vmovdqu       (%r10,%rdi,2),%xmm0
-  DB  196,226,125,51,208                  ; vpmovzxwd     %xmm0,%ymm2
-  DB  196,226,125,88,66,104               ; vpbroadcastd  0x68(%rdx),%ymm0
-  DB  197,253,219,194                     ; vpand         %ymm2,%ymm0,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,226,125,24,74,116               ; vbroadcastss  0x74(%rdx),%ymm1
-  DB  197,244,89,192                      ; vmulps        %ymm0,%ymm1,%ymm0
-  DB  196,226,125,88,74,108               ; vpbroadcastd  0x6c(%rdx),%ymm1
-  DB  197,245,219,202                     ; vpand         %ymm2,%ymm1,%ymm1
-  DB  197,252,91,201                      ; vcvtdq2ps     %ymm1,%ymm1
-  DB  196,226,125,24,90,120               ; vbroadcastss  0x78(%rdx),%ymm3
-  DB  197,228,89,201                      ; vmulps        %ymm1,%ymm3,%ymm1
-  DB  196,226,125,88,90,112               ; vpbroadcastd  0x70(%rdx),%ymm3
-  DB  197,229,219,210                     ; vpand         %ymm2,%ymm3,%ymm2
-  DB  197,252,91,210                      ; vcvtdq2ps     %ymm2,%ymm2
-  DB  196,226,125,24,90,124               ; vbroadcastss  0x7c(%rdx),%ymm3
-  DB  197,228,89,210                      ; vmulps        %ymm2,%ymm3,%ymm2
-  DB  196,226,125,24,26                   ; vbroadcastss  (%rdx),%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  65,137,200                          ; mov           %ecx,%r8d
-  DB  65,128,224,7                        ; and           $0x7,%r8b
-  DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
-  DB  65,254,200                          ; dec           %r8b
-  DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,146                             ; ja            850 <_sk_load_565_hsw+0x10>
-  DB  76,141,13,75,0,0,0                  ; lea           0x4b(%rip),%r9        # 910 <_sk_load_565_hsw+0xd0>
-  DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
-  DB  76,1,200                            ; add           %r9,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
-  DB  196,193,121,196,68,122,12,6         ; vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,10,5         ; vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,8,4          ; vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,6,3          ; vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,4,2          ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,2,1          ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,4,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  DB  233,66,255,255,255                  ; jmpq          850 <_sk_load_565_hsw+0x10>
-  DB  102,144                             ; xchg          %ax,%ax
-  DB  242,255                             ; repnz         (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  234                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,226                             ; jmpq          *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  218,255                             ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,210                             ; callq         *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,202                             ; dec           %edx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  190                                 ; .byte         0xbe
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_store_565_hsw
-_sk_store_565_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,8                            ; mov           (%rax),%r9
-  DB  196,98,125,24,130,128,0,0,0         ; vbroadcastss  0x80(%rdx),%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  196,65,125,91,201                   ; vcvtps2dq     %ymm9,%ymm9
-  DB  196,193,53,114,241,11               ; vpslld        $0xb,%ymm9,%ymm9
-  DB  196,98,125,24,146,132,0,0,0         ; vbroadcastss  0x84(%rdx),%ymm10
-  DB  197,44,89,209                       ; vmulps        %ymm1,%ymm10,%ymm10
-  DB  196,65,125,91,210                   ; vcvtps2dq     %ymm10,%ymm10
-  DB  196,193,45,114,242,5                ; vpslld        $0x5,%ymm10,%ymm10
-  DB  196,65,45,235,201                   ; vpor          %ymm9,%ymm10,%ymm9
-  DB  197,60,89,194                       ; vmulps        %ymm2,%ymm8,%ymm8
-  DB  196,65,125,91,192                   ; vcvtps2dq     %ymm8,%ymm8
-  DB  196,65,53,235,192                   ; vpor          %ymm8,%ymm9,%ymm8
-  DB  196,67,125,57,193,1                 ; vextracti128  $0x1,%ymm8,%xmm9
-  DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           98e <_sk_store_565_hsw+0x62>
-  DB  196,65,122,127,4,121                ; vmovdqu       %xmm8,(%r9,%rdi,2)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  137,200                             ; mov           %ecx,%eax
-  DB  36,7                                ; and           $0x7,%al
-  DB  254,200                             ; dec           %al
-  DB  68,15,182,192                       ; movzbl        %al,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            98a <_sk_store_565_hsw+0x5e>
-  DB  76,141,21,71,0,0,0                  ; lea           0x47(%rip),%r10        # 9ec <_sk_store_565_hsw+0xc0>
-  DB  75,99,4,130                         ; movslq        (%r10,%r8,4),%rax
-  DB  76,1,208                            ; add           %r10,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,67,121,21,68,121,12,6           ; vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,10,5           ; vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,8,4            ; vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,6,3            ; vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,4,2            ; vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,2,1            ; vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
-  DB  197,121,126,192                     ; vmovd         %xmm8,%eax
-  DB  102,65,137,4,121                    ; mov           %ax,(%r9,%rdi,2)
-  DB  235,161                             ; jmp           98a <_sk_store_565_hsw+0x5e>
-  DB  15,31,0                             ; nopl          (%rax)
-  DB  242,255                             ; repnz         (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  234                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,226                             ; jmpq          *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  218,255                             ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,210                             ; callq         *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,202                             ; dec           %edx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,194                             ; inc           %edx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_8888_hsw
-_sk_load_8888_hsw LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
-  DB  76,3,8                              ; add           (%rax),%r9
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,85                              ; jne           a72 <_sk_load_8888_hsw+0x6a>
-  DB  196,193,126,111,25                  ; vmovdqu       (%r9),%ymm3
-  DB  196,226,125,88,82,16                ; vpbroadcastd  0x10(%rdx),%ymm2
-  DB  197,237,219,195                     ; vpand         %ymm3,%ymm2,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,98,125,24,66,12                 ; vbroadcastss  0xc(%rdx),%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,245,114,211,8                   ; vpsrld        $0x8,%ymm3,%ymm1
-  DB  197,237,219,201                     ; vpand         %ymm1,%ymm2,%ymm1
-  DB  197,252,91,201                      ; vcvtdq2ps     %ymm1,%ymm1
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,181,114,211,16                  ; vpsrld        $0x10,%ymm3,%ymm9
-  DB  196,193,109,219,209                 ; vpand         %ymm9,%ymm2,%ymm2
-  DB  197,252,91,210                      ; vcvtdq2ps     %ymm2,%ymm2
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  197,229,114,211,24                  ; vpsrld        $0x18,%ymm3,%ymm3
-  DB  197,252,91,219                      ; vcvtdq2ps     %ymm3,%ymm3
-  DB  196,193,100,89,216                  ; vmulps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  185,8,0,0,0                         ; mov           $0x8,%ecx
-  DB  68,41,193                           ; sub           %r8d,%ecx
-  DB  192,225,3                           ; shl           $0x3,%cl
-  DB  72,199,192,255,255,255,255          ; mov           $0xffffffffffffffff,%rax
-  DB  72,211,232                          ; shr           %cl,%rax
-  DB  196,225,249,110,192                 ; vmovq         %rax,%xmm0
-  DB  196,226,125,33,192                  ; vpmovsxbd     %xmm0,%ymm0
-  DB  196,194,125,140,25                  ; vpmaskmovd    (%r9),%ymm0,%ymm3
-  DB  235,138                             ; jmp           a22 <_sk_load_8888_hsw+0x1a>
-
-PUBLIC _sk_store_8888_hsw
-_sk_store_8888_hsw LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,141,12,189,0,0,0,0               ; lea           0x0(,%rdi,4),%r9
-  DB  76,3,8                              ; add           (%rax),%r9
-  DB  196,98,125,24,66,8                  ; vbroadcastss  0x8(%rdx),%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  196,65,125,91,201                   ; vcvtps2dq     %ymm9,%ymm9
-  DB  197,60,89,209                       ; vmulps        %ymm1,%ymm8,%ymm10
-  DB  196,65,125,91,210                   ; vcvtps2dq     %ymm10,%ymm10
-  DB  196,193,45,114,242,8                ; vpslld        $0x8,%ymm10,%ymm10
-  DB  196,65,45,235,201                   ; vpor          %ymm9,%ymm10,%ymm9
-  DB  197,60,89,210                       ; vmulps        %ymm2,%ymm8,%ymm10
-  DB  196,65,125,91,210                   ; vcvtps2dq     %ymm10,%ymm10
-  DB  196,193,45,114,242,16               ; vpslld        $0x10,%ymm10,%ymm10
-  DB  197,60,89,195                       ; vmulps        %ymm3,%ymm8,%ymm8
-  DB  196,65,125,91,192                   ; vcvtps2dq     %ymm8,%ymm8
-  DB  196,193,61,114,240,24               ; vpslld        $0x18,%ymm8,%ymm8
-  DB  196,65,45,235,192                   ; vpor          %ymm8,%ymm10,%ymm8
-  DB  196,65,53,235,192                   ; vpor          %ymm8,%ymm9,%ymm8
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,12                              ; jne           b04 <_sk_store_8888_hsw+0x6c>
-  DB  196,65,126,127,1                    ; vmovdqu       %ymm8,(%r9)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  185,8,0,0,0                         ; mov           $0x8,%ecx
-  DB  68,41,193                           ; sub           %r8d,%ecx
-  DB  192,225,3                           ; shl           $0x3,%cl
-  DB  72,199,192,255,255,255,255          ; mov           $0xffffffffffffffff,%rax
-  DB  72,211,232                          ; shr           %cl,%rax
-  DB  196,97,249,110,200                  ; vmovq         %rax,%xmm9
-  DB  196,66,125,33,201                   ; vpmovsxbd     %xmm9,%ymm9
-  DB  196,66,53,142,1                     ; vpmaskmovd    %ymm8,%ymm9,(%r9)
-  DB  235,211                             ; jmp           afd <_sk_store_8888_hsw+0x65>
-
-PUBLIC _sk_load_f16_hsw
-_sk_load_f16_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,97                              ; jne           b95 <_sk_load_f16_hsw+0x6b>
-  DB  197,249,16,12,248                   ; vmovupd       (%rax,%rdi,8),%xmm1
-  DB  197,249,16,84,248,16                ; vmovupd       0x10(%rax,%rdi,8),%xmm2
-  DB  197,249,16,92,248,32                ; vmovupd       0x20(%rax,%rdi,8),%xmm3
-  DB  197,121,16,68,248,48                ; vmovupd       0x30(%rax,%rdi,8),%xmm8
-  DB  197,241,97,194                      ; vpunpcklwd    %xmm2,%xmm1,%xmm0
-  DB  197,241,105,202                     ; vpunpckhwd    %xmm2,%xmm1,%xmm1
-  DB  196,193,97,97,208                   ; vpunpcklwd    %xmm8,%xmm3,%xmm2
-  DB  196,193,97,105,216                  ; vpunpckhwd    %xmm8,%xmm3,%xmm3
-  DB  197,121,97,193                      ; vpunpcklwd    %xmm1,%xmm0,%xmm8
-  DB  197,121,105,201                     ; vpunpckhwd    %xmm1,%xmm0,%xmm9
-  DB  197,233,97,203                      ; vpunpcklwd    %xmm3,%xmm2,%xmm1
-  DB  197,233,105,219                     ; vpunpckhwd    %xmm3,%xmm2,%xmm3
-  DB  197,185,108,193                     ; vpunpcklqdq   %xmm1,%xmm8,%xmm0
-  DB  196,226,125,19,192                  ; vcvtph2ps     %xmm0,%ymm0
-  DB  197,185,109,201                     ; vpunpckhqdq   %xmm1,%xmm8,%xmm1
-  DB  196,226,125,19,201                  ; vcvtph2ps     %xmm1,%ymm1
-  DB  197,177,108,211                     ; vpunpcklqdq   %xmm3,%xmm9,%xmm2
-  DB  196,226,125,19,210                  ; vcvtph2ps     %xmm2,%ymm2
-  DB  197,177,109,219                     ; vpunpckhqdq   %xmm3,%xmm9,%xmm3
-  DB  196,226,125,19,219                  ; vcvtph2ps     %xmm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,251,16,12,248                   ; vmovsd        (%rax,%rdi,8),%xmm1
-  DB  196,65,57,87,192                    ; vxorpd        %xmm8,%xmm8,%xmm8
-  DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  117,6                               ; jne           bab <_sk_load_f16_hsw+0x81>
-  DB  197,250,126,201                     ; vmovq         %xmm1,%xmm1
-  DB  235,30                              ; jmp           bc9 <_sk_load_f16_hsw+0x9f>
-  DB  197,241,22,76,248,8                 ; vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,18                              ; jb            bc9 <_sk_load_f16_hsw+0x9f>
-  DB  197,251,16,84,248,16                ; vmovsd        0x10(%rax,%rdi,8),%xmm2
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  117,19                              ; jne           bd6 <_sk_load_f16_hsw+0xac>
-  DB  197,250,126,210                     ; vmovq         %xmm2,%xmm2
-  DB  235,46                              ; jmp           bf7 <_sk_load_f16_hsw+0xcd>
-  DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,117,255,255,255                 ; jmpq          b4b <_sk_load_f16_hsw+0x21>
-  DB  197,233,22,84,248,24                ; vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,21                              ; jb            bf7 <_sk_load_f16_hsw+0xcd>
-  DB  197,251,16,92,248,32                ; vmovsd        0x20(%rax,%rdi,8),%xmm3
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  117,18                              ; jne           c00 <_sk_load_f16_hsw+0xd6>
-  DB  197,250,126,219                     ; vmovq         %xmm3,%xmm3
-  DB  233,84,255,255,255                  ; jmpq          b4b <_sk_load_f16_hsw+0x21>
-  DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,75,255,255,255                  ; jmpq          b4b <_sk_load_f16_hsw+0x21>
-  DB  197,225,22,92,248,40                ; vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
-  DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,59,255,255,255               ; jb            b4b <_sk_load_f16_hsw+0x21>
-  DB  197,123,16,68,248,48                ; vmovsd        0x30(%rax,%rdi,8),%xmm8
-  DB  233,48,255,255,255                  ; jmpq          b4b <_sk_load_f16_hsw+0x21>
-
-PUBLIC _sk_store_f16_hsw
-_sk_store_f16_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  196,195,125,29,192,4                ; vcvtps2ph     $0x4,%ymm0,%xmm8
-  DB  196,195,125,29,201,4                ; vcvtps2ph     $0x4,%ymm1,%xmm9
-  DB  196,195,125,29,210,4                ; vcvtps2ph     $0x4,%ymm2,%xmm10
-  DB  196,195,125,29,219,4                ; vcvtps2ph     $0x4,%ymm3,%xmm11
-  DB  196,65,57,97,225                    ; vpunpcklwd    %xmm9,%xmm8,%xmm12
-  DB  196,65,57,105,193                   ; vpunpckhwd    %xmm9,%xmm8,%xmm8
-  DB  196,65,41,97,203                    ; vpunpcklwd    %xmm11,%xmm10,%xmm9
-  DB  196,65,41,105,235                   ; vpunpckhwd    %xmm11,%xmm10,%xmm13
-  DB  196,65,25,98,217                    ; vpunpckldq    %xmm9,%xmm12,%xmm11
-  DB  196,65,25,106,209                   ; vpunpckhdq    %xmm9,%xmm12,%xmm10
-  DB  196,65,57,98,205                    ; vpunpckldq    %xmm13,%xmm8,%xmm9
-  DB  196,65,57,106,197                   ; vpunpckhdq    %xmm13,%xmm8,%xmm8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,27                              ; jne           c80 <_sk_store_f16_hsw+0x65>
-  DB  197,120,17,28,248                   ; vmovups       %xmm11,(%rax,%rdi,8)
-  DB  197,120,17,84,248,16                ; vmovups       %xmm10,0x10(%rax,%rdi,8)
-  DB  197,120,17,76,248,32                ; vmovups       %xmm9,0x20(%rax,%rdi,8)
-  DB  197,122,127,68,248,48               ; vmovdqu       %xmm8,0x30(%rax,%rdi,8)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,121,214,28,248                  ; vmovq         %xmm11,(%rax,%rdi,8)
-  DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,241                             ; je            c7c <_sk_store_f16_hsw+0x61>
-  DB  197,121,23,92,248,8                 ; vmovhpd       %xmm11,0x8(%rax,%rdi,8)
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,229                             ; jb            c7c <_sk_store_f16_hsw+0x61>
-  DB  197,121,214,84,248,16               ; vmovq         %xmm10,0x10(%rax,%rdi,8)
-  DB  116,221                             ; je            c7c <_sk_store_f16_hsw+0x61>
-  DB  197,121,23,84,248,24                ; vmovhpd       %xmm10,0x18(%rax,%rdi,8)
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,209                             ; jb            c7c <_sk_store_f16_hsw+0x61>
-  DB  197,121,214,76,248,32               ; vmovq         %xmm9,0x20(%rax,%rdi,8)
-  DB  116,201                             ; je            c7c <_sk_store_f16_hsw+0x61>
-  DB  197,121,23,76,248,40                ; vmovhpd       %xmm9,0x28(%rax,%rdi,8)
-  DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,189                             ; jb            c7c <_sk_store_f16_hsw+0x61>
-  DB  197,121,214,68,248,48               ; vmovq         %xmm8,0x30(%rax,%rdi,8)
-  DB  235,181                             ; jmp           c7c <_sk_store_f16_hsw+0x61>
-
-PUBLIC _sk_store_f32_hsw
-_sk_store_f32_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,0                            ; mov           (%rax),%r8
-  DB  72,141,4,189,0,0,0,0                ; lea           0x0(,%rdi,4),%rax
-  DB  197,124,20,193                      ; vunpcklps     %ymm1,%ymm0,%ymm8
-  DB  197,124,21,217                      ; vunpckhps     %ymm1,%ymm0,%ymm11
-  DB  197,108,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm9
-  DB  197,108,21,227                      ; vunpckhps     %ymm3,%ymm2,%ymm12
-  DB  196,65,61,20,209                    ; vunpcklpd     %ymm9,%ymm8,%ymm10
-  DB  196,65,61,21,201                    ; vunpckhpd     %ymm9,%ymm8,%ymm9
-  DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
-  DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           d34 <_sk_store_f32_hsw+0x6d>
-  DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
-  DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
-  DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
-  DB  196,67,61,6,195,49                  ; vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
-  DB  196,65,125,17,36,128                ; vmovupd       %ymm12,(%r8,%rax,4)
-  DB  196,65,125,17,108,128,32            ; vmovupd       %ymm13,0x20(%r8,%rax,4)
-  DB  196,65,125,17,76,128,64             ; vmovupd       %ymm9,0x40(%r8,%rax,4)
-  DB  196,65,125,17,68,128,96             ; vmovupd       %ymm8,0x60(%r8,%rax,4)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
-  DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            d30 <_sk_store_f32_hsw+0x69>
-  DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            d30 <_sk_store_f32_hsw+0x69>
-  DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            d30 <_sk_store_f32_hsw+0x69>
-  DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            d30 <_sk_store_f32_hsw+0x69>
-  DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            d30 <_sk_store_f32_hsw+0x69>
-  DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
-  DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            d30 <_sk_store_f32_hsw+0x69>
-  DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           d30 <_sk_store_f32_hsw+0x69>
-
-PUBLIC _sk_clamp_x_hsw
-_sk_clamp_x_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,188,95,192                      ; vmaxps        %ymm0,%ymm8,%ymm0
-  DB  196,98,125,88,0                     ; vpbroadcastd  (%rax),%ymm8
-  DB  196,65,53,118,201                   ; vpcmpeqd      %ymm9,%ymm9,%ymm9
-  DB  196,65,61,254,193                   ; vpaddd        %ymm9,%ymm8,%ymm8
-  DB  196,193,124,93,192                  ; vminps        %ymm8,%ymm0,%ymm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_y_hsw
-_sk_clamp_y_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,188,95,201                      ; vmaxps        %ymm1,%ymm8,%ymm1
-  DB  196,98,125,88,0                     ; vpbroadcastd  (%rax),%ymm8
-  DB  196,65,53,118,201                   ; vpcmpeqd      %ymm9,%ymm9,%ymm9
-  DB  196,65,61,254,193                   ; vpaddd        %ymm9,%ymm8,%ymm8
-  DB  196,193,116,93,200                  ; vminps        %ymm8,%ymm1,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_x_hsw
-_sk_repeat_x_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,65,124,94,200                   ; vdivps        %ymm8,%ymm0,%ymm9
-  DB  196,67,125,8,201,1                  ; vroundps      $0x1,%ymm9,%ymm9
-  DB  196,98,61,172,200                   ; vfnmadd213ps  %ymm0,%ymm8,%ymm9
-  DB  197,253,118,192                     ; vpcmpeqd      %ymm0,%ymm0,%ymm0
-  DB  197,189,254,192                     ; vpaddd        %ymm0,%ymm8,%ymm0
-  DB  197,180,93,192                      ; vminps        %ymm0,%ymm9,%ymm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_y_hsw
-_sk_repeat_y_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,65,116,94,200                   ; vdivps        %ymm8,%ymm1,%ymm9
-  DB  196,67,125,8,201,1                  ; vroundps      $0x1,%ymm9,%ymm9
-  DB  196,98,61,172,201                   ; vfnmadd213ps  %ymm1,%ymm8,%ymm9
-  DB  197,245,118,201                     ; vpcmpeqd      %ymm1,%ymm1,%ymm1
-  DB  197,189,254,201                     ; vpaddd        %ymm1,%ymm8,%ymm1
-  DB  197,180,93,201                      ; vminps        %ymm1,%ymm9,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_x_hsw
-_sk_mirror_x_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,122,16,0                        ; vmovss        (%rax),%xmm8
-  DB  196,66,125,24,200                   ; vbroadcastss  %xmm8,%ymm9
-  DB  196,65,124,92,209                   ; vsubps        %ymm9,%ymm0,%ymm10
-  DB  196,193,58,88,192                   ; vaddss        %xmm8,%xmm8,%xmm0
-  DB  196,226,125,24,192                  ; vbroadcastss  %xmm0,%ymm0
-  DB  197,44,94,192                       ; vdivps        %ymm0,%ymm10,%ymm8
-  DB  196,67,125,8,192,1                  ; vroundps      $0x1,%ymm8,%ymm8
-  DB  196,66,125,172,194                  ; vfnmadd213ps  %ymm10,%ymm0,%ymm8
-  DB  196,193,60,92,193                   ; vsubps        %ymm9,%ymm8,%ymm0
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,60,92,192                       ; vsubps        %ymm0,%ymm8,%ymm8
-  DB  197,188,84,192                      ; vandps        %ymm0,%ymm8,%ymm0
-  DB  196,65,61,118,192                   ; vpcmpeqd      %ymm8,%ymm8,%ymm8
-  DB  196,65,53,254,192                   ; vpaddd        %ymm8,%ymm9,%ymm8
-  DB  196,193,124,93,192                  ; vminps        %ymm8,%ymm0,%ymm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_y_hsw
-_sk_mirror_y_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,122,16,0                        ; vmovss        (%rax),%xmm8
-  DB  196,66,125,24,200                   ; vbroadcastss  %xmm8,%ymm9
-  DB  196,65,116,92,209                   ; vsubps        %ymm9,%ymm1,%ymm10
-  DB  196,193,58,88,200                   ; vaddss        %xmm8,%xmm8,%xmm1
-  DB  196,226,125,24,201                  ; vbroadcastss  %xmm1,%ymm1
-  DB  197,44,94,193                       ; vdivps        %ymm1,%ymm10,%ymm8
-  DB  196,67,125,8,192,1                  ; vroundps      $0x1,%ymm8,%ymm8
-  DB  196,66,117,172,194                  ; vfnmadd213ps  %ymm10,%ymm1,%ymm8
-  DB  196,193,60,92,201                   ; vsubps        %ymm9,%ymm8,%ymm1
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,60,92,193                       ; vsubps        %ymm1,%ymm8,%ymm8
-  DB  197,188,84,201                      ; vandps        %ymm1,%ymm8,%ymm1
-  DB  196,65,61,118,192                   ; vpcmpeqd      %ymm8,%ymm8,%ymm8
-  DB  196,65,53,254,192                   ; vpaddd        %ymm8,%ymm9,%ymm8
-  DB  196,193,116,93,200                  ; vminps        %ymm8,%ymm1,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_2x3_hsw
-_sk_matrix_2x3_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,8                     ; vbroadcastss  (%rax),%ymm9
-  DB  196,98,125,24,80,8                  ; vbroadcastss  0x8(%rax),%ymm10
-  DB  196,98,125,24,64,16                 ; vbroadcastss  0x10(%rax),%ymm8
-  DB  196,66,117,184,194                  ; vfmadd231ps   %ymm10,%ymm1,%ymm8
-  DB  196,66,125,184,193                  ; vfmadd231ps   %ymm9,%ymm0,%ymm8
-  DB  196,98,125,24,80,4                  ; vbroadcastss  0x4(%rax),%ymm10
-  DB  196,98,125,24,88,12                 ; vbroadcastss  0xc(%rax),%ymm11
-  DB  196,98,125,24,72,20                 ; vbroadcastss  0x14(%rax),%ymm9
-  DB  196,66,117,184,203                  ; vfmadd231ps   %ymm11,%ymm1,%ymm9
-  DB  196,66,125,184,202                  ; vfmadd231ps   %ymm10,%ymm0,%ymm9
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,124,41,192                      ; vmovaps       %ymm8,%ymm0
-  DB  197,124,41,201                      ; vmovaps       %ymm9,%ymm1
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_3x4_hsw
-_sk_matrix_3x4_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,8                     ; vbroadcastss  (%rax),%ymm9
-  DB  196,98,125,24,80,12                 ; vbroadcastss  0xc(%rax),%ymm10
-  DB  196,98,125,24,88,24                 ; vbroadcastss  0x18(%rax),%ymm11
-  DB  196,98,125,24,64,36                 ; vbroadcastss  0x24(%rax),%ymm8
-  DB  196,66,109,184,195                  ; vfmadd231ps   %ymm11,%ymm2,%ymm8
-  DB  196,66,117,184,194                  ; vfmadd231ps   %ymm10,%ymm1,%ymm8
-  DB  196,66,125,184,193                  ; vfmadd231ps   %ymm9,%ymm0,%ymm8
-  DB  196,98,125,24,80,4                  ; vbroadcastss  0x4(%rax),%ymm10
-  DB  196,98,125,24,88,16                 ; vbroadcastss  0x10(%rax),%ymm11
-  DB  196,98,125,24,96,28                 ; vbroadcastss  0x1c(%rax),%ymm12
-  DB  196,98,125,24,72,40                 ; vbroadcastss  0x28(%rax),%ymm9
-  DB  196,66,109,184,204                  ; vfmadd231ps   %ymm12,%ymm2,%ymm9
-  DB  196,66,117,184,203                  ; vfmadd231ps   %ymm11,%ymm1,%ymm9
-  DB  196,66,125,184,202                  ; vfmadd231ps   %ymm10,%ymm0,%ymm9
-  DB  196,98,125,24,88,8                  ; vbroadcastss  0x8(%rax),%ymm11
-  DB  196,98,125,24,96,20                 ; vbroadcastss  0x14(%rax),%ymm12
-  DB  196,98,125,24,104,32                ; vbroadcastss  0x20(%rax),%ymm13
-  DB  196,98,125,24,80,44                 ; vbroadcastss  0x2c(%rax),%ymm10
-  DB  196,66,109,184,213                  ; vfmadd231ps   %ymm13,%ymm2,%ymm10
-  DB  196,66,117,184,212                  ; vfmadd231ps   %ymm12,%ymm1,%ymm10
-  DB  196,66,125,184,211                  ; vfmadd231ps   %ymm11,%ymm0,%ymm10
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,124,41,192                      ; vmovaps       %ymm8,%ymm0
-  DB  197,124,41,201                      ; vmovaps       %ymm9,%ymm1
-  DB  197,124,41,210                      ; vmovaps       %ymm10,%ymm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_perspective_hsw
-_sk_matrix_perspective_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,98,125,24,72,4                  ; vbroadcastss  0x4(%rax),%ymm9
-  DB  196,98,125,24,80,8                  ; vbroadcastss  0x8(%rax),%ymm10
-  DB  196,66,117,184,209                  ; vfmadd231ps   %ymm9,%ymm1,%ymm10
-  DB  196,66,125,184,208                  ; vfmadd231ps   %ymm8,%ymm0,%ymm10
-  DB  196,98,125,24,64,12                 ; vbroadcastss  0xc(%rax),%ymm8
-  DB  196,98,125,24,72,16                 ; vbroadcastss  0x10(%rax),%ymm9
-  DB  196,98,125,24,88,20                 ; vbroadcastss  0x14(%rax),%ymm11
-  DB  196,66,117,184,217                  ; vfmadd231ps   %ymm9,%ymm1,%ymm11
-  DB  196,66,125,184,216                  ; vfmadd231ps   %ymm8,%ymm0,%ymm11
-  DB  196,98,125,24,64,24                 ; vbroadcastss  0x18(%rax),%ymm8
-  DB  196,98,125,24,72,28                 ; vbroadcastss  0x1c(%rax),%ymm9
-  DB  196,98,125,24,96,32                 ; vbroadcastss  0x20(%rax),%ymm12
-  DB  196,66,117,184,225                  ; vfmadd231ps   %ymm9,%ymm1,%ymm12
-  DB  196,66,125,184,224                  ; vfmadd231ps   %ymm8,%ymm0,%ymm12
-  DB  196,193,124,83,204                  ; vrcpps        %ymm12,%ymm1
-  DB  197,172,89,193                      ; vmulps        %ymm1,%ymm10,%ymm0
-  DB  197,164,89,201                      ; vmulps        %ymm1,%ymm11,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_linear_gradient_2stops_hsw
-_sk_linear_gradient_2stops_hsw LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,226,125,24,72,16                ; vbroadcastss  0x10(%rax),%ymm1
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,98,125,184,193                  ; vfmadd231ps   %ymm1,%ymm0,%ymm8
-  DB  196,226,125,24,80,20                ; vbroadcastss  0x14(%rax),%ymm2
-  DB  196,226,125,24,72,4                 ; vbroadcastss  0x4(%rax),%ymm1
-  DB  196,226,125,184,202                 ; vfmadd231ps   %ymm2,%ymm0,%ymm1
-  DB  196,226,125,24,88,24                ; vbroadcastss  0x18(%rax),%ymm3
-  DB  196,226,125,24,80,8                 ; vbroadcastss  0x8(%rax),%ymm2
-  DB  196,226,125,184,211                 ; vfmadd231ps   %ymm3,%ymm0,%ymm2
-  DB  196,98,125,24,72,28                 ; vbroadcastss  0x1c(%rax),%ymm9
-  DB  196,226,125,24,88,12                ; vbroadcastss  0xc(%rax),%ymm3
-  DB  196,194,125,184,217                 ; vfmadd231ps   %ymm9,%ymm0,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,124,41,192                      ; vmovaps       %ymm8,%ymm0
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_start_pipeline_avx
-_sk_start_pipeline_avx LABEL PROC
-  DB  65,87                               ; push          %r15
-  DB  65,86                               ; push          %r14
-  DB  65,85                               ; push          %r13
-  DB  65,84                               ; push          %r12
-  DB  86                                  ; push          %rsi
-  DB  87                                  ; push          %rdi
-  DB  83                                  ; push          %rbx
-  DB  72,129,236,160,0,0,0                ; sub           $0xa0,%rsp
-  DB  197,120,41,188,36,144,0,0,0         ; vmovaps       %xmm15,0x90(%rsp)
-  DB  197,120,41,180,36,128,0,0,0         ; vmovaps       %xmm14,0x80(%rsp)
-  DB  197,120,41,108,36,112               ; vmovaps       %xmm13,0x70(%rsp)
-  DB  197,120,41,100,36,96                ; vmovaps       %xmm12,0x60(%rsp)
-  DB  197,120,41,92,36,80                 ; vmovaps       %xmm11,0x50(%rsp)
-  DB  197,120,41,84,36,64                 ; vmovaps       %xmm10,0x40(%rsp)
-  DB  197,120,41,76,36,48                 ; vmovaps       %xmm9,0x30(%rsp)
-  DB  197,120,41,68,36,32                 ; vmovaps       %xmm8,0x20(%rsp)
-  DB  197,248,41,124,36,16                ; vmovaps       %xmm7,0x10(%rsp)
-  DB  197,248,41,52,36                    ; vmovaps       %xmm6,(%rsp)
-  DB  77,137,205                          ; mov           %r9,%r13
-  DB  77,137,198                          ; mov           %r8,%r14
-  DB  72,137,203                          ; mov           %rcx,%rbx
-  DB  72,137,214                          ; mov           %rdx,%rsi
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  73,137,199                          ; mov           %rax,%r15
-  DB  73,137,244                          ; mov           %rsi,%r12
-  DB  72,141,67,8                         ; lea           0x8(%rbx),%rax
-  DB  76,57,232                           ; cmp           %r13,%rax
-  DB  118,5                               ; jbe           75 <_sk_start_pipeline_avx+0x75>
-  DB  72,137,223                          ; mov           %rbx,%rdi
-  DB  235,65                              ; jmp           b6 <_sk_start_pipeline_avx+0xb6>
-  DB  185,0,0,0,0                         ; mov           $0x0,%ecx
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  197,220,87,228                      ; vxorps        %ymm4,%ymm4,%ymm4
-  DB  197,212,87,237                      ; vxorps        %ymm5,%ymm5,%ymm5
-  DB  197,204,87,246                      ; vxorps        %ymm6,%ymm6,%ymm6
-  DB  197,196,87,255                      ; vxorps        %ymm7,%ymm7,%ymm7
-  DB  72,137,223                          ; mov           %rbx,%rdi
-  DB  76,137,230                          ; mov           %r12,%rsi
-  DB  76,137,242                          ; mov           %r14,%rdx
-  DB  65,255,215                          ; callq         *%r15
-  DB  72,141,123,8                        ; lea           0x8(%rbx),%rdi
-  DB  72,131,195,16                       ; add           $0x10,%rbx
-  DB  76,57,235                           ; cmp           %r13,%rbx
-  DB  72,137,251                          ; mov           %rdi,%rbx
-  DB  118,191                             ; jbe           75 <_sk_start_pipeline_avx+0x75>
-  DB  76,137,233                          ; mov           %r13,%rcx
-  DB  72,41,249                           ; sub           %rdi,%rcx
-  DB  116,41                              ; je            e7 <_sk_start_pipeline_avx+0xe7>
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  197,220,87,228                      ; vxorps        %ymm4,%ymm4,%ymm4
-  DB  197,212,87,237                      ; vxorps        %ymm5,%ymm5,%ymm5
-  DB  197,204,87,246                      ; vxorps        %ymm6,%ymm6,%ymm6
-  DB  197,196,87,255                      ; vxorps        %ymm7,%ymm7,%ymm7
-  DB  76,137,230                          ; mov           %r12,%rsi
-  DB  76,137,242                          ; mov           %r14,%rdx
-  DB  65,255,215                          ; callq         *%r15
-  DB  76,137,232                          ; mov           %r13,%rax
-  DB  197,248,40,52,36                    ; vmovaps       (%rsp),%xmm6
-  DB  197,248,40,124,36,16                ; vmovaps       0x10(%rsp),%xmm7
-  DB  197,120,40,68,36,32                 ; vmovaps       0x20(%rsp),%xmm8
-  DB  197,120,40,76,36,48                 ; vmovaps       0x30(%rsp),%xmm9
-  DB  197,120,40,84,36,64                 ; vmovaps       0x40(%rsp),%xmm10
-  DB  197,120,40,92,36,80                 ; vmovaps       0x50(%rsp),%xmm11
-  DB  197,120,40,100,36,96                ; vmovaps       0x60(%rsp),%xmm12
-  DB  197,120,40,108,36,112               ; vmovaps       0x70(%rsp),%xmm13
-  DB  197,120,40,180,36,128,0,0,0         ; vmovaps       0x80(%rsp),%xmm14
-  DB  197,120,40,188,36,144,0,0,0         ; vmovaps       0x90(%rsp),%xmm15
-  DB  72,129,196,160,0,0,0                ; add           $0xa0,%rsp
-  DB  91                                  ; pop           %rbx
-  DB  95                                  ; pop           %rdi
-  DB  94                                  ; pop           %rsi
-  DB  65,92                               ; pop           %r12
-  DB  65,93                               ; pop           %r13
-  DB  65,94                               ; pop           %r14
-  DB  65,95                               ; pop           %r15
-  DB  197,248,119                         ; vzeroupper
-  DB  195                                 ; retq
-
-PUBLIC _sk_just_return_avx
-_sk_just_return_avx LABEL PROC
-  DB  195                                 ; retq
-
-PUBLIC _sk_seed_shader_avx
-_sk_seed_shader_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,249,110,199                     ; vmovd         %edi,%xmm0
-  DB  197,249,112,192,0                   ; vpshufd       $0x0,%xmm0,%xmm0
-  DB  196,227,125,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,226,125,24,74,4                 ; vbroadcastss  0x4(%rdx),%ymm1
-  DB  197,252,88,193                      ; vaddps        %ymm1,%ymm0,%ymm0
-  DB  197,252,88,66,20                    ; vaddps        0x14(%rdx),%ymm0,%ymm0
-  DB  196,226,125,24,16                   ; vbroadcastss  (%rax),%ymm2
-  DB  197,252,91,210                      ; vcvtdq2ps     %ymm2,%ymm2
-  DB  197,236,88,201                      ; vaddps        %ymm1,%ymm2,%ymm1
-  DB  196,226,125,24,18                   ; vbroadcastss  (%rdx),%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  197,220,87,228                      ; vxorps        %ymm4,%ymm4,%ymm4
-  DB  197,212,87,237                      ; vxorps        %ymm5,%ymm5,%ymm5
-  DB  197,204,87,246                      ; vxorps        %ymm6,%ymm6,%ymm6
-  DB  197,196,87,255                      ; vxorps        %ymm7,%ymm7,%ymm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_constant_color_avx
-_sk_constant_color_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,226,125,24,0                    ; vbroadcastss  (%rax),%ymm0
-  DB  196,226,125,24,72,4                 ; vbroadcastss  0x4(%rax),%ymm1
-  DB  196,226,125,24,80,8                 ; vbroadcastss  0x8(%rax),%ymm2
-  DB  196,226,125,24,88,12                ; vbroadcastss  0xc(%rax),%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clear_avx
-_sk_clear_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  197,228,87,219                      ; vxorps        %ymm3,%ymm3,%ymm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_plus__avx
-_sk_plus__avx LABEL PROC
-  DB  197,252,88,196                      ; vaddps        %ymm4,%ymm0,%ymm0
-  DB  197,244,88,205                      ; vaddps        %ymm5,%ymm1,%ymm1
-  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
-  DB  197,228,88,223                      ; vaddps        %ymm7,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_srcover_avx
-_sk_srcover_avx LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  197,60,92,195                       ; vsubps        %ymm3,%ymm8,%ymm8
-  DB  197,60,89,204                       ; vmulps        %ymm4,%ymm8,%ymm9
-  DB  197,180,88,192                      ; vaddps        %ymm0,%ymm9,%ymm0
-  DB  197,60,89,205                       ; vmulps        %ymm5,%ymm8,%ymm9
-  DB  197,180,88,201                      ; vaddps        %ymm1,%ymm9,%ymm1
-  DB  197,60,89,206                       ; vmulps        %ymm6,%ymm8,%ymm9
-  DB  197,180,88,210                      ; vaddps        %ymm2,%ymm9,%ymm2
-  DB  197,60,89,199                       ; vmulps        %ymm7,%ymm8,%ymm8
-  DB  197,188,88,219                      ; vaddps        %ymm3,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_dstover_avx
-_sk_dstover_avx LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  197,60,92,199                       ; vsubps        %ymm7,%ymm8,%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,252,88,196                      ; vaddps        %ymm4,%ymm0,%ymm0
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,244,88,205                      ; vaddps        %ymm5,%ymm1,%ymm1
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
-  DB  197,188,89,219                      ; vmulps        %ymm3,%ymm8,%ymm3
-  DB  197,228,88,223                      ; vaddps        %ymm7,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_0_avx
-_sk_clamp_0_avx LABEL PROC
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,193,124,95,192                  ; vmaxps        %ymm8,%ymm0,%ymm0
-  DB  196,193,116,95,200                  ; vmaxps        %ymm8,%ymm1,%ymm1
-  DB  196,193,108,95,208                  ; vmaxps        %ymm8,%ymm2,%ymm2
-  DB  196,193,100,95,216                  ; vmaxps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_1_avx
-_sk_clamp_1_avx LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  196,193,124,93,192                  ; vminps        %ymm8,%ymm0,%ymm0
-  DB  196,193,116,93,200                  ; vminps        %ymm8,%ymm1,%ymm1
-  DB  196,193,108,93,208                  ; vminps        %ymm8,%ymm2,%ymm2
-  DB  196,193,100,93,216                  ; vminps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_a_avx
-_sk_clamp_a_avx LABEL PROC
-  DB  196,98,125,24,2                     ; vbroadcastss  (%rdx),%ymm8
-  DB  196,193,100,93,216                  ; vminps        %ymm8,%ymm3,%ymm3
-  DB  197,252,93,195                      ; vminps        %ymm3,%ymm0,%ymm0
-  DB  197,244,93,203                      ; vminps        %ymm3,%ymm1,%ymm1
-  DB  197,236,93,211                      ; vminps        %ymm3,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_set_rgb_avx
-_sk_set_rgb_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,226,125,24,0                    ; vbroadcastss  (%rax),%ymm0
-  DB  196,226,125,24,72,4                 ; vbroadcastss  0x4(%rax),%ymm1
-  DB  196,226,125,24,80,8                 ; vbroadcastss  0x8(%rax),%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_rb_avx
-_sk_swap_rb_avx LABEL PROC
-  DB  197,124,40,192                      ; vmovaps       %ymm0,%ymm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,194                      ; vmovaps       %ymm2,%ymm0
-  DB  197,124,41,194                      ; vmovaps       %ymm8,%ymm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_avx
-_sk_swap_avx LABEL PROC
-  DB  197,124,40,195                      ; vmovaps       %ymm3,%ymm8
-  DB  197,124,40,202                      ; vmovaps       %ymm2,%ymm9
-  DB  197,124,40,209                      ; vmovaps       %ymm1,%ymm10
-  DB  197,124,40,216                      ; vmovaps       %ymm0,%ymm11
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,196                      ; vmovaps       %ymm4,%ymm0
-  DB  197,252,40,205                      ; vmovaps       %ymm5,%ymm1
-  DB  197,252,40,214                      ; vmovaps       %ymm6,%ymm2
-  DB  197,252,40,223                      ; vmovaps       %ymm7,%ymm3
-  DB  197,124,41,220                      ; vmovaps       %ymm11,%ymm4
-  DB  197,124,41,213                      ; vmovaps       %ymm10,%ymm5
-  DB  197,124,41,206                      ; vmovaps       %ymm9,%ymm6
-  DB  197,124,41,199                      ; vmovaps       %ymm8,%ymm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_src_dst_avx
-_sk_move_src_dst_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,224                      ; vmovaps       %ymm0,%ymm4
-  DB  197,252,40,233                      ; vmovaps       %ymm1,%ymm5
-  DB  197,252,40,242                      ; vmovaps       %ymm2,%ymm6
-  DB  197,252,40,251                      ; vmovaps       %ymm3,%ymm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_dst_src_avx
-_sk_move_dst_src_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,40,196                      ; vmovaps       %ymm4,%ymm0
-  DB  197,252,40,205                      ; vmovaps       %ymm5,%ymm1
-  DB  197,252,40,214                      ; vmovaps       %ymm6,%ymm2
-  DB  197,252,40,223                      ; vmovaps       %ymm7,%ymm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_premul_avx
-_sk_premul_avx LABEL PROC
-  DB  197,252,89,195                      ; vmulps        %ymm3,%ymm0,%ymm0
-  DB  197,244,89,203                      ; vmulps        %ymm3,%ymm1,%ymm1
-  DB  197,236,89,211                      ; vmulps        %ymm3,%ymm2,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_unpremul_avx
-_sk_unpremul_avx LABEL PROC
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  196,65,100,194,200,0                ; vcmpeqps      %ymm8,%ymm3,%ymm9
-  DB  196,98,125,24,18                    ; vbroadcastss  (%rdx),%ymm10
-  DB  197,44,94,211                       ; vdivps        %ymm3,%ymm10,%ymm10
-  DB  196,67,45,74,192,144                ; vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_from_srgb_avx
-_sk_from_srgb_avx LABEL PROC
-  DB  196,98,125,24,66,64                 ; vbroadcastss  0x40(%rdx),%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  197,124,89,208                      ; vmulps        %ymm0,%ymm0,%ymm10
-  DB  196,98,125,24,90,60                 ; vbroadcastss  0x3c(%rdx),%ymm11
-  DB  196,98,125,24,98,56                 ; vbroadcastss  0x38(%rdx),%ymm12
-  DB  197,36,89,232                       ; vmulps        %ymm0,%ymm11,%ymm13
-  DB  196,65,20,88,236                    ; vaddps        %ymm12,%ymm13,%ymm13
-  DB  196,98,125,24,114,52                ; vbroadcastss  0x34(%rdx),%ymm14
-  DB  196,65,44,89,213                    ; vmulps        %ymm13,%ymm10,%ymm10
-  DB  196,65,12,88,210                    ; vaddps        %ymm10,%ymm14,%ymm10
-  DB  196,98,125,24,106,68                ; vbroadcastss  0x44(%rdx),%ymm13
-  DB  196,193,124,194,197,1               ; vcmpltps      %ymm13,%ymm0,%ymm0
-  DB  196,195,45,74,193,0                 ; vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
-  DB  197,60,89,201                       ; vmulps        %ymm1,%ymm8,%ymm9
-  DB  197,116,89,209                      ; vmulps        %ymm1,%ymm1,%ymm10
-  DB  197,36,89,249                       ; vmulps        %ymm1,%ymm11,%ymm15
-  DB  196,65,4,88,252                     ; vaddps        %ymm12,%ymm15,%ymm15
-  DB  196,65,44,89,215                    ; vmulps        %ymm15,%ymm10,%ymm10
-  DB  196,65,12,88,210                    ; vaddps        %ymm10,%ymm14,%ymm10
-  DB  196,193,116,194,205,1               ; vcmpltps      %ymm13,%ymm1,%ymm1
-  DB  196,195,45,74,201,16                ; vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
-  DB  197,60,89,194                       ; vmulps        %ymm2,%ymm8,%ymm8
-  DB  197,108,89,202                      ; vmulps        %ymm2,%ymm2,%ymm9
-  DB  197,36,89,210                       ; vmulps        %ymm2,%ymm11,%ymm10
-  DB  196,65,44,88,212                    ; vaddps        %ymm12,%ymm10,%ymm10
-  DB  196,65,52,89,202                    ; vmulps        %ymm10,%ymm9,%ymm9
-  DB  196,65,12,88,201                    ; vaddps        %ymm9,%ymm14,%ymm9
-  DB  196,193,108,194,213,1               ; vcmpltps      %ymm13,%ymm2,%ymm2
-  DB  196,195,53,74,208,32                ; vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_to_srgb_avx
-_sk_to_srgb_avx LABEL PROC
-  DB  197,124,82,192                      ; vrsqrtps      %ymm0,%ymm8
-  DB  196,65,124,83,200                   ; vrcpps        %ymm8,%ymm9
-  DB  196,65,124,82,208                   ; vrsqrtps      %ymm8,%ymm10
-  DB  196,98,125,24,66,72                 ; vbroadcastss  0x48(%rdx),%ymm8
-  DB  197,60,89,216                       ; vmulps        %ymm0,%ymm8,%ymm11
-  DB  196,98,125,24,34                    ; vbroadcastss  (%rdx),%ymm12
-  DB  196,98,125,24,106,76                ; vbroadcastss  0x4c(%rdx),%ymm13
-  DB  196,98,125,24,114,80                ; vbroadcastss  0x50(%rdx),%ymm14
-  DB  196,98,125,24,122,84                ; vbroadcastss  0x54(%rdx),%ymm15
-  DB  196,65,52,89,206                    ; vmulps        %ymm14,%ymm9,%ymm9
-  DB  196,65,52,88,207                    ; vaddps        %ymm15,%ymm9,%ymm9
-  DB  196,65,44,89,213                    ; vmulps        %ymm13,%ymm10,%ymm10
-  DB  196,65,44,88,201                    ; vaddps        %ymm9,%ymm10,%ymm9
-  DB  196,65,28,93,201                    ; vminps        %ymm9,%ymm12,%ymm9
-  DB  196,98,125,24,82,88                 ; vbroadcastss  0x58(%rdx),%ymm10
-  DB  196,193,124,194,194,1               ; vcmpltps      %ymm10,%ymm0,%ymm0
-  DB  196,195,53,74,195,0                 ; vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
-  DB  197,124,82,201                      ; vrsqrtps      %ymm1,%ymm9
-  DB  196,65,124,83,217                   ; vrcpps        %ymm9,%ymm11
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,12,89,219                    ; vmulps        %ymm11,%ymm14,%ymm11
-  DB  196,65,4,88,219                     ; vaddps        %ymm11,%ymm15,%ymm11
-  DB  196,65,20,89,201                    ; vmulps        %ymm9,%ymm13,%ymm9
-  DB  196,65,52,88,203                    ; vaddps        %ymm11,%ymm9,%ymm9
-  DB  197,60,89,217                       ; vmulps        %ymm1,%ymm8,%ymm11
-  DB  196,65,28,93,201                    ; vminps        %ymm9,%ymm12,%ymm9
-  DB  196,193,116,194,202,1               ; vcmpltps      %ymm10,%ymm1,%ymm1
-  DB  196,195,53,74,203,16                ; vblendvps     %ymm1,%ymm11,%ymm9,%ymm1
-  DB  197,124,82,202                      ; vrsqrtps      %ymm2,%ymm9
-  DB  196,65,124,83,217                   ; vrcpps        %ymm9,%ymm11
-  DB  196,65,12,89,219                    ; vmulps        %ymm11,%ymm14,%ymm11
-  DB  196,65,4,88,219                     ; vaddps        %ymm11,%ymm15,%ymm11
-  DB  196,65,124,82,201                   ; vrsqrtps      %ymm9,%ymm9
-  DB  196,65,20,89,201                    ; vmulps        %ymm9,%ymm13,%ymm9
-  DB  196,65,52,88,203                    ; vaddps        %ymm11,%ymm9,%ymm9
-  DB  196,65,28,93,201                    ; vminps        %ymm9,%ymm12,%ymm9
-  DB  197,60,89,194                       ; vmulps        %ymm2,%ymm8,%ymm8
-  DB  196,193,108,194,210,1               ; vcmpltps      %ymm10,%ymm2,%ymm2
-  DB  196,195,53,74,208,32                ; vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_1_float_avx
-_sk_scale_1_float_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  197,188,89,219                      ; vmulps        %ymm3,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_u8_avx
-_sk_scale_u8_avx LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,1,248                            ; add           %rdi,%rax
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,65                              ; jne           50f <_sk_scale_u8_avx+0x51>
-  DB  197,123,16,0                        ; vmovsd        (%rax),%xmm8
-  DB  196,66,121,49,200                   ; vpmovzxbd     %xmm8,%xmm9
-  DB  196,67,121,4,192,229                ; vpermilps     $0xe5,%xmm8,%xmm8
-  DB  196,66,121,49,192                   ; vpmovzxbd     %xmm8,%xmm8
-  DB  196,67,53,24,192,1                  ; vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
-  DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
-  DB  196,98,125,24,74,12                 ; vbroadcastss  0xc(%rdx),%ymm9
-  DB  196,65,60,89,193                    ; vmulps        %ymm9,%ymm8,%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  197,188,89,219                      ; vmulps        %ymm3,%ymm8,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  49,201                              ; xor           %ecx,%ecx
-  DB  77,137,194                          ; mov           %r8,%r10
-  DB  69,49,201                           ; xor           %r9d,%r9d
-  DB  68,15,182,24                        ; movzbl        (%rax),%r11d
-  DB  72,255,192                          ; inc           %rax
-  DB  73,211,227                          ; shl           %cl,%r11
-  DB  77,9,217                            ; or            %r11,%r9
-  DB  72,131,193,8                        ; add           $0x8,%rcx
-  DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           517 <_sk_scale_u8_avx+0x59>
-  DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  235,158                             ; jmp           4d2 <_sk_scale_u8_avx+0x14>
-
-PUBLIC _sk_lerp_1_float_avx
-_sk_lerp_1_float_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  197,252,92,196                      ; vsubps        %ymm4,%ymm0,%ymm0
-  DB  196,193,124,89,192                  ; vmulps        %ymm8,%ymm0,%ymm0
-  DB  197,252,88,196                      ; vaddps        %ymm4,%ymm0,%ymm0
-  DB  197,244,92,205                      ; vsubps        %ymm5,%ymm1,%ymm1
-  DB  196,193,116,89,200                  ; vmulps        %ymm8,%ymm1,%ymm1
-  DB  197,244,88,205                      ; vaddps        %ymm5,%ymm1,%ymm1
-  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
-  DB  196,193,108,89,208                  ; vmulps        %ymm8,%ymm2,%ymm2
-  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
-  DB  197,228,92,223                      ; vsubps        %ymm7,%ymm3,%ymm3
-  DB  196,193,100,89,216                  ; vmulps        %ymm8,%ymm3,%ymm3
-  DB  197,228,88,223                      ; vaddps        %ymm7,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_u8_avx
-_sk_lerp_u8_avx LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,1,248                            ; add           %rdi,%rax
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,101                             ; jne           5e8 <_sk_lerp_u8_avx+0x75>
-  DB  197,123,16,0                        ; vmovsd        (%rax),%xmm8
-  DB  196,66,121,49,200                   ; vpmovzxbd     %xmm8,%xmm9
-  DB  196,67,121,4,192,229                ; vpermilps     $0xe5,%xmm8,%xmm8
-  DB  196,66,121,49,192                   ; vpmovzxbd     %xmm8,%xmm8
-  DB  196,67,53,24,192,1                  ; vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
-  DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
-  DB  196,98,125,24,74,12                 ; vbroadcastss  0xc(%rdx),%ymm9
-  DB  196,65,60,89,193                    ; vmulps        %ymm9,%ymm8,%ymm8
-  DB  197,252,92,196                      ; vsubps        %ymm4,%ymm0,%ymm0
-  DB  196,193,124,89,192                  ; vmulps        %ymm8,%ymm0,%ymm0
-  DB  197,252,88,196                      ; vaddps        %ymm4,%ymm0,%ymm0
-  DB  197,244,92,205                      ; vsubps        %ymm5,%ymm1,%ymm1
-  DB  196,193,116,89,200                  ; vmulps        %ymm8,%ymm1,%ymm1
-  DB  197,244,88,205                      ; vaddps        %ymm5,%ymm1,%ymm1
-  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
-  DB  196,193,108,89,208                  ; vmulps        %ymm8,%ymm2,%ymm2
-  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
-  DB  197,228,92,223                      ; vsubps        %ymm7,%ymm3,%ymm3
-  DB  196,193,100,89,216                  ; vmulps        %ymm8,%ymm3,%ymm3
-  DB  197,228,88,223                      ; vaddps        %ymm7,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  49,201                              ; xor           %ecx,%ecx
-  DB  77,137,194                          ; mov           %r8,%r10
-  DB  69,49,201                           ; xor           %r9d,%r9d
-  DB  68,15,182,24                        ; movzbl        (%rax),%r11d
-  DB  72,255,192                          ; inc           %rax
-  DB  73,211,227                          ; shl           %cl,%r11
-  DB  77,9,217                            ; or            %r11,%r9
-  DB  72,131,193,8                        ; add           $0x8,%rcx
-  DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           5f0 <_sk_lerp_u8_avx+0x7d>
-  DB  196,65,249,110,193                  ; vmovq         %r9,%xmm8
-  DB  233,119,255,255,255                 ; jmpq          587 <_sk_lerp_u8_avx+0x14>
-
-PUBLIC _sk_lerp_565_avx
-_sk_lerp_565_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,16                           ; mov           (%rax),%r10
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,148,0,0,0                    ; jne           6b2 <_sk_lerp_565_avx+0xa2>
-  DB  196,65,122,111,4,122                ; vmovdqu       (%r10,%rdi,2),%xmm8
-  DB  197,225,239,219                     ; vpxor         %xmm3,%xmm3,%xmm3
-  DB  197,185,105,219                     ; vpunpckhwd    %xmm3,%xmm8,%xmm3
-  DB  196,66,121,51,192                   ; vpmovzxwd     %xmm8,%xmm8
-  DB  196,227,61,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  DB  196,98,125,24,66,104                ; vbroadcastss  0x68(%rdx),%ymm8
-  DB  197,60,84,195                       ; vandps        %ymm3,%ymm8,%ymm8
-  DB  196,65,124,91,192                   ; vcvtdq2ps     %ymm8,%ymm8
-  DB  196,98,125,24,74,116                ; vbroadcastss  0x74(%rdx),%ymm9
-  DB  196,65,52,89,192                    ; vmulps        %ymm8,%ymm9,%ymm8
-  DB  196,98,125,24,74,108                ; vbroadcastss  0x6c(%rdx),%ymm9
-  DB  197,52,84,203                       ; vandps        %ymm3,%ymm9,%ymm9
-  DB  196,65,124,91,201                   ; vcvtdq2ps     %ymm9,%ymm9
-  DB  196,98,125,24,82,120                ; vbroadcastss  0x78(%rdx),%ymm10
-  DB  196,65,44,89,201                    ; vmulps        %ymm9,%ymm10,%ymm9
-  DB  196,98,125,24,82,112                ; vbroadcastss  0x70(%rdx),%ymm10
-  DB  197,172,84,219                      ; vandps        %ymm3,%ymm10,%ymm3
-  DB  197,252,91,219                      ; vcvtdq2ps     %ymm3,%ymm3
-  DB  196,98,125,24,82,124                ; vbroadcastss  0x7c(%rdx),%ymm10
-  DB  197,172,89,219                      ; vmulps        %ymm3,%ymm10,%ymm3
-  DB  197,252,92,196                      ; vsubps        %ymm4,%ymm0,%ymm0
-  DB  196,193,124,89,192                  ; vmulps        %ymm8,%ymm0,%ymm0
-  DB  197,252,88,196                      ; vaddps        %ymm4,%ymm0,%ymm0
-  DB  197,244,92,205                      ; vsubps        %ymm5,%ymm1,%ymm1
-  DB  196,193,116,89,201                  ; vmulps        %ymm9,%ymm1,%ymm1
-  DB  197,244,88,205                      ; vaddps        %ymm5,%ymm1,%ymm1
-  DB  197,236,92,214                      ; vsubps        %ymm6,%ymm2,%ymm2
-  DB  197,236,89,211                      ; vmulps        %ymm3,%ymm2,%ymm2
-  DB  197,236,88,214                      ; vaddps        %ymm6,%ymm2,%ymm2
-  DB  196,226,125,24,26                   ; vbroadcastss  (%rdx),%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  65,137,200                          ; mov           %ecx,%r8d
-  DB  65,128,224,7                        ; and           $0x7,%r8b
-  DB  196,65,57,239,192                   ; vpxor         %xmm8,%xmm8,%xmm8
-  DB  65,254,200                          ; dec           %r8b
-  DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,85,255,255,255               ; ja            624 <_sk_lerp_565_avx+0x14>
-  DB  76,141,13,74,0,0,0                  ; lea           0x4a(%rip),%r9        # 720 <_sk_lerp_565_avx+0x110>
-  DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
-  DB  76,1,200                            ; add           %r9,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,225,239,219                     ; vpxor         %xmm3,%xmm3,%xmm3
-  DB  196,65,97,196,68,122,12,6           ; vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm3,%xmm8
-  DB  196,65,57,196,68,122,10,5           ; vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm8,%xmm8
-  DB  196,65,57,196,68,122,8,4            ; vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm8,%xmm8
-  DB  196,65,57,196,68,122,6,3            ; vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm8,%xmm8
-  DB  196,65,57,196,68,122,4,2            ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm8,%xmm8
-  DB  196,65,57,196,68,122,2,1            ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm8,%xmm8
-  DB  196,65,57,196,4,122,0               ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm8,%xmm8
-  DB  233,5,255,255,255                   ; jmpq          624 <_sk_lerp_565_avx+0x14>
-  DB  144                                 ; nop
-  DB  243,255                             ; repz          (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  235,255                             ; jmp           725 <_sk_lerp_565_avx+0x115>
-  DB  255                                 ; (bad)
-  DB  255,227                             ; jmpq          *%rbx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  219,255                             ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,211                             ; callq         *%rbx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,203                             ; dec           %ebx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  191                                 ; .byte         0xbf
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_tables_avx
-_sk_load_tables_avx LABEL PROC
-  DB  85                                  ; push          %rbp
-  DB  65,87                               ; push          %r15
-  DB  65,86                               ; push          %r14
-  DB  65,85                               ; push          %r13
-  DB  65,84                               ; push          %r12
-  DB  83                                  ; push          %rbx
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,0                            ; mov           (%rax),%r8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,18,2,0,0                     ; jne           966 <_sk_load_tables_avx+0x22a>
-  DB  196,65,124,16,4,184                 ; vmovups       (%r8,%rdi,4),%ymm8
-  DB  196,98,125,24,74,16                 ; vbroadcastss  0x10(%rdx),%ymm9
-  DB  196,193,52,84,192                   ; vandps        %ymm8,%ymm9,%ymm0
-  DB  196,193,249,126,193                 ; vmovq         %xmm0,%r9
-  DB  69,137,203                          ; mov           %r9d,%r11d
-  DB  196,195,249,22,194,1                ; vpextrq       $0x1,%xmm0,%r10
-  DB  69,137,214                          ; mov           %r10d,%r14d
-  DB  73,193,234,32                       ; shr           $0x20,%r10
-  DB  73,193,233,32                       ; shr           $0x20,%r9
-  DB  196,227,125,25,192,1                ; vextractf128  $0x1,%ymm0,%xmm0
-  DB  196,193,249,126,196                 ; vmovq         %xmm0,%r12
-  DB  69,137,231                          ; mov           %r12d,%r15d
-  DB  196,227,249,22,195,1                ; vpextrq       $0x1,%xmm0,%rbx
-  DB  65,137,221                          ; mov           %ebx,%r13d
-  DB  72,193,235,32                       ; shr           $0x20,%rbx
-  DB  73,193,236,32                       ; shr           $0x20,%r12
-  DB  72,139,104,8                        ; mov           0x8(%rax),%rbp
-  DB  76,139,64,16                        ; mov           0x10(%rax),%r8
-  DB  196,161,122,16,68,189,0             ; vmovss        0x0(%rbp,%r15,4),%xmm0
-  DB  196,163,121,33,68,165,0,16          ; vinsertps     $0x10,0x0(%rbp,%r12,4),%xmm0,%xmm0
-  DB  196,163,121,33,68,173,0,32          ; vinsertps     $0x20,0x0(%rbp,%r13,4),%xmm0,%xmm0
-  DB  197,250,16,76,157,0                 ; vmovss        0x0(%rbp,%rbx,4),%xmm1
-  DB  196,227,121,33,193,48               ; vinsertps     $0x30,%xmm1,%xmm0,%xmm0
-  DB  196,161,122,16,76,157,0             ; vmovss        0x0(%rbp,%r11,4),%xmm1
-  DB  196,163,113,33,76,141,0,16          ; vinsertps     $0x10,0x0(%rbp,%r9,4),%xmm1,%xmm1
-  DB  196,163,113,33,76,181,0,32          ; vinsertps     $0x20,0x0(%rbp,%r14,4),%xmm1,%xmm1
-  DB  196,161,122,16,92,149,0             ; vmovss        0x0(%rbp,%r10,4),%xmm3
-  DB  196,227,113,33,203,48               ; vinsertps     $0x30,%xmm3,%xmm1,%xmm1
-  DB  196,227,117,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
-  DB  196,193,113,114,208,8               ; vpsrld        $0x8,%xmm8,%xmm1
-  DB  196,67,125,25,194,1                 ; vextractf128  $0x1,%ymm8,%xmm10
-  DB  196,193,105,114,210,8               ; vpsrld        $0x8,%xmm10,%xmm2
-  DB  196,227,117,24,202,1                ; vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
-  DB  197,180,84,201                      ; vandps        %ymm1,%ymm9,%ymm1
-  DB  196,193,249,126,201                 ; vmovq         %xmm1,%r9
-  DB  69,137,203                          ; mov           %r9d,%r11d
-  DB  196,195,249,22,202,1                ; vpextrq       $0x1,%xmm1,%r10
-  DB  69,137,214                          ; mov           %r10d,%r14d
-  DB  73,193,234,32                       ; shr           $0x20,%r10
-  DB  73,193,233,32                       ; shr           $0x20,%r9
-  DB  196,227,125,25,201,1                ; vextractf128  $0x1,%ymm1,%xmm1
-  DB  196,225,249,126,205                 ; vmovq         %xmm1,%rbp
-  DB  65,137,239                          ; mov           %ebp,%r15d
-  DB  196,227,249,22,203,1                ; vpextrq       $0x1,%xmm1,%rbx
-  DB  65,137,220                          ; mov           %ebx,%r12d
-  DB  72,193,235,32                       ; shr           $0x20,%rbx
-  DB  72,193,237,32                       ; shr           $0x20,%rbp
-  DB  196,129,122,16,12,184               ; vmovss        (%r8,%r15,4),%xmm1
-  DB  196,195,113,33,12,168,16            ; vinsertps     $0x10,(%r8,%rbp,4),%xmm1,%xmm1
-  DB  196,129,122,16,20,160               ; vmovss        (%r8,%r12,4),%xmm2
-  DB  196,227,113,33,202,32               ; vinsertps     $0x20,%xmm2,%xmm1,%xmm1
-  DB  196,193,122,16,20,152               ; vmovss        (%r8,%rbx,4),%xmm2
-  DB  196,227,113,33,202,48               ; vinsertps     $0x30,%xmm2,%xmm1,%xmm1
-  DB  196,129,122,16,20,152               ; vmovss        (%r8,%r11,4),%xmm2
-  DB  196,131,105,33,20,136,16            ; vinsertps     $0x10,(%r8,%r9,4),%xmm2,%xmm2
-  DB  196,129,122,16,28,176               ; vmovss        (%r8,%r14,4),%xmm3
-  DB  196,227,105,33,211,32               ; vinsertps     $0x20,%xmm3,%xmm2,%xmm2
-  DB  196,129,122,16,28,144               ; vmovss        (%r8,%r10,4),%xmm3
-  DB  196,227,105,33,211,48               ; vinsertps     $0x30,%xmm3,%xmm2,%xmm2
-  DB  196,227,109,24,201,1                ; vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  DB  72,139,64,24                        ; mov           0x18(%rax),%rax
-  DB  196,193,105,114,208,16              ; vpsrld        $0x10,%xmm8,%xmm2
-  DB  196,193,97,114,210,16               ; vpsrld        $0x10,%xmm10,%xmm3
-  DB  196,227,109,24,211,1                ; vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
-  DB  197,180,84,210                      ; vandps        %ymm2,%ymm9,%ymm2
-  DB  196,193,249,126,208                 ; vmovq         %xmm2,%r8
-  DB  69,137,194                          ; mov           %r8d,%r10d
-  DB  196,195,249,22,209,1                ; vpextrq       $0x1,%xmm2,%r9
-  DB  69,137,203                          ; mov           %r9d,%r11d
-  DB  73,193,233,32                       ; shr           $0x20,%r9
-  DB  73,193,232,32                       ; shr           $0x20,%r8
-  DB  196,227,125,25,210,1                ; vextractf128  $0x1,%ymm2,%xmm2
-  DB  196,225,249,126,213                 ; vmovq         %xmm2,%rbp
-  DB  65,137,238                          ; mov           %ebp,%r14d
-  DB  196,227,249,22,211,1                ; vpextrq       $0x1,%xmm2,%rbx
-  DB  65,137,223                          ; mov           %ebx,%r15d
-  DB  72,193,235,32                       ; shr           $0x20,%rbx
-  DB  72,193,237,32                       ; shr           $0x20,%rbp
-  DB  196,161,122,16,20,176               ; vmovss        (%rax,%r14,4),%xmm2
-  DB  196,227,105,33,20,168,16            ; vinsertps     $0x10,(%rax,%rbp,4),%xmm2,%xmm2
-  DB  196,161,122,16,28,184               ; vmovss        (%rax,%r15,4),%xmm3
-  DB  196,227,105,33,211,32               ; vinsertps     $0x20,%xmm3,%xmm2,%xmm2
-  DB  197,250,16,28,152                   ; vmovss        (%rax,%rbx,4),%xmm3
-  DB  196,99,105,33,203,48                ; vinsertps     $0x30,%xmm3,%xmm2,%xmm9
-  DB  196,161,122,16,28,144               ; vmovss        (%rax,%r10,4),%xmm3
-  DB  196,163,97,33,28,128,16             ; vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
-  DB  196,161,122,16,20,152               ; vmovss        (%rax,%r11,4),%xmm2
-  DB  196,227,97,33,210,32                ; vinsertps     $0x20,%xmm2,%xmm3,%xmm2
-  DB  196,161,122,16,28,136               ; vmovss        (%rax,%r9,4),%xmm3
-  DB  196,227,105,33,211,48               ; vinsertps     $0x30,%xmm3,%xmm2,%xmm2
-  DB  196,195,109,24,209,1                ; vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
-  DB  196,193,57,114,208,24               ; vpsrld        $0x18,%xmm8,%xmm8
-  DB  196,193,97,114,210,24               ; vpsrld        $0x18,%xmm10,%xmm3
-  DB  196,227,61,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  DB  197,252,91,219                      ; vcvtdq2ps     %ymm3,%ymm3
-  DB  196,98,125,24,66,12                 ; vbroadcastss  0xc(%rdx),%ymm8
-  DB  196,193,100,89,216                  ; vmulps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  91                                  ; pop           %rbx
-  DB  65,92                               ; pop           %r12
-  DB  65,93                               ; pop           %r13
-  DB  65,94                               ; pop           %r14
-  DB  65,95                               ; pop           %r15
-  DB  93                                  ; pop           %rbp
-  DB  255,224                             ; jmpq          *%rax
-  DB  65,137,201                          ; mov           %ecx,%r9d
-  DB  65,128,225,7                        ; and           $0x7,%r9b
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  65,254,201                          ; dec           %r9b
-  DB  69,15,182,201                       ; movzbl        %r9b,%r9d
-  DB  65,128,249,6                        ; cmp           $0x6,%r9b
-  DB  15,135,215,253,255,255              ; ja            75a <_sk_load_tables_avx+0x1e>
-  DB  76,141,21,138,0,0,0                 ; lea           0x8a(%rip),%r10        # a14 <_sk_load_tables_avx+0x2d8>
-  DB  79,99,12,138                        ; movslq        (%r10,%r9,4),%r9
-  DB  77,1,209                            ; add           %r10,%r9
-  DB  65,255,225                          ; jmpq          *%r9
-  DB  196,193,121,110,68,184,24           ; vmovd         0x18(%r8,%rdi,4),%xmm0
-  DB  197,249,112,192,68                  ; vpshufd       $0x44,%xmm0,%xmm0
-  DB  196,227,125,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  196,99,117,12,192,64                ; vblendps      $0x40,%ymm0,%ymm1,%ymm8
-  DB  196,99,125,25,192,1                 ; vextractf128  $0x1,%ymm8,%xmm0
-  DB  196,195,121,34,68,184,20,1          ; vpinsrd       $0x1,0x14(%r8,%rdi,4),%xmm0,%xmm0
-  DB  196,99,61,24,192,1                  ; vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
-  DB  196,99,125,25,192,1                 ; vextractf128  $0x1,%ymm8,%xmm0
-  DB  196,195,121,34,68,184,16,0          ; vpinsrd       $0x0,0x10(%r8,%rdi,4),%xmm0,%xmm0
-  DB  196,99,61,24,192,1                  ; vinsertf128   $0x1,%xmm0,%ymm8,%ymm8
-  DB  196,195,57,34,68,184,12,3           ; vpinsrd       $0x3,0xc(%r8,%rdi,4),%xmm8,%xmm0
-  DB  196,99,61,12,192,15                 ; vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  DB  196,195,57,34,68,184,8,2            ; vpinsrd       $0x2,0x8(%r8,%rdi,4),%xmm8,%xmm0
-  DB  196,99,61,12,192,15                 ; vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  DB  196,195,57,34,68,184,4,1            ; vpinsrd       $0x1,0x4(%r8,%rdi,4),%xmm8,%xmm0
-  DB  196,99,61,12,192,15                 ; vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  DB  196,195,57,34,4,184,0               ; vpinsrd       $0x0,(%r8,%rdi,4),%xmm8,%xmm0
-  DB  196,99,61,12,192,15                 ; vblendps      $0xf,%ymm0,%ymm8,%ymm8
-  DB  233,70,253,255,255                  ; jmpq          75a <_sk_load_tables_avx+0x1e>
-  DB  238                                 ; out           %al,(%dx)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,224                             ; jmpq          *%rax
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,210                             ; callq         *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,196                             ; inc           %esp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,176,255,255,255,156             ; pushq         -0x63000001(%rax)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-  DB  128,255,255                         ; cmp           $0xff,%bh
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_a8_avx
-_sk_load_a8_avx LABEL PROC
-  DB  73,137,200                          ; mov           %rcx,%r8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,1,248                            ; add           %rdi,%rax
-  DB  77,133,192                          ; test          %r8,%r8
-  DB  117,59                              ; jne           a7b <_sk_load_a8_avx+0x4b>
-  DB  197,251,16,0                        ; vmovsd        (%rax),%xmm0
-  DB  196,226,121,49,200                  ; vpmovzxbd     %xmm0,%xmm1
-  DB  196,227,121,4,192,229               ; vpermilps     $0xe5,%xmm0,%xmm0
-  DB  196,226,121,49,192                  ; vpmovzxbd     %xmm0,%xmm0
-  DB  196,227,117,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,226,125,24,74,12                ; vbroadcastss  0xc(%rdx),%ymm1
-  DB  197,252,89,217                      ; vmulps        %ymm1,%ymm0,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,252,87,192                      ; vxorps        %ymm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  197,236,87,210                      ; vxorps        %ymm2,%ymm2,%ymm2
-  DB  76,137,193                          ; mov           %r8,%rcx
-  DB  255,224                             ; jmpq          *%rax
-  DB  49,201                              ; xor           %ecx,%ecx
-  DB  77,137,194                          ; mov           %r8,%r10
-  DB  69,49,201                           ; xor           %r9d,%r9d
-  DB  68,15,182,24                        ; movzbl        (%rax),%r11d
-  DB  72,255,192                          ; inc           %rax
-  DB  73,211,227                          ; shl           %cl,%r11
-  DB  77,9,217                            ; or            %r11,%r9
-  DB  72,131,193,8                        ; add           $0x8,%rcx
-  DB  73,255,202                          ; dec           %r10
-  DB  117,234                             ; jne           a83 <_sk_load_a8_avx+0x53>
-  DB  196,193,249,110,193                 ; vmovq         %r9,%xmm0
-  DB  235,164                             ; jmp           a44 <_sk_load_a8_avx+0x14>
-
-PUBLIC _sk_store_a8_avx
-_sk_store_a8_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,8                            ; mov           (%rax),%r9
-  DB  196,98,125,24,66,8                  ; vbroadcastss  0x8(%rdx),%ymm8
-  DB  197,60,89,195                       ; vmulps        %ymm3,%ymm8,%ymm8
-  DB  196,65,125,91,192                   ; vcvtps2dq     %ymm8,%ymm8
-  DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
-  DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
-  DB  196,65,57,103,192                   ; vpackuswb     %xmm8,%xmm8,%xmm8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           ad3 <_sk_store_a8_avx+0x33>
-  DB  196,65,123,17,4,57                  ; vmovsd        %xmm8,(%r9,%rdi,1)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  137,200                             ; mov           %ecx,%eax
-  DB  36,7                                ; and           $0x7,%al
-  DB  254,200                             ; dec           %al
-  DB  68,15,182,192                       ; movzbl        %al,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            acf <_sk_store_a8_avx+0x2f>
-  DB  196,66,121,48,192                   ; vpmovzxbw     %xmm8,%xmm8
-  DB  76,141,21,69,0,0,0                  ; lea           0x45(%rip),%r10        # b34 <_sk_store_a8_avx+0x94>
-  DB  75,99,4,130                         ; movslq        (%r10,%r8,4),%rax
-  DB  76,1,208                            ; add           %r10,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,67,121,20,68,57,6,12            ; vpextrb       $0xc,%xmm8,0x6(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,5,10            ; vpextrb       $0xa,%xmm8,0x5(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,4,8             ; vpextrb       $0x8,%xmm8,0x4(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,3,6             ; vpextrb       $0x6,%xmm8,0x3(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,2,4             ; vpextrb       $0x4,%xmm8,0x2(%r9,%rdi,1)
-  DB  196,67,121,20,68,57,1,2             ; vpextrb       $0x2,%xmm8,0x1(%r9,%rdi,1)
-  DB  196,67,121,20,4,57,0                ; vpextrb       $0x0,%xmm8,(%r9,%rdi,1)
-  DB  235,158                             ; jmp           acf <_sk_store_a8_avx+0x2f>
-  DB  15,31,0                             ; nopl          (%rax)
-  DB  244                                 ; hlt
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  236                                 ; in            (%dx),%al
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,228                             ; jmpq          *%rsp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  220,255                             ; fdivr         %st,%st(7)
-  DB  255                                 ; (bad)
-  DB  255,212                             ; callq         *%rsp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,204                             ; dec           %esp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,196                             ; inc           %esp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_565_avx
-_sk_load_565_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,16                           ; mov           (%rax),%r10
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,106                             ; jne           bc4 <_sk_load_565_avx+0x74>
-  DB  196,193,122,111,4,122               ; vmovdqu       (%r10,%rdi,2),%xmm0
-  DB  197,241,239,201                     ; vpxor         %xmm1,%xmm1,%xmm1
-  DB  197,249,105,201                     ; vpunpckhwd    %xmm1,%xmm0,%xmm1
-  DB  196,226,121,51,192                  ; vpmovzxwd     %xmm0,%xmm0
-  DB  196,227,125,24,209,1                ; vinsertf128   $0x1,%xmm1,%ymm0,%ymm2
-  DB  196,226,125,24,66,104               ; vbroadcastss  0x68(%rdx),%ymm0
-  DB  197,252,84,194                      ; vandps        %ymm2,%ymm0,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,226,125,24,74,116               ; vbroadcastss  0x74(%rdx),%ymm1
-  DB  197,244,89,192                      ; vmulps        %ymm0,%ymm1,%ymm0
-  DB  196,226,125,24,74,108               ; vbroadcastss  0x6c(%rdx),%ymm1
-  DB  197,244,84,202                      ; vandps        %ymm2,%ymm1,%ymm1
-  DB  197,252,91,201                      ; vcvtdq2ps     %ymm1,%ymm1
-  DB  196,226,125,24,90,120               ; vbroadcastss  0x78(%rdx),%ymm3
-  DB  197,228,89,201                      ; vmulps        %ymm1,%ymm3,%ymm1
-  DB  196,226,125,24,90,112               ; vbroadcastss  0x70(%rdx),%ymm3
-  DB  197,228,84,210                      ; vandps        %ymm2,%ymm3,%ymm2
-  DB  197,252,91,210                      ; vcvtdq2ps     %ymm2,%ymm2
-  DB  196,226,125,24,90,124               ; vbroadcastss  0x7c(%rdx),%ymm3
-  DB  197,228,89,210                      ; vmulps        %ymm2,%ymm3,%ymm2
-  DB  196,226,125,24,26                   ; vbroadcastss  (%rdx),%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  65,137,200                          ; mov           %ecx,%r8d
-  DB  65,128,224,7                        ; and           $0x7,%r8b
-  DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
-  DB  65,254,200                          ; dec           %r8b
-  DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,132                             ; ja            b60 <_sk_load_565_avx+0x10>
-  DB  76,141,13,73,0,0,0                  ; lea           0x49(%rip),%r9        # c2c <_sk_load_565_avx+0xdc>
-  DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
-  DB  76,1,200                            ; add           %r9,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,249,239,192                     ; vpxor         %xmm0,%xmm0,%xmm0
-  DB  196,193,121,196,68,122,12,6         ; vpinsrw       $0x6,0xc(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,10,5         ; vpinsrw       $0x5,0xa(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,8,4          ; vpinsrw       $0x4,0x8(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,6,3          ; vpinsrw       $0x3,0x6(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,4,2          ; vpinsrw       $0x2,0x4(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,68,122,2,1          ; vpinsrw       $0x1,0x2(%r10,%rdi,2),%xmm0,%xmm0
-  DB  196,193,121,196,4,122,0             ; vpinsrw       $0x0,(%r10,%rdi,2),%xmm0,%xmm0
-  DB  233,52,255,255,255                  ; jmpq          b60 <_sk_load_565_avx+0x10>
-  DB  244                                 ; hlt
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  236                                 ; in            (%dx),%al
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,228                             ; jmpq          *%rsp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  220,255                             ; fdivr         %st,%st(7)
-  DB  255                                 ; (bad)
-  DB  255,212                             ; callq         *%rsp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,204                             ; dec           %esp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,192                             ; inc           %eax
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_store_565_avx
-_sk_store_565_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,8                            ; mov           (%rax),%r9
-  DB  196,98,125,24,130,128,0,0,0         ; vbroadcastss  0x80(%rdx),%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  196,65,125,91,201                   ; vcvtps2dq     %ymm9,%ymm9
-  DB  196,193,41,114,241,11               ; vpslld        $0xb,%xmm9,%xmm10
-  DB  196,67,125,25,201,1                 ; vextractf128  $0x1,%ymm9,%xmm9
-  DB  196,193,49,114,241,11               ; vpslld        $0xb,%xmm9,%xmm9
-  DB  196,67,45,24,201,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm9
-  DB  196,98,125,24,146,132,0,0,0         ; vbroadcastss  0x84(%rdx),%ymm10
-  DB  197,44,89,209                       ; vmulps        %ymm1,%ymm10,%ymm10
-  DB  196,65,125,91,210                   ; vcvtps2dq     %ymm10,%ymm10
-  DB  196,193,33,114,242,5                ; vpslld        $0x5,%xmm10,%xmm11
-  DB  196,67,125,25,210,1                 ; vextractf128  $0x1,%ymm10,%xmm10
-  DB  196,193,41,114,242,5                ; vpslld        $0x5,%xmm10,%xmm10
-  DB  196,67,37,24,210,1                  ; vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
-  DB  196,65,45,86,201                    ; vorpd         %ymm9,%ymm10,%ymm9
-  DB  197,60,89,194                       ; vmulps        %ymm2,%ymm8,%ymm8
-  DB  196,65,125,91,192                   ; vcvtps2dq     %ymm8,%ymm8
-  DB  196,65,53,86,192                    ; vorpd         %ymm8,%ymm9,%ymm8
-  DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
-  DB  196,66,57,43,193                    ; vpackusdw     %xmm9,%xmm8,%xmm8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           cce <_sk_store_565_avx+0x86>
-  DB  196,65,122,127,4,121                ; vmovdqu       %xmm8,(%r9,%rdi,2)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  137,200                             ; mov           %ecx,%eax
-  DB  36,7                                ; and           $0x7,%al
-  DB  254,200                             ; dec           %al
-  DB  68,15,182,192                       ; movzbl        %al,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            cca <_sk_store_565_avx+0x82>
-  DB  76,141,21,71,0,0,0                  ; lea           0x47(%rip),%r10        # d2c <_sk_store_565_avx+0xe4>
-  DB  75,99,4,130                         ; movslq        (%r10,%r8,4),%rax
-  DB  76,1,208                            ; add           %r10,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,67,121,21,68,121,12,6           ; vpextrw       $0x6,%xmm8,0xc(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,10,5           ; vpextrw       $0x5,%xmm8,0xa(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,8,4            ; vpextrw       $0x4,%xmm8,0x8(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,6,3            ; vpextrw       $0x3,%xmm8,0x6(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,4,2            ; vpextrw       $0x2,%xmm8,0x4(%r9,%rdi,2)
-  DB  196,67,121,21,68,121,2,1            ; vpextrw       $0x1,%xmm8,0x2(%r9,%rdi,2)
-  DB  197,121,126,192                     ; vmovd         %xmm8,%eax
-  DB  102,65,137,4,121                    ; mov           %ax,(%r9,%rdi,2)
-  DB  235,161                             ; jmp           cca <_sk_store_565_avx+0x82>
-  DB  15,31,0                             ; nopl          (%rax)
-  DB  242,255                             ; repnz         (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  234                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,226                             ; jmpq          *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  218,255                             ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,210                             ; callq         *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,202                             ; dec           %edx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,194                             ; inc           %edx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_8888_avx
-_sk_load_8888_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,16                           ; mov           (%rax),%r10
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,125                             ; jne           dcf <_sk_load_8888_avx+0x87>
-  DB  196,65,124,16,12,186                ; vmovups       (%r10,%rdi,4),%ymm9
-  DB  196,98,125,24,90,16                 ; vbroadcastss  0x10(%rdx),%ymm11
-  DB  196,193,36,84,193                   ; vandps        %ymm9,%ymm11,%ymm0
-  DB  197,252,91,192                      ; vcvtdq2ps     %ymm0,%ymm0
-  DB  196,98,125,24,66,12                 ; vbroadcastss  0xc(%rdx),%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  196,193,41,114,209,8                ; vpsrld        $0x8,%xmm9,%xmm10
-  DB  196,99,125,25,203,1                 ; vextractf128  $0x1,%ymm9,%xmm3
-  DB  197,241,114,211,8                   ; vpsrld        $0x8,%xmm3,%xmm1
-  DB  196,227,45,24,201,1                 ; vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
-  DB  197,164,84,201                      ; vandps        %ymm1,%ymm11,%ymm1
-  DB  197,252,91,201                      ; vcvtdq2ps     %ymm1,%ymm1
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  196,193,41,114,209,16               ; vpsrld        $0x10,%xmm9,%xmm10
-  DB  197,233,114,211,16                  ; vpsrld        $0x10,%xmm3,%xmm2
-  DB  196,227,45,24,210,1                 ; vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
-  DB  197,164,84,210                      ; vandps        %ymm2,%ymm11,%ymm2
-  DB  197,252,91,210                      ; vcvtdq2ps     %ymm2,%ymm2
-  DB  197,188,89,210                      ; vmulps        %ymm2,%ymm8,%ymm2
-  DB  196,193,49,114,209,24               ; vpsrld        $0x18,%xmm9,%xmm9
-  DB  197,225,114,211,24                  ; vpsrld        $0x18,%xmm3,%xmm3
-  DB  196,227,53,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
-  DB  197,252,91,219                      ; vcvtdq2ps     %ymm3,%ymm3
-  DB  196,193,100,89,216                  ; vmulps        %ymm8,%ymm3,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  65,137,200                          ; mov           %ecx,%r8d
-  DB  65,128,224,7                        ; and           $0x7,%r8b
-  DB  196,65,52,87,201                    ; vxorps        %ymm9,%ymm9,%ymm9
-  DB  65,254,200                          ; dec           %r8b
-  DB  69,15,182,192                       ; movzbl        %r8b,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  15,135,108,255,255,255              ; ja            d58 <_sk_load_8888_avx+0x10>
-  DB  76,141,13,137,0,0,0                 ; lea           0x89(%rip),%r9        # e7c <_sk_load_8888_avx+0x134>
-  DB  75,99,4,129                         ; movslq        (%r9,%r8,4),%rax
-  DB  76,1,200                            ; add           %r9,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,193,121,110,68,186,24           ; vmovd         0x18(%r10,%rdi,4),%xmm0
-  DB  197,249,112,192,68                  ; vpshufd       $0x44,%xmm0,%xmm0
-  DB  196,227,125,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  DB  197,244,87,201                      ; vxorps        %ymm1,%ymm1,%ymm1
-  DB  196,99,117,12,200,64                ; vblendps      $0x40,%ymm0,%ymm1,%ymm9
-  DB  196,99,125,25,200,1                 ; vextractf128  $0x1,%ymm9,%xmm0
-  DB  196,195,121,34,68,186,20,1          ; vpinsrd       $0x1,0x14(%r10,%rdi,4),%xmm0,%xmm0
-  DB  196,99,53,24,200,1                  ; vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
-  DB  196,99,125,25,200,1                 ; vextractf128  $0x1,%ymm9,%xmm0
-  DB  196,195,121,34,68,186,16,0          ; vpinsrd       $0x0,0x10(%r10,%rdi,4),%xmm0,%xmm0
-  DB  196,99,53,24,200,1                  ; vinsertf128   $0x1,%xmm0,%ymm9,%ymm9
-  DB  196,195,49,34,68,186,12,3           ; vpinsrd       $0x3,0xc(%r10,%rdi,4),%xmm9,%xmm0
-  DB  196,99,53,12,200,15                 ; vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  DB  196,195,49,34,68,186,8,2            ; vpinsrd       $0x2,0x8(%r10,%rdi,4),%xmm9,%xmm0
-  DB  196,99,53,12,200,15                 ; vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  DB  196,195,49,34,68,186,4,1            ; vpinsrd       $0x1,0x4(%r10,%rdi,4),%xmm9,%xmm0
-  DB  196,99,53,12,200,15                 ; vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  DB  196,195,49,34,4,186,0               ; vpinsrd       $0x0,(%r10,%rdi,4),%xmm9,%xmm0
-  DB  196,99,53,12,200,15                 ; vblendps      $0xf,%ymm0,%ymm9,%ymm9
-  DB  233,220,254,255,255                 ; jmpq          d58 <_sk_load_8888_avx+0x10>
-  DB  238                                 ; out           %al,(%dx)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,224                             ; jmpq          *%rax
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,210                             ; callq         *%rdx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,196                             ; inc           %esp
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,176,255,255,255,156             ; pushq         -0x63000001(%rax)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-  DB  128,255,255                         ; cmp           $0xff,%bh
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_store_8888_avx
-_sk_store_8888_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,8                            ; mov           (%rax),%r9
-  DB  196,98,125,24,66,8                  ; vbroadcastss  0x8(%rdx),%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  196,65,125,91,201                   ; vcvtps2dq     %ymm9,%ymm9
-  DB  197,60,89,209                       ; vmulps        %ymm1,%ymm8,%ymm10
-  DB  196,65,125,91,210                   ; vcvtps2dq     %ymm10,%ymm10
-  DB  196,193,33,114,242,8                ; vpslld        $0x8,%xmm10,%xmm11
-  DB  196,67,125,25,210,1                 ; vextractf128  $0x1,%ymm10,%xmm10
-  DB  196,193,41,114,242,8                ; vpslld        $0x8,%xmm10,%xmm10
-  DB  196,67,37,24,210,1                  ; vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
-  DB  196,65,45,86,201                    ; vorpd         %ymm9,%ymm10,%ymm9
-  DB  197,60,89,210                       ; vmulps        %ymm2,%ymm8,%ymm10
-  DB  196,65,125,91,210                   ; vcvtps2dq     %ymm10,%ymm10
-  DB  196,193,33,114,242,16               ; vpslld        $0x10,%xmm10,%xmm11
-  DB  196,67,125,25,210,1                 ; vextractf128  $0x1,%ymm10,%xmm10
-  DB  196,193,41,114,242,16               ; vpslld        $0x10,%xmm10,%xmm10
-  DB  196,67,37,24,210,1                  ; vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
-  DB  197,60,89,195                       ; vmulps        %ymm3,%ymm8,%ymm8
-  DB  196,65,125,91,192                   ; vcvtps2dq     %ymm8,%ymm8
-  DB  196,193,33,114,240,24               ; vpslld        $0x18,%xmm8,%xmm11
-  DB  196,67,125,25,192,1                 ; vextractf128  $0x1,%ymm8,%xmm8
-  DB  196,193,57,114,240,24               ; vpslld        $0x18,%xmm8,%xmm8
-  DB  196,67,37,24,192,1                  ; vinsertf128   $0x1,%xmm8,%ymm11,%ymm8
-  DB  196,65,45,86,192                    ; vorpd         %ymm8,%ymm10,%ymm8
-  DB  196,65,53,86,192                    ; vorpd         %ymm8,%ymm9,%ymm8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,10                              ; jne           f2d <_sk_store_8888_avx+0x95>
-  DB  196,65,124,17,4,185                 ; vmovups       %ymm8,(%r9,%rdi,4)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  137,200                             ; mov           %ecx,%eax
-  DB  36,7                                ; and           $0x7,%al
-  DB  254,200                             ; dec           %al
-  DB  68,15,182,192                       ; movzbl        %al,%r8d
-  DB  65,128,248,6                        ; cmp           $0x6,%r8b
-  DB  119,236                             ; ja            f29 <_sk_store_8888_avx+0x91>
-  DB  76,141,21,84,0,0,0                  ; lea           0x54(%rip),%r10        # f98 <_sk_store_8888_avx+0x100>
-  DB  75,99,4,130                         ; movslq        (%r10,%r8,4),%rax
-  DB  76,1,208                            ; add           %r10,%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
-  DB  196,67,121,22,76,185,24,2           ; vpextrd       $0x2,%xmm9,0x18(%r9,%rdi,4)
-  DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
-  DB  196,67,121,22,76,185,20,1           ; vpextrd       $0x1,%xmm9,0x14(%r9,%rdi,4)
-  DB  196,67,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm9
-  DB  196,65,121,126,76,185,16            ; vmovd         %xmm9,0x10(%r9,%rdi,4)
-  DB  196,67,121,22,68,185,12,3           ; vpextrd       $0x3,%xmm8,0xc(%r9,%rdi,4)
-  DB  196,67,121,22,68,185,8,2            ; vpextrd       $0x2,%xmm8,0x8(%r9,%rdi,4)
-  DB  196,67,121,22,68,185,4,1            ; vpextrd       $0x1,%xmm8,0x4(%r9,%rdi,4)
-  DB  196,65,121,126,4,185                ; vmovd         %xmm8,(%r9,%rdi,4)
-  DB  235,147                             ; jmp           f29 <_sk_store_8888_avx+0x91>
-  DB  102,144                             ; xchg          %ax,%ax
-  DB  246,255                             ; idiv          %bh
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  238                                 ; out           %al,(%dx)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,230                             ; jmpq          *%rsi
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  222,255                             ; fdivrp        %st,%st(7)
-  DB  255                                 ; (bad)
-  DB  255,209                             ; callq         *%rcx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255,195                             ; inc           %ebx
-  DB  255                                 ; (bad)
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-  DB  181,255                             ; mov           $0xff,%ch
-  DB  255                                 ; (bad)
-  DB  255                                 ; .byte         0xff
-
-PUBLIC _sk_load_f16_avx
-_sk_load_f16_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  15,133,240,0,0,0                    ; jne           10b2 <_sk_load_f16_avx+0xfe>
-  DB  197,249,16,12,248                   ; vmovupd       (%rax,%rdi,8),%xmm1
-  DB  197,249,16,84,248,16                ; vmovupd       0x10(%rax,%rdi,8),%xmm2
-  DB  197,249,16,92,248,32                ; vmovupd       0x20(%rax,%rdi,8),%xmm3
-  DB  197,121,16,68,248,48                ; vmovupd       0x30(%rax,%rdi,8),%xmm8
-  DB  197,241,97,194                      ; vpunpcklwd    %xmm2,%xmm1,%xmm0
-  DB  197,241,105,202                     ; vpunpckhwd    %xmm2,%xmm1,%xmm1
-  DB  196,193,97,97,208                   ; vpunpcklwd    %xmm8,%xmm3,%xmm2
-  DB  196,193,97,105,216                  ; vpunpckhwd    %xmm8,%xmm3,%xmm3
-  DB  197,121,97,193                      ; vpunpcklwd    %xmm1,%xmm0,%xmm8
-  DB  197,249,105,193                     ; vpunpckhwd    %xmm1,%xmm0,%xmm0
-  DB  197,233,97,203                      ; vpunpcklwd    %xmm3,%xmm2,%xmm1
-  DB  197,105,105,203                     ; vpunpckhwd    %xmm3,%xmm2,%xmm9
-  DB  197,249,110,90,100                  ; vmovd         0x64(%rdx),%xmm3
-  DB  197,249,112,219,0                   ; vpshufd       $0x0,%xmm3,%xmm3
-  DB  196,193,97,101,208                  ; vpcmpgtw      %xmm8,%xmm3,%xmm2
-  DB  196,65,105,223,192                  ; vpandn        %xmm8,%xmm2,%xmm8
-  DB  197,225,101,208                     ; vpcmpgtw      %xmm0,%xmm3,%xmm2
-  DB  197,233,223,192                     ; vpandn        %xmm0,%xmm2,%xmm0
-  DB  197,225,101,209                     ; vpcmpgtw      %xmm1,%xmm3,%xmm2
-  DB  197,233,223,201                     ; vpandn        %xmm1,%xmm2,%xmm1
-  DB  196,193,97,101,209                  ; vpcmpgtw      %xmm9,%xmm3,%xmm2
-  DB  196,193,105,223,209                 ; vpandn        %xmm9,%xmm2,%xmm2
-  DB  196,66,121,51,208                   ; vpmovzxwd     %xmm8,%xmm10
-  DB  196,98,121,51,201                   ; vpmovzxwd     %xmm1,%xmm9
-  DB  197,225,239,219                     ; vpxor         %xmm3,%xmm3,%xmm3
-  DB  197,57,105,195                      ; vpunpckhwd    %xmm3,%xmm8,%xmm8
-  DB  197,241,105,203                     ; vpunpckhwd    %xmm3,%xmm1,%xmm1
-  DB  196,98,121,51,216                   ; vpmovzxwd     %xmm0,%xmm11
-  DB  196,98,121,51,226                   ; vpmovzxwd     %xmm2,%xmm12
-  DB  197,121,105,235                     ; vpunpckhwd    %xmm3,%xmm0,%xmm13
-  DB  197,105,105,243                     ; vpunpckhwd    %xmm3,%xmm2,%xmm14
-  DB  196,193,121,114,242,13              ; vpslld        $0xd,%xmm10,%xmm0
-  DB  196,193,105,114,241,13              ; vpslld        $0xd,%xmm9,%xmm2
-  DB  196,227,125,24,194,1                ; vinsertf128   $0x1,%xmm2,%ymm0,%ymm0
-  DB  196,98,125,24,74,92                 ; vbroadcastss  0x5c(%rdx),%ymm9
-  DB  197,180,89,192                      ; vmulps        %ymm0,%ymm9,%ymm0
-  DB  196,193,105,114,240,13              ; vpslld        $0xd,%xmm8,%xmm2
-  DB  197,241,114,241,13                  ; vpslld        $0xd,%xmm1,%xmm1
-  DB  196,227,109,24,201,1                ; vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
-  DB  197,180,89,201                      ; vmulps        %ymm1,%ymm9,%ymm1
-  DB  196,193,105,114,243,13              ; vpslld        $0xd,%xmm11,%xmm2
-  DB  196,193,97,114,244,13               ; vpslld        $0xd,%xmm12,%xmm3
-  DB  196,227,109,24,211,1                ; vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
-  DB  197,180,89,210                      ; vmulps        %ymm2,%ymm9,%ymm2
-  DB  196,193,57,114,245,13               ; vpslld        $0xd,%xmm13,%xmm8
-  DB  196,193,97,114,246,13               ; vpslld        $0xd,%xmm14,%xmm3
-  DB  196,227,61,24,219,1                 ; vinsertf128   $0x1,%xmm3,%ymm8,%ymm3
-  DB  197,180,89,219                      ; vmulps        %ymm3,%ymm9,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,251,16,12,248                   ; vmovsd        (%rax,%rdi,8),%xmm1
-  DB  196,65,57,87,192                    ; vxorpd        %xmm8,%xmm8,%xmm8
-  DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  117,6                               ; jne           10c8 <_sk_load_f16_avx+0x114>
-  DB  197,250,126,201                     ; vmovq         %xmm1,%xmm1
-  DB  235,30                              ; jmp           10e6 <_sk_load_f16_avx+0x132>
-  DB  197,241,22,76,248,8                 ; vmovhpd       0x8(%rax,%rdi,8),%xmm1,%xmm1
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,18                              ; jb            10e6 <_sk_load_f16_avx+0x132>
-  DB  197,251,16,84,248,16                ; vmovsd        0x10(%rax,%rdi,8),%xmm2
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  117,19                              ; jne           10f3 <_sk_load_f16_avx+0x13f>
-  DB  197,250,126,210                     ; vmovq         %xmm2,%xmm2
-  DB  235,46                              ; jmp           1114 <_sk_load_f16_avx+0x160>
-  DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  197,233,87,210                      ; vxorpd        %xmm2,%xmm2,%xmm2
-  DB  233,230,254,255,255                 ; jmpq          fd9 <_sk_load_f16_avx+0x25>
-  DB  197,233,22,84,248,24                ; vmovhpd       0x18(%rax,%rdi,8),%xmm2,%xmm2
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,21                              ; jb            1114 <_sk_load_f16_avx+0x160>
-  DB  197,251,16,92,248,32                ; vmovsd        0x20(%rax,%rdi,8),%xmm3
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  117,18                              ; jne           111d <_sk_load_f16_avx+0x169>
-  DB  197,250,126,219                     ; vmovq         %xmm3,%xmm3
-  DB  233,197,254,255,255                 ; jmpq          fd9 <_sk_load_f16_avx+0x25>
-  DB  197,225,87,219                      ; vxorpd        %xmm3,%xmm3,%xmm3
-  DB  233,188,254,255,255                 ; jmpq          fd9 <_sk_load_f16_avx+0x25>
-  DB  197,225,22,92,248,40                ; vmovhpd       0x28(%rax,%rdi,8),%xmm3,%xmm3
-  DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  15,130,172,254,255,255              ; jb            fd9 <_sk_load_f16_avx+0x25>
-  DB  197,123,16,68,248,48                ; vmovsd        0x30(%rax,%rdi,8),%xmm8
-  DB  233,161,254,255,255                 ; jmpq          fd9 <_sk_load_f16_avx+0x25>
-
-PUBLIC _sk_store_f16_avx
-_sk_store_f16_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  196,98,125,24,66,96                 ; vbroadcastss  0x60(%rdx),%ymm8
-  DB  197,60,89,200                       ; vmulps        %ymm0,%ymm8,%ymm9
-  DB  196,67,125,25,202,1                 ; vextractf128  $0x1,%ymm9,%xmm10
-  DB  196,193,41,114,210,13               ; vpsrld        $0xd,%xmm10,%xmm10
-  DB  196,193,49,114,209,13               ; vpsrld        $0xd,%xmm9,%xmm9
-  DB  197,60,89,217                       ; vmulps        %ymm1,%ymm8,%ymm11
-  DB  196,67,125,25,220,1                 ; vextractf128  $0x1,%ymm11,%xmm12
-  DB  196,193,25,114,212,13               ; vpsrld        $0xd,%xmm12,%xmm12
-  DB  196,193,33,114,211,13               ; vpsrld        $0xd,%xmm11,%xmm11
-  DB  197,60,89,234                       ; vmulps        %ymm2,%ymm8,%ymm13
-  DB  196,67,125,25,238,1                 ; vextractf128  $0x1,%ymm13,%xmm14
-  DB  196,193,9,114,214,13                ; vpsrld        $0xd,%xmm14,%xmm14
-  DB  196,193,17,114,213,13               ; vpsrld        $0xd,%xmm13,%xmm13
-  DB  197,60,89,195                       ; vmulps        %ymm3,%ymm8,%ymm8
-  DB  196,67,125,25,199,1                 ; vextractf128  $0x1,%ymm8,%xmm15
-  DB  196,193,1,114,215,13                ; vpsrld        $0xd,%xmm15,%xmm15
-  DB  196,193,57,114,208,13               ; vpsrld        $0xd,%xmm8,%xmm8
-  DB  196,193,33,115,251,2                ; vpslldq       $0x2,%xmm11,%xmm11
-  DB  196,65,33,235,201                   ; vpor          %xmm9,%xmm11,%xmm9
-  DB  196,193,33,115,252,2                ; vpslldq       $0x2,%xmm12,%xmm11
-  DB  196,65,33,235,226                   ; vpor          %xmm10,%xmm11,%xmm12
-  DB  196,193,57,115,248,2                ; vpslldq       $0x2,%xmm8,%xmm8
-  DB  196,65,57,235,197                   ; vpor          %xmm13,%xmm8,%xmm8
-  DB  196,193,41,115,255,2                ; vpslldq       $0x2,%xmm15,%xmm10
-  DB  196,65,41,235,238                   ; vpor          %xmm14,%xmm10,%xmm13
-  DB  196,65,49,98,216                    ; vpunpckldq    %xmm8,%xmm9,%xmm11
-  DB  196,65,49,106,208                   ; vpunpckhdq    %xmm8,%xmm9,%xmm10
-  DB  196,65,25,98,205                    ; vpunpckldq    %xmm13,%xmm12,%xmm9
-  DB  196,65,25,106,197                   ; vpunpckhdq    %xmm13,%xmm12,%xmm8
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,27                              ; jne           11fb <_sk_store_f16_avx+0xc3>
-  DB  197,120,17,28,248                   ; vmovups       %xmm11,(%rax,%rdi,8)
-  DB  197,120,17,84,248,16                ; vmovups       %xmm10,0x10(%rax,%rdi,8)
-  DB  197,120,17,76,248,32                ; vmovups       %xmm9,0x20(%rax,%rdi,8)
-  DB  197,122,127,68,248,48               ; vmovdqu       %xmm8,0x30(%rax,%rdi,8)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  197,121,214,28,248                  ; vmovq         %xmm11,(%rax,%rdi,8)
-  DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,241                             ; je            11f7 <_sk_store_f16_avx+0xbf>
-  DB  197,121,23,92,248,8                 ; vmovhpd       %xmm11,0x8(%rax,%rdi,8)
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,229                             ; jb            11f7 <_sk_store_f16_avx+0xbf>
-  DB  197,121,214,84,248,16               ; vmovq         %xmm10,0x10(%rax,%rdi,8)
-  DB  116,221                             ; je            11f7 <_sk_store_f16_avx+0xbf>
-  DB  197,121,23,84,248,24                ; vmovhpd       %xmm10,0x18(%rax,%rdi,8)
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,209                             ; jb            11f7 <_sk_store_f16_avx+0xbf>
-  DB  197,121,214,76,248,32               ; vmovq         %xmm9,0x20(%rax,%rdi,8)
-  DB  116,201                             ; je            11f7 <_sk_store_f16_avx+0xbf>
-  DB  197,121,23,76,248,40                ; vmovhpd       %xmm9,0x28(%rax,%rdi,8)
-  DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,189                             ; jb            11f7 <_sk_store_f16_avx+0xbf>
-  DB  197,121,214,68,248,48               ; vmovq         %xmm8,0x30(%rax,%rdi,8)
-  DB  235,181                             ; jmp           11f7 <_sk_store_f16_avx+0xbf>
-
-PUBLIC _sk_store_f32_avx
-_sk_store_f32_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  76,139,0                            ; mov           (%rax),%r8
-  DB  72,141,4,189,0,0,0,0                ; lea           0x0(,%rdi,4),%rax
-  DB  197,124,20,193                      ; vunpcklps     %ymm1,%ymm0,%ymm8
-  DB  197,124,21,217                      ; vunpckhps     %ymm1,%ymm0,%ymm11
-  DB  197,108,20,203                      ; vunpcklps     %ymm3,%ymm2,%ymm9
-  DB  197,108,21,227                      ; vunpckhps     %ymm3,%ymm2,%ymm12
-  DB  196,65,61,20,209                    ; vunpcklpd     %ymm9,%ymm8,%ymm10
-  DB  196,65,61,21,201                    ; vunpckhpd     %ymm9,%ymm8,%ymm9
-  DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
-  DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
-  DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           12af <_sk_store_f32_avx+0x6d>
-  DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
-  DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
-  DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
-  DB  196,67,61,6,195,49                  ; vperm2f128    $0x31,%ymm11,%ymm8,%ymm8
-  DB  196,65,125,17,36,128                ; vmovupd       %ymm12,(%r8,%rax,4)
-  DB  196,65,125,17,108,128,32            ; vmovupd       %ymm13,0x20(%r8,%rax,4)
-  DB  196,65,125,17,76,128,64             ; vmovupd       %ymm9,0x40(%r8,%rax,4)
-  DB  196,65,125,17,68,128,96             ; vmovupd       %ymm8,0x60(%r8,%rax,4)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-  DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
-  DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            12ab <_sk_store_f32_avx+0x69>
-  DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
-  DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            12ab <_sk_store_f32_avx+0x69>
-  DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            12ab <_sk_store_f32_avx+0x69>
-  DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
-  DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            12ab <_sk_store_f32_avx+0x69>
-  DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            12ab <_sk_store_f32_avx+0x69>
-  DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
-  DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            12ab <_sk_store_f32_avx+0x69>
-  DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           12ab <_sk_store_f32_avx+0x69>
-
-PUBLIC _sk_clamp_x_avx
-_sk_clamp_x_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,60,95,200                       ; vmaxps        %ymm0,%ymm8,%ymm9
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,99,125,25,192,1                 ; vextractf128  $0x1,%ymm8,%xmm0
-  DB  196,65,41,118,210                   ; vpcmpeqd      %xmm10,%xmm10,%xmm10
-  DB  196,193,121,254,194                 ; vpaddd        %xmm10,%xmm0,%xmm0
-  DB  196,65,57,254,194                   ; vpaddd        %xmm10,%xmm8,%xmm8
-  DB  196,227,61,24,192,1                 ; vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
-  DB  197,180,93,192                      ; vminps        %ymm0,%ymm9,%ymm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_y_avx
-_sk_clamp_y_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,60,95,201                       ; vmaxps        %ymm1,%ymm8,%ymm9
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,99,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm1
-  DB  196,65,41,118,210                   ; vpcmpeqd      %xmm10,%xmm10,%xmm10
-  DB  196,193,113,254,202                 ; vpaddd        %xmm10,%xmm1,%xmm1
-  DB  196,65,57,254,194                   ; vpaddd        %xmm10,%xmm8,%xmm8
-  DB  196,227,61,24,201,1                 ; vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
-  DB  197,180,93,201                      ; vminps        %ymm1,%ymm9,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_x_avx
-_sk_repeat_x_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,65,124,94,200                   ; vdivps        %ymm8,%ymm0,%ymm9
-  DB  196,67,125,8,201,1                  ; vroundps      $0x1,%ymm9,%ymm9
-  DB  196,65,52,89,200                    ; vmulps        %ymm8,%ymm9,%ymm9
-  DB  196,65,124,92,201                   ; vsubps        %ymm9,%ymm0,%ymm9
-  DB  196,99,125,25,192,1                 ; vextractf128  $0x1,%ymm8,%xmm0
-  DB  196,65,41,118,210                   ; vpcmpeqd      %xmm10,%xmm10,%xmm10
-  DB  196,193,121,254,194                 ; vpaddd        %xmm10,%xmm0,%xmm0
-  DB  196,65,57,254,194                   ; vpaddd        %xmm10,%xmm8,%xmm8
-  DB  196,227,61,24,192,1                 ; vinsertf128   $0x1,%xmm0,%ymm8,%ymm0
-  DB  197,180,93,192                      ; vminps        %ymm0,%ymm9,%ymm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_y_avx
-_sk_repeat_y_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,65,116,94,200                   ; vdivps        %ymm8,%ymm1,%ymm9
-  DB  196,67,125,8,201,1                  ; vroundps      $0x1,%ymm9,%ymm9
-  DB  196,65,52,89,200                    ; vmulps        %ymm8,%ymm9,%ymm9
-  DB  196,65,116,92,201                   ; vsubps        %ymm9,%ymm1,%ymm9
-  DB  196,99,125,25,193,1                 ; vextractf128  $0x1,%ymm8,%xmm1
-  DB  196,65,41,118,210                   ; vpcmpeqd      %xmm10,%xmm10,%xmm10
-  DB  196,193,113,254,202                 ; vpaddd        %xmm10,%xmm1,%xmm1
-  DB  196,65,57,254,194                   ; vpaddd        %xmm10,%xmm8,%xmm8
-  DB  196,227,61,24,201,1                 ; vinsertf128   $0x1,%xmm1,%ymm8,%ymm1
-  DB  197,180,93,201                      ; vminps        %ymm1,%ymm9,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_x_avx
-_sk_mirror_x_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,122,16,0                        ; vmovss        (%rax),%xmm8
-  DB  196,65,121,112,200,0                ; vpshufd       $0x0,%xmm8,%xmm9
-  DB  196,67,53,24,201,1                  ; vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
-  DB  196,65,124,92,209                   ; vsubps        %ymm9,%ymm0,%ymm10
-  DB  196,193,58,88,192                   ; vaddss        %xmm8,%xmm8,%xmm0
-  DB  196,227,121,4,192,0                 ; vpermilps     $0x0,%xmm0,%xmm0
-  DB  196,227,125,24,192,1                ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
-  DB  197,44,94,192                       ; vdivps        %ymm0,%ymm10,%ymm8
-  DB  196,67,125,8,192,1                  ; vroundps      $0x1,%ymm8,%ymm8
-  DB  197,188,89,192                      ; vmulps        %ymm0,%ymm8,%ymm0
-  DB  197,172,92,192                      ; vsubps        %ymm0,%ymm10,%ymm0
-  DB  196,193,124,92,193                  ; vsubps        %ymm9,%ymm0,%ymm0
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,60,92,192                       ; vsubps        %ymm0,%ymm8,%ymm8
-  DB  197,60,84,192                       ; vandps        %ymm0,%ymm8,%ymm8
-  DB  196,99,125,25,200,1                 ; vextractf128  $0x1,%ymm9,%xmm0
-  DB  196,65,41,118,210                   ; vpcmpeqd      %xmm10,%xmm10,%xmm10
-  DB  196,193,121,254,194                 ; vpaddd        %xmm10,%xmm0,%xmm0
-  DB  196,65,49,254,202                   ; vpaddd        %xmm10,%xmm9,%xmm9
-  DB  196,227,53,24,192,1                 ; vinsertf128   $0x1,%xmm0,%ymm9,%ymm0
-  DB  197,188,93,192                      ; vminps        %ymm0,%ymm8,%ymm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_y_avx
-_sk_mirror_y_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,122,16,0                        ; vmovss        (%rax),%xmm8
-  DB  196,65,121,112,200,0                ; vpshufd       $0x0,%xmm8,%xmm9
-  DB  196,67,53,24,201,1                  ; vinsertf128   $0x1,%xmm9,%ymm9,%ymm9
-  DB  196,65,116,92,209                   ; vsubps        %ymm9,%ymm1,%ymm10
-  DB  196,193,58,88,200                   ; vaddss        %xmm8,%xmm8,%xmm1
-  DB  196,227,121,4,201,0                 ; vpermilps     $0x0,%xmm1,%xmm1
-  DB  196,227,117,24,201,1                ; vinsertf128   $0x1,%xmm1,%ymm1,%ymm1
-  DB  197,44,94,193                       ; vdivps        %ymm1,%ymm10,%ymm8
-  DB  196,67,125,8,192,1                  ; vroundps      $0x1,%ymm8,%ymm8
-  DB  197,188,89,201                      ; vmulps        %ymm1,%ymm8,%ymm1
-  DB  197,172,92,201                      ; vsubps        %ymm1,%ymm10,%ymm1
-  DB  196,193,116,92,201                  ; vsubps        %ymm9,%ymm1,%ymm1
-  DB  196,65,60,87,192                    ; vxorps        %ymm8,%ymm8,%ymm8
-  DB  197,60,92,193                       ; vsubps        %ymm1,%ymm8,%ymm8
-  DB  197,60,84,193                       ; vandps        %ymm1,%ymm8,%ymm8
-  DB  196,99,125,25,201,1                 ; vextractf128  $0x1,%ymm9,%xmm1
-  DB  196,65,41,118,210                   ; vpcmpeqd      %xmm10,%xmm10,%xmm10
-  DB  196,193,113,254,202                 ; vpaddd        %xmm10,%xmm1,%xmm1
-  DB  196,65,49,254,202                   ; vpaddd        %xmm10,%xmm9,%xmm9
-  DB  196,227,53,24,201,1                 ; vinsertf128   $0x1,%xmm1,%ymm9,%ymm1
-  DB  197,188,93,201                      ; vminps        %ymm1,%ymm8,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_2x3_avx
-_sk_matrix_2x3_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,98,125,24,72,8                  ; vbroadcastss  0x8(%rax),%ymm9
-  DB  196,98,125,24,80,16                 ; vbroadcastss  0x10(%rax),%ymm10
-  DB  197,52,89,201                       ; vmulps        %ymm1,%ymm9,%ymm9
-  DB  196,65,52,88,202                    ; vaddps        %ymm10,%ymm9,%ymm9
-  DB  197,60,89,192                       ; vmulps        %ymm0,%ymm8,%ymm8
-  DB  196,65,60,88,193                    ; vaddps        %ymm9,%ymm8,%ymm8
-  DB  196,98,125,24,72,4                  ; vbroadcastss  0x4(%rax),%ymm9
-  DB  196,98,125,24,80,12                 ; vbroadcastss  0xc(%rax),%ymm10
-  DB  196,98,125,24,88,20                 ; vbroadcastss  0x14(%rax),%ymm11
-  DB  197,172,89,201                      ; vmulps        %ymm1,%ymm10,%ymm1
-  DB  196,193,116,88,203                  ; vaddps        %ymm11,%ymm1,%ymm1
-  DB  197,180,89,192                      ; vmulps        %ymm0,%ymm9,%ymm0
-  DB  197,252,88,201                      ; vaddps        %ymm1,%ymm0,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,124,41,192                      ; vmovaps       %ymm8,%ymm0
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_3x4_avx
-_sk_matrix_3x4_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,98,125,24,72,12                 ; vbroadcastss  0xc(%rax),%ymm9
-  DB  196,98,125,24,80,24                 ; vbroadcastss  0x18(%rax),%ymm10
-  DB  196,98,125,24,88,36                 ; vbroadcastss  0x24(%rax),%ymm11
-  DB  197,44,89,210                       ; vmulps        %ymm2,%ymm10,%ymm10
-  DB  196,65,44,88,211                    ; vaddps        %ymm11,%ymm10,%ymm10
-  DB  197,52,89,201                       ; vmulps        %ymm1,%ymm9,%ymm9
-  DB  196,65,52,88,202                    ; vaddps        %ymm10,%ymm9,%ymm9
-  DB  197,60,89,192                       ; vmulps        %ymm0,%ymm8,%ymm8
-  DB  196,65,60,88,193                    ; vaddps        %ymm9,%ymm8,%ymm8
-  DB  196,98,125,24,72,4                  ; vbroadcastss  0x4(%rax),%ymm9
-  DB  196,98,125,24,80,16                 ; vbroadcastss  0x10(%rax),%ymm10
-  DB  196,98,125,24,88,28                 ; vbroadcastss  0x1c(%rax),%ymm11
-  DB  196,98,125,24,96,40                 ; vbroadcastss  0x28(%rax),%ymm12
-  DB  197,36,89,218                       ; vmulps        %ymm2,%ymm11,%ymm11
-  DB  196,65,36,88,220                    ; vaddps        %ymm12,%ymm11,%ymm11
-  DB  197,44,89,209                       ; vmulps        %ymm1,%ymm10,%ymm10
-  DB  196,65,44,88,211                    ; vaddps        %ymm11,%ymm10,%ymm10
-  DB  197,52,89,200                       ; vmulps        %ymm0,%ymm9,%ymm9
-  DB  196,65,52,88,202                    ; vaddps        %ymm10,%ymm9,%ymm9
-  DB  196,98,125,24,80,8                  ; vbroadcastss  0x8(%rax),%ymm10
-  DB  196,98,125,24,88,20                 ; vbroadcastss  0x14(%rax),%ymm11
-  DB  196,98,125,24,96,32                 ; vbroadcastss  0x20(%rax),%ymm12
-  DB  196,98,125,24,104,44                ; vbroadcastss  0x2c(%rax),%ymm13
-  DB  197,156,89,210                      ; vmulps        %ymm2,%ymm12,%ymm2
-  DB  196,193,108,88,213                  ; vaddps        %ymm13,%ymm2,%ymm2
-  DB  197,164,89,201                      ; vmulps        %ymm1,%ymm11,%ymm1
-  DB  197,244,88,202                      ; vaddps        %ymm2,%ymm1,%ymm1
-  DB  197,172,89,192                      ; vmulps        %ymm0,%ymm10,%ymm0
-  DB  197,252,88,209                      ; vaddps        %ymm1,%ymm0,%ymm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,124,41,192                      ; vmovaps       %ymm8,%ymm0
-  DB  197,124,41,201                      ; vmovaps       %ymm9,%ymm1
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_perspective_avx
-_sk_matrix_perspective_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,98,125,24,0                     ; vbroadcastss  (%rax),%ymm8
-  DB  196,98,125,24,72,4                  ; vbroadcastss  0x4(%rax),%ymm9
-  DB  196,98,125,24,80,8                  ; vbroadcastss  0x8(%rax),%ymm10
-  DB  197,52,89,201                       ; vmulps        %ymm1,%ymm9,%ymm9
-  DB  196,65,52,88,202                    ; vaddps        %ymm10,%ymm9,%ymm9
-  DB  197,60,89,192                       ; vmulps        %ymm0,%ymm8,%ymm8
-  DB  196,65,60,88,193                    ; vaddps        %ymm9,%ymm8,%ymm8
-  DB  196,98,125,24,72,12                 ; vbroadcastss  0xc(%rax),%ymm9
-  DB  196,98,125,24,80,16                 ; vbroadcastss  0x10(%rax),%ymm10
-  DB  196,98,125,24,88,20                 ; vbroadcastss  0x14(%rax),%ymm11
-  DB  197,44,89,209                       ; vmulps        %ymm1,%ymm10,%ymm10
-  DB  196,65,44,88,211                    ; vaddps        %ymm11,%ymm10,%ymm10
-  DB  197,52,89,200                       ; vmulps        %ymm0,%ymm9,%ymm9
-  DB  196,65,52,88,202                    ; vaddps        %ymm10,%ymm9,%ymm9
-  DB  196,98,125,24,80,24                 ; vbroadcastss  0x18(%rax),%ymm10
-  DB  196,98,125,24,88,28                 ; vbroadcastss  0x1c(%rax),%ymm11
-  DB  196,98,125,24,96,32                 ; vbroadcastss  0x20(%rax),%ymm12
-  DB  197,164,89,201                      ; vmulps        %ymm1,%ymm11,%ymm1
-  DB  196,193,116,88,204                  ; vaddps        %ymm12,%ymm1,%ymm1
-  DB  197,172,89,192                      ; vmulps        %ymm0,%ymm10,%ymm0
-  DB  197,252,88,193                      ; vaddps        %ymm1,%ymm0,%ymm0
-  DB  197,252,83,200                      ; vrcpps        %ymm0,%ymm1
-  DB  197,188,89,193                      ; vmulps        %ymm1,%ymm8,%ymm0
-  DB  197,180,89,201                      ; vmulps        %ymm1,%ymm9,%ymm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_linear_gradient_2stops_avx
-_sk_linear_gradient_2stops_avx LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  196,226,125,24,72,16                ; vbroadcastss  0x10(%rax),%ymm1
-  DB  196,226,125,24,16                   ; vbroadcastss  (%rax),%ymm2
-  DB  197,244,89,200                      ; vmulps        %ymm0,%ymm1,%ymm1
-  DB  197,108,88,193                      ; vaddps        %ymm1,%ymm2,%ymm8
-  DB  196,226,125,24,72,20                ; vbroadcastss  0x14(%rax),%ymm1
-  DB  196,226,125,24,80,4                 ; vbroadcastss  0x4(%rax),%ymm2
-  DB  197,244,89,200                      ; vmulps        %ymm0,%ymm1,%ymm1
-  DB  197,236,88,201                      ; vaddps        %ymm1,%ymm2,%ymm1
-  DB  196,226,125,24,80,24                ; vbroadcastss  0x18(%rax),%ymm2
-  DB  196,226,125,24,88,8                 ; vbroadcastss  0x8(%rax),%ymm3
-  DB  197,236,89,208                      ; vmulps        %ymm0,%ymm2,%ymm2
-  DB  197,228,88,210                      ; vaddps        %ymm2,%ymm3,%ymm2
-  DB  196,226,125,24,88,28                ; vbroadcastss  0x1c(%rax),%ymm3
-  DB  196,98,125,24,72,12                 ; vbroadcastss  0xc(%rax),%ymm9
-  DB  197,228,89,192                      ; vmulps        %ymm0,%ymm3,%ymm0
-  DB  197,180,88,216                      ; vaddps        %ymm0,%ymm9,%ymm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  197,124,41,192                      ; vmovaps       %ymm8,%ymm0
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_start_pipeline_sse41
-_sk_start_pipeline_sse41 LABEL PROC
-  DB  65,87                               ; push          %r15
-  DB  65,86                               ; push          %r14
-  DB  65,85                               ; push          %r13
-  DB  65,84                               ; push          %r12
-  DB  86                                  ; push          %rsi
-  DB  87                                  ; push          %rdi
-  DB  83                                  ; push          %rbx
-  DB  72,129,236,160,0,0,0                ; sub           $0xa0,%rsp
-  DB  68,15,41,188,36,144,0,0,0           ; movaps        %xmm15,0x90(%rsp)
-  DB  68,15,41,180,36,128,0,0,0           ; movaps        %xmm14,0x80(%rsp)
-  DB  68,15,41,108,36,112                 ; movaps        %xmm13,0x70(%rsp)
-  DB  68,15,41,100,36,96                  ; movaps        %xmm12,0x60(%rsp)
-  DB  68,15,41,92,36,80                   ; movaps        %xmm11,0x50(%rsp)
-  DB  68,15,41,84,36,64                   ; movaps        %xmm10,0x40(%rsp)
-  DB  68,15,41,76,36,48                   ; movaps        %xmm9,0x30(%rsp)
-  DB  68,15,41,68,36,32                   ; movaps        %xmm8,0x20(%rsp)
-  DB  15,41,124,36,16                     ; movaps        %xmm7,0x10(%rsp)
-  DB  15,41,52,36                         ; movaps        %xmm6,(%rsp)
-  DB  77,137,207                          ; mov           %r9,%r15
-  DB  77,137,198                          ; mov           %r8,%r14
-  DB  72,137,203                          ; mov           %rcx,%rbx
-  DB  72,137,214                          ; mov           %rdx,%rsi
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  73,137,196                          ; mov           %rax,%r12
-  DB  73,137,245                          ; mov           %rsi,%r13
-  DB  72,141,67,4                         ; lea           0x4(%rbx),%rax
-  DB  76,57,248                           ; cmp           %r15,%rax
-  DB  118,5                               ; jbe           73 <_sk_start_pipeline_sse41+0x73>
-  DB  72,137,216                          ; mov           %rbx,%rax
-  DB  235,52                              ; jmp           a7 <_sk_start_pipeline_sse41+0xa7>
-  DB  15,87,192                           ; xorps         %xmm0,%xmm0
-  DB  15,87,201                           ; xorps         %xmm1,%xmm1
-  DB  15,87,210                           ; xorps         %xmm2,%xmm2
-  DB  15,87,219                           ; xorps         %xmm3,%xmm3
-  DB  15,87,228                           ; xorps         %xmm4,%xmm4
-  DB  15,87,237                           ; xorps         %xmm5,%xmm5
-  DB  15,87,246                           ; xorps         %xmm6,%xmm6
-  DB  15,87,255                           ; xorps         %xmm7,%xmm7
-  DB  72,137,223                          ; mov           %rbx,%rdi
-  DB  76,137,238                          ; mov           %r13,%rsi
-  DB  76,137,242                          ; mov           %r14,%rdx
-  DB  65,255,212                          ; callq         *%r12
-  DB  72,141,67,4                         ; lea           0x4(%rbx),%rax
-  DB  72,131,195,8                        ; add           $0x8,%rbx
-  DB  76,57,251                           ; cmp           %r15,%rbx
-  DB  72,137,195                          ; mov           %rax,%rbx
-  DB  118,204                             ; jbe           73 <_sk_start_pipeline_sse41+0x73>
-  DB  15,40,52,36                         ; movaps        (%rsp),%xmm6
-  DB  15,40,124,36,16                     ; movaps        0x10(%rsp),%xmm7
-  DB  68,15,40,68,36,32                   ; movaps        0x20(%rsp),%xmm8
-  DB  68,15,40,76,36,48                   ; movaps        0x30(%rsp),%xmm9
-  DB  68,15,40,84,36,64                   ; movaps        0x40(%rsp),%xmm10
-  DB  68,15,40,92,36,80                   ; movaps        0x50(%rsp),%xmm11
-  DB  68,15,40,100,36,96                  ; movaps        0x60(%rsp),%xmm12
-  DB  68,15,40,108,36,112                 ; movaps        0x70(%rsp),%xmm13
-  DB  68,15,40,180,36,128,0,0,0           ; movaps        0x80(%rsp),%xmm14
-  DB  68,15,40,188,36,144,0,0,0           ; movaps        0x90(%rsp),%xmm15
-  DB  72,129,196,160,0,0,0                ; add           $0xa0,%rsp
-  DB  91                                  ; pop           %rbx
-  DB  95                                  ; pop           %rdi
-  DB  94                                  ; pop           %rsi
-  DB  65,92                               ; pop           %r12
-  DB  65,93                               ; pop           %r13
-  DB  65,94                               ; pop           %r14
-  DB  65,95                               ; pop           %r15
-  DB  195                                 ; retq
-
-PUBLIC _sk_just_return_sse41
-_sk_just_return_sse41 LABEL PROC
-  DB  195                                 ; retq
-
-PUBLIC _sk_seed_shader_sse41
-_sk_seed_shader_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  102,15,110,199                      ; movd          %edi,%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  15,91,200                           ; cvtdq2ps      %xmm0,%xmm1
-  DB  243,15,16,18                        ; movss         (%rdx),%xmm2
-  DB  243,15,16,90,4                      ; movss         0x4(%rdx),%xmm3
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  15,88,203                           ; addps         %xmm3,%xmm1
-  DB  15,16,66,20                         ; movups        0x14(%rdx),%xmm0
-  DB  15,88,193                           ; addps         %xmm1,%xmm0
-  DB  102,15,110,8                        ; movd          (%rax),%xmm1
-  DB  102,15,112,201,0                    ; pshufd        $0x0,%xmm1,%xmm1
-  DB  15,91,201                           ; cvtdq2ps      %xmm1,%xmm1
-  DB  15,88,203                           ; addps         %xmm3,%xmm1
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,87,219                           ; xorps         %xmm3,%xmm3
-  DB  15,87,228                           ; xorps         %xmm4,%xmm4
-  DB  15,87,237                           ; xorps         %xmm5,%xmm5
-  DB  15,87,246                           ; xorps         %xmm6,%xmm6
-  DB  15,87,255                           ; xorps         %xmm7,%xmm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_constant_color_sse41
-_sk_constant_color_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,16,24                            ; movups        (%rax),%xmm3
-  DB  15,40,195                           ; movaps        %xmm3,%xmm0
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  15,40,203                           ; movaps        %xmm3,%xmm1
-  DB  15,198,201,85                       ; shufps        $0x55,%xmm1,%xmm1
-  DB  15,40,211                           ; movaps        %xmm3,%xmm2
-  DB  15,198,210,170                      ; shufps        $0xaa,%xmm2,%xmm2
-  DB  15,198,219,255                      ; shufps        $0xff,%xmm3,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clear_sse41
-_sk_clear_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,87,192                           ; xorps         %xmm0,%xmm0
-  DB  15,87,201                           ; xorps         %xmm1,%xmm1
-  DB  15,87,210                           ; xorps         %xmm2,%xmm2
-  DB  15,87,219                           ; xorps         %xmm3,%xmm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_plus__sse41
-_sk_plus__sse41 LABEL PROC
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_srcover_sse41
-_sk_srcover_sse41 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,92,195                        ; subps         %xmm3,%xmm8
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,204                        ; mulps         %xmm4,%xmm9
-  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,205                        ; mulps         %xmm5,%xmm9
-  DB  65,15,88,201                        ; addps         %xmm9,%xmm1
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,206                        ; mulps         %xmm6,%xmm9
-  DB  65,15,88,209                        ; addps         %xmm9,%xmm2
-  DB  68,15,89,199                        ; mulps         %xmm7,%xmm8
-  DB  65,15,88,216                        ; addps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_dstover_sse41
-_sk_dstover_sse41 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,92,199                        ; subps         %xmm7,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_0_sse41
-_sk_clamp_0_sse41 LABEL PROC
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  65,15,95,192                        ; maxps         %xmm8,%xmm0
-  DB  65,15,95,200                        ; maxps         %xmm8,%xmm1
-  DB  65,15,95,208                        ; maxps         %xmm8,%xmm2
-  DB  65,15,95,216                        ; maxps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_1_sse41
-_sk_clamp_1_sse41 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,93,192                        ; minps         %xmm8,%xmm0
-  DB  65,15,93,200                        ; minps         %xmm8,%xmm1
-  DB  65,15,93,208                        ; minps         %xmm8,%xmm2
-  DB  65,15,93,216                        ; minps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_a_sse41
-_sk_clamp_a_sse41 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,93,216                        ; minps         %xmm8,%xmm3
-  DB  15,93,195                           ; minps         %xmm3,%xmm0
-  DB  15,93,203                           ; minps         %xmm3,%xmm1
-  DB  15,93,211                           ; minps         %xmm3,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_set_rgb_sse41
-_sk_set_rgb_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,15,16,72,4                      ; movss         0x4(%rax),%xmm1
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  243,15,16,80,8                      ; movss         0x8(%rax),%xmm2
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_rb_sse41
-_sk_swap_rb_sse41 LABEL PROC
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,194                           ; movaps        %xmm2,%xmm0
-  DB  65,15,40,208                        ; movaps        %xmm8,%xmm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_sse41
-_sk_swap_sse41 LABEL PROC
-  DB  68,15,40,195                        ; movaps        %xmm3,%xmm8
-  DB  68,15,40,202                        ; movaps        %xmm2,%xmm9
-  DB  68,15,40,209                        ; movaps        %xmm1,%xmm10
-  DB  68,15,40,216                        ; movaps        %xmm0,%xmm11
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,196                           ; movaps        %xmm4,%xmm0
-  DB  15,40,205                           ; movaps        %xmm5,%xmm1
-  DB  15,40,214                           ; movaps        %xmm6,%xmm2
-  DB  15,40,223                           ; movaps        %xmm7,%xmm3
-  DB  65,15,40,227                        ; movaps        %xmm11,%xmm4
-  DB  65,15,40,234                        ; movaps        %xmm10,%xmm5
-  DB  65,15,40,241                        ; movaps        %xmm9,%xmm6
-  DB  65,15,40,248                        ; movaps        %xmm8,%xmm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_src_dst_sse41
-_sk_move_src_dst_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,224                           ; movaps        %xmm0,%xmm4
-  DB  15,40,233                           ; movaps        %xmm1,%xmm5
-  DB  15,40,242                           ; movaps        %xmm2,%xmm6
-  DB  15,40,251                           ; movaps        %xmm3,%xmm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_dst_src_sse41
-_sk_move_dst_src_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,196                           ; movaps        %xmm4,%xmm0
-  DB  15,40,205                           ; movaps        %xmm5,%xmm1
-  DB  15,40,214                           ; movaps        %xmm6,%xmm2
-  DB  15,40,223                           ; movaps        %xmm7,%xmm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_premul_sse41
-_sk_premul_sse41 LABEL PROC
-  DB  15,89,195                           ; mulps         %xmm3,%xmm0
-  DB  15,89,203                           ; mulps         %xmm3,%xmm1
-  DB  15,89,211                           ; mulps         %xmm3,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_unpremul_sse41
-_sk_unpremul_sse41 LABEL PROC
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  69,15,87,201                        ; xorps         %xmm9,%xmm9
-  DB  243,68,15,16,18                     ; movss         (%rdx),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  68,15,94,211                        ; divps         %xmm3,%xmm10
-  DB  15,40,195                           ; movaps        %xmm3,%xmm0
-  DB  65,15,194,193,0                     ; cmpeqps       %xmm9,%xmm0
-  DB  102,69,15,56,20,209                 ; blendvps      %xmm0,%xmm9,%xmm10
-  DB  69,15,89,194                        ; mulps         %xmm10,%xmm8
-  DB  65,15,89,202                        ; mulps         %xmm10,%xmm1
-  DB  65,15,89,210                        ; mulps         %xmm10,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_from_srgb_sse41
-_sk_from_srgb_sse41 LABEL PROC
-  DB  68,15,40,194                        ; movaps        %xmm2,%xmm8
-  DB  243,68,15,16,90,64                  ; movss         0x40(%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,40,211                        ; movaps        %xmm11,%xmm10
-  DB  68,15,89,208                        ; mulps         %xmm0,%xmm10
-  DB  68,15,40,240                        ; movaps        %xmm0,%xmm14
-  DB  69,15,89,246                        ; mulps         %xmm14,%xmm14
-  DB  243,15,16,82,60                     ; movss         0x3c(%rdx),%xmm2
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  243,68,15,16,98,52                  ; movss         0x34(%rdx),%xmm12
-  DB  243,68,15,16,106,56                 ; movss         0x38(%rdx),%xmm13
-  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
-  DB  68,15,40,202                        ; movaps        %xmm2,%xmm9
-  DB  68,15,89,200                        ; mulps         %xmm0,%xmm9
-  DB  69,15,88,205                        ; addps         %xmm13,%xmm9
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  69,15,89,206                        ; mulps         %xmm14,%xmm9
-  DB  69,15,88,204                        ; addps         %xmm12,%xmm9
-  DB  243,68,15,16,114,68                 ; movss         0x44(%rdx),%xmm14
-  DB  69,15,198,246,0                     ; shufps        $0x0,%xmm14,%xmm14
-  DB  65,15,194,198,1                     ; cmpltps       %xmm14,%xmm0
-  DB  102,69,15,56,20,202                 ; blendvps      %xmm0,%xmm10,%xmm9
-  DB  69,15,40,251                        ; movaps        %xmm11,%xmm15
-  DB  68,15,89,249                        ; mulps         %xmm1,%xmm15
-  DB  15,40,193                           ; movaps        %xmm1,%xmm0
-  DB  15,89,192                           ; mulps         %xmm0,%xmm0
-  DB  68,15,40,210                        ; movaps        %xmm2,%xmm10
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  69,15,88,213                        ; addps         %xmm13,%xmm10
-  DB  68,15,89,208                        ; mulps         %xmm0,%xmm10
-  DB  69,15,88,212                        ; addps         %xmm12,%xmm10
-  DB  65,15,194,206,1                     ; cmpltps       %xmm14,%xmm1
-  DB  15,40,193                           ; movaps        %xmm1,%xmm0
-  DB  102,69,15,56,20,215                 ; blendvps      %xmm0,%xmm15,%xmm10
-  DB  69,15,89,216                        ; mulps         %xmm8,%xmm11
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  15,89,192                           ; mulps         %xmm0,%xmm0
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  65,15,88,213                        ; addps         %xmm13,%xmm2
-  DB  15,89,208                           ; mulps         %xmm0,%xmm2
-  DB  65,15,88,212                        ; addps         %xmm12,%xmm2
-  DB  69,15,194,198,1                     ; cmpltps       %xmm14,%xmm8
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  102,65,15,56,20,211                 ; blendvps      %xmm0,%xmm11,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,193                        ; movaps        %xmm9,%xmm0
-  DB  65,15,40,202                        ; movaps        %xmm10,%xmm1
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_to_srgb_sse41
-_sk_to_srgb_sse41 LABEL PROC
-  DB  72,131,236,24                       ; sub           $0x18,%rsp
-  DB  15,41,60,36                         ; movaps        %xmm7,(%rsp)
-  DB  15,40,254                           ; movaps        %xmm6,%xmm7
-  DB  15,40,245                           ; movaps        %xmm5,%xmm6
-  DB  15,40,236                           ; movaps        %xmm4,%xmm5
-  DB  15,40,227                           ; movaps        %xmm3,%xmm4
-  DB  68,15,40,194                        ; movaps        %xmm2,%xmm8
-  DB  15,40,217                           ; movaps        %xmm1,%xmm3
-  DB  15,82,208                           ; rsqrtps       %xmm0,%xmm2
-  DB  68,15,83,202                        ; rcpps         %xmm2,%xmm9
-  DB  68,15,82,210                        ; rsqrtps       %xmm2,%xmm10
-  DB  243,15,16,18                        ; movss         (%rdx),%xmm2
-  DB  243,68,15,16,90,72                  ; movss         0x48(%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  65,15,40,203                        ; movaps        %xmm11,%xmm1
-  DB  15,89,200                           ; mulps         %xmm0,%xmm1
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  243,68,15,16,98,76                  ; movss         0x4c(%rdx),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  243,68,15,16,106,80                 ; movss         0x50(%rdx),%xmm13
-  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
-  DB  243,68,15,16,114,84                 ; movss         0x54(%rdx),%xmm14
-  DB  69,15,198,246,0                     ; shufps        $0x0,%xmm14,%xmm14
-  DB  69,15,89,205                        ; mulps         %xmm13,%xmm9
-  DB  69,15,88,206                        ; addps         %xmm14,%xmm9
-  DB  69,15,89,212                        ; mulps         %xmm12,%xmm10
-  DB  69,15,88,209                        ; addps         %xmm9,%xmm10
-  DB  68,15,40,202                        ; movaps        %xmm2,%xmm9
-  DB  69,15,93,202                        ; minps         %xmm10,%xmm9
-  DB  243,68,15,16,122,88                 ; movss         0x58(%rdx),%xmm15
-  DB  69,15,198,255,0                     ; shufps        $0x0,%xmm15,%xmm15
-  DB  65,15,194,199,1                     ; cmpltps       %xmm15,%xmm0
-  DB  102,68,15,56,20,201                 ; blendvps      %xmm0,%xmm1,%xmm9
-  DB  15,82,195                           ; rsqrtps       %xmm3,%xmm0
-  DB  15,83,200                           ; rcpps         %xmm0,%xmm1
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  65,15,89,205                        ; mulps         %xmm13,%xmm1
-  DB  65,15,88,206                        ; addps         %xmm14,%xmm1
-  DB  65,15,89,196                        ; mulps         %xmm12,%xmm0
-  DB  15,88,193                           ; addps         %xmm1,%xmm0
-  DB  68,15,40,210                        ; movaps        %xmm2,%xmm10
-  DB  68,15,93,208                        ; minps         %xmm0,%xmm10
-  DB  65,15,40,203                        ; movaps        %xmm11,%xmm1
-  DB  15,89,203                           ; mulps         %xmm3,%xmm1
-  DB  65,15,194,223,1                     ; cmpltps       %xmm15,%xmm3
-  DB  15,40,195                           ; movaps        %xmm3,%xmm0
-  DB  102,68,15,56,20,209                 ; blendvps      %xmm0,%xmm1,%xmm10
-  DB  65,15,82,192                        ; rsqrtps       %xmm8,%xmm0
-  DB  15,83,200                           ; rcpps         %xmm0,%xmm1
-  DB  65,15,89,205                        ; mulps         %xmm13,%xmm1
-  DB  65,15,88,206                        ; addps         %xmm14,%xmm1
-  DB  15,82,192                           ; rsqrtps       %xmm0,%xmm0
-  DB  65,15,89,196                        ; mulps         %xmm12,%xmm0
-  DB  15,88,193                           ; addps         %xmm1,%xmm0
-  DB  15,93,208                           ; minps         %xmm0,%xmm2
-  DB  69,15,89,216                        ; mulps         %xmm8,%xmm11
-  DB  69,15,194,199,1                     ; cmpltps       %xmm15,%xmm8
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  102,65,15,56,20,211                 ; blendvps      %xmm0,%xmm11,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,193                        ; movaps        %xmm9,%xmm0
-  DB  65,15,40,202                        ; movaps        %xmm10,%xmm1
-  DB  15,40,220                           ; movaps        %xmm4,%xmm3
-  DB  15,40,229                           ; movaps        %xmm5,%xmm4
-  DB  15,40,238                           ; movaps        %xmm6,%xmm5
-  DB  15,40,247                           ; movaps        %xmm7,%xmm6
-  DB  15,40,60,36                         ; movaps        (%rsp),%xmm7
-  DB  72,131,196,24                       ; add           $0x18,%rsp
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_1_float_sse41
-_sk_scale_1_float_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_u8_sse41
-_sk_scale_u8_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,56,49,4,56                ; pmovzxbd      (%rax,%rdi,1),%xmm8
-  DB  69,15,91,192                        ; cvtdq2ps      %xmm8,%xmm8
-  DB  243,68,15,16,74,12                  ; movss         0xc(%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  65,15,89,217                        ; mulps         %xmm9,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_1_float_sse41
-_sk_lerp_1_float_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  15,92,196                           ; subps         %xmm4,%xmm0
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,92,205                           ; subps         %xmm5,%xmm1
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,92,214                           ; subps         %xmm6,%xmm2
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,92,223                           ; subps         %xmm7,%xmm3
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_u8_sse41
-_sk_lerp_u8_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,56,49,4,56                ; pmovzxbd      (%rax,%rdi,1),%xmm8
-  DB  69,15,91,192                        ; cvtdq2ps      %xmm8,%xmm8
-  DB  243,68,15,16,74,12                  ; movss         0xc(%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  15,92,196                           ; subps         %xmm4,%xmm0
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,92,205                           ; subps         %xmm5,%xmm1
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,92,214                           ; subps         %xmm6,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,92,223                           ; subps         %xmm7,%xmm3
-  DB  65,15,89,217                        ; mulps         %xmm9,%xmm3
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_565_sse41
-_sk_lerp_565_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,56,51,4,120               ; pmovzxwd      (%rax,%rdi,2),%xmm8
-  DB  102,15,110,90,104                   ; movd          0x68(%rdx),%xmm3
-  DB  102,15,112,219,0                    ; pshufd        $0x0,%xmm3,%xmm3
-  DB  102,65,15,219,216                   ; pand          %xmm8,%xmm3
-  DB  68,15,91,203                        ; cvtdq2ps      %xmm3,%xmm9
-  DB  243,15,16,26                        ; movss         (%rdx),%xmm3
-  DB  243,68,15,16,82,116                 ; movss         0x74(%rdx),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  102,68,15,110,74,108                ; movd          0x6c(%rdx),%xmm9
-  DB  102,69,15,112,201,0                 ; pshufd        $0x0,%xmm9,%xmm9
-  DB  102,69,15,219,200                   ; pand          %xmm8,%xmm9
-  DB  69,15,91,201                        ; cvtdq2ps      %xmm9,%xmm9
-  DB  243,68,15,16,90,120                 ; movss         0x78(%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,89,217                        ; mulps         %xmm9,%xmm11
-  DB  102,68,15,110,74,112                ; movd          0x70(%rdx),%xmm9
-  DB  102,69,15,112,201,0                 ; pshufd        $0x0,%xmm9,%xmm9
-  DB  102,69,15,219,200                   ; pand          %xmm8,%xmm9
-  DB  69,15,91,193                        ; cvtdq2ps      %xmm9,%xmm8
-  DB  243,68,15,16,74,124                 ; movss         0x7c(%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  15,92,196                           ; subps         %xmm4,%xmm0
-  DB  65,15,89,194                        ; mulps         %xmm10,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,92,205                           ; subps         %xmm5,%xmm1
-  DB  65,15,89,203                        ; mulps         %xmm11,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,92,214                           ; subps         %xmm6,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_tables_sse41
-_sk_load_tables_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,8                            ; mov           (%rax),%rcx
-  DB  76,139,64,8                         ; mov           0x8(%rax),%r8
-  DB  243,68,15,111,4,185                 ; movdqu        (%rcx,%rdi,4),%xmm8
-  DB  102,15,110,66,16                    ; movd          0x10(%rdx),%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  102,65,15,111,200                   ; movdqa        %xmm8,%xmm1
-  DB  102,15,114,209,8                    ; psrld         $0x8,%xmm1
-  DB  102,15,219,200                      ; pand          %xmm0,%xmm1
-  DB  102,65,15,111,208                   ; movdqa        %xmm8,%xmm2
-  DB  102,15,114,210,16                   ; psrld         $0x10,%xmm2
-  DB  102,15,219,208                      ; pand          %xmm0,%xmm2
-  DB  102,65,15,219,192                   ; pand          %xmm8,%xmm0
-  DB  102,72,15,58,22,193,1               ; pextrq        $0x1,%xmm0,%rcx
-  DB  65,137,201                          ; mov           %ecx,%r9d
-  DB  72,193,233,32                       ; shr           $0x20,%rcx
-  DB  102,73,15,126,194                   ; movq          %xmm0,%r10
-  DB  69,137,211                          ; mov           %r10d,%r11d
-  DB  73,193,234,32                       ; shr           $0x20,%r10
-  DB  243,67,15,16,4,152                  ; movss         (%r8,%r11,4),%xmm0
-  DB  102,67,15,58,33,4,144,16            ; insertps      $0x10,(%r8,%r10,4),%xmm0
-  DB  102,67,15,58,33,4,136,32            ; insertps      $0x20,(%r8,%r9,4),%xmm0
-  DB  102,65,15,58,33,4,136,48            ; insertps      $0x30,(%r8,%rcx,4),%xmm0
-  DB  72,139,72,16                        ; mov           0x10(%rax),%rcx
-  DB  102,73,15,58,22,200,1               ; pextrq        $0x1,%xmm1,%r8
-  DB  69,137,193                          ; mov           %r8d,%r9d
-  DB  73,193,232,32                       ; shr           $0x20,%r8
-  DB  102,73,15,126,202                   ; movq          %xmm1,%r10
-  DB  69,137,211                          ; mov           %r10d,%r11d
-  DB  73,193,234,32                       ; shr           $0x20,%r10
-  DB  243,66,15,16,12,153                 ; movss         (%rcx,%r11,4),%xmm1
-  DB  102,66,15,58,33,12,145,16           ; insertps      $0x10,(%rcx,%r10,4),%xmm1
-  DB  243,66,15,16,28,137                 ; movss         (%rcx,%r9,4),%xmm3
-  DB  102,15,58,33,203,32                 ; insertps      $0x20,%xmm3,%xmm1
-  DB  243,66,15,16,28,129                 ; movss         (%rcx,%r8,4),%xmm3
-  DB  102,15,58,33,203,48                 ; insertps      $0x30,%xmm3,%xmm1
-  DB  72,139,64,24                        ; mov           0x18(%rax),%rax
-  DB  102,72,15,58,22,209,1               ; pextrq        $0x1,%xmm2,%rcx
-  DB  65,137,200                          ; mov           %ecx,%r8d
-  DB  72,193,233,32                       ; shr           $0x20,%rcx
-  DB  102,73,15,126,209                   ; movq          %xmm2,%r9
-  DB  69,137,202                          ; mov           %r9d,%r10d
-  DB  73,193,233,32                       ; shr           $0x20,%r9
-  DB  243,66,15,16,20,144                 ; movss         (%rax,%r10,4),%xmm2
-  DB  102,66,15,58,33,20,136,16           ; insertps      $0x10,(%rax,%r9,4),%xmm2
-  DB  243,66,15,16,28,128                 ; movss         (%rax,%r8,4),%xmm3
-  DB  102,15,58,33,211,32                 ; insertps      $0x20,%xmm3,%xmm2
-  DB  243,15,16,28,136                    ; movss         (%rax,%rcx,4),%xmm3
-  DB  102,15,58,33,211,48                 ; insertps      $0x30,%xmm3,%xmm2
-  DB  102,65,15,114,208,24                ; psrld         $0x18,%xmm8
-  DB  69,15,91,192                        ; cvtdq2ps      %xmm8,%xmm8
-  DB  243,15,16,90,12                     ; movss         0xc(%rdx),%xmm3
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_a8_sse41
-_sk_load_a8_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,15,56,49,4,56                   ; pmovzxbd      (%rax,%rdi,1),%xmm0
-  DB  15,91,192                           ; cvtdq2ps      %xmm0,%xmm0
-  DB  243,15,16,90,12                     ; movss         0xc(%rdx),%xmm3
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  15,89,216                           ; mulps         %xmm0,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,87,192                           ; xorps         %xmm0,%xmm0
-  DB  15,87,201                           ; xorps         %xmm1,%xmm1
-  DB  15,87,210                           ; xorps         %xmm2,%xmm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_a8_sse41
-_sk_store_a8_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,16,66,8                   ; movss         0x8(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,69,15,91,192                    ; cvtps2dq      %xmm8,%xmm8
-  DB  102,69,15,56,43,192                 ; packusdw      %xmm8,%xmm8
-  DB  102,69,15,103,192                   ; packuswb      %xmm8,%xmm8
-  DB  102,68,15,126,4,56                  ; movd          %xmm8,(%rax,%rdi,1)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_565_sse41
-_sk_load_565_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,56,51,12,120              ; pmovzxwd      (%rax,%rdi,2),%xmm9
-  DB  102,15,110,66,104                   ; movd          0x68(%rdx),%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  102,65,15,219,193                   ; pand          %xmm9,%xmm0
-  DB  15,91,200                           ; cvtdq2ps      %xmm0,%xmm1
-  DB  243,15,16,26                        ; movss         (%rdx),%xmm3
-  DB  243,15,16,66,116                    ; movss         0x74(%rdx),%xmm0
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  15,89,193                           ; mulps         %xmm1,%xmm0
-  DB  102,15,110,74,108                   ; movd          0x6c(%rdx),%xmm1
-  DB  102,15,112,201,0                    ; pshufd        $0x0,%xmm1,%xmm1
-  DB  102,65,15,219,201                   ; pand          %xmm9,%xmm1
-  DB  68,15,91,193                        ; cvtdq2ps      %xmm1,%xmm8
-  DB  243,15,16,74,120                    ; movss         0x78(%rdx),%xmm1
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  102,15,110,82,112                   ; movd          0x70(%rdx),%xmm2
-  DB  102,15,112,210,0                    ; pshufd        $0x0,%xmm2,%xmm2
-  DB  102,65,15,219,209                   ; pand          %xmm9,%xmm2
-  DB  68,15,91,194                        ; cvtdq2ps      %xmm2,%xmm8
-  DB  243,15,16,82,124                    ; movss         0x7c(%rdx),%xmm2
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_565_sse41
-_sk_store_565_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,16,130,128,0,0,0          ; movss         0x80(%rdx),%xmm8
-  DB  243,68,15,16,138,132,0,0,0          ; movss         0x84(%rdx),%xmm9
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  69,15,40,208                        ; movaps        %xmm8,%xmm10
-  DB  68,15,89,208                        ; mulps         %xmm0,%xmm10
-  DB  102,69,15,91,210                    ; cvtps2dq      %xmm10,%xmm10
-  DB  102,65,15,114,242,11                ; pslld         $0xb,%xmm10
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  102,69,15,91,201                    ; cvtps2dq      %xmm9,%xmm9
-  DB  102,65,15,114,241,5                 ; pslld         $0x5,%xmm9
-  DB  102,69,15,235,202                   ; por           %xmm10,%xmm9
-  DB  68,15,89,194                        ; mulps         %xmm2,%xmm8
-  DB  102,69,15,91,192                    ; cvtps2dq      %xmm8,%xmm8
-  DB  102,69,15,86,193                    ; orpd          %xmm9,%xmm8
-  DB  102,69,15,56,43,192                 ; packusdw      %xmm8,%xmm8
-  DB  102,68,15,214,4,120                 ; movq          %xmm8,(%rax,%rdi,2)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_8888_sse41
-_sk_load_8888_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,15,111,28,184                   ; movdqu        (%rax,%rdi,4),%xmm3
-  DB  102,15,110,66,16                    ; movd          0x10(%rdx),%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  102,15,111,203                      ; movdqa        %xmm3,%xmm1
-  DB  102,15,114,209,8                    ; psrld         $0x8,%xmm1
-  DB  102,15,219,200                      ; pand          %xmm0,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,15,114,210,16                   ; psrld         $0x10,%xmm2
-  DB  102,15,219,208                      ; pand          %xmm0,%xmm2
-  DB  102,15,219,195                      ; pand          %xmm3,%xmm0
-  DB  15,91,192                           ; cvtdq2ps      %xmm0,%xmm0
-  DB  243,68,15,16,66,12                  ; movss         0xc(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  15,91,201                           ; cvtdq2ps      %xmm1,%xmm1
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  15,91,210                           ; cvtdq2ps      %xmm2,%xmm2
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  102,15,114,211,24                   ; psrld         $0x18,%xmm3
-  DB  15,91,219                           ; cvtdq2ps      %xmm3,%xmm3
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_8888_sse41
-_sk_store_8888_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,16,66,8                   ; movss         0x8(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,200                        ; mulps         %xmm0,%xmm9
-  DB  102,69,15,91,201                    ; cvtps2dq      %xmm9,%xmm9
-  DB  69,15,40,208                        ; movaps        %xmm8,%xmm10
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  102,69,15,91,210                    ; cvtps2dq      %xmm10,%xmm10
-  DB  102,65,15,114,242,8                 ; pslld         $0x8,%xmm10
-  DB  102,69,15,235,209                   ; por           %xmm9,%xmm10
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,202                        ; mulps         %xmm2,%xmm9
-  DB  102,69,15,91,201                    ; cvtps2dq      %xmm9,%xmm9
-  DB  102,65,15,114,241,16                ; pslld         $0x10,%xmm9
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,69,15,91,192                    ; cvtps2dq      %xmm8,%xmm8
-  DB  102,65,15,114,240,24                ; pslld         $0x18,%xmm8
-  DB  102,69,15,235,193                   ; por           %xmm9,%xmm8
-  DB  102,69,15,235,194                   ; por           %xmm10,%xmm8
-  DB  243,68,15,127,4,184                 ; movdqu        %xmm8,(%rax,%rdi,4)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_f16_sse41
-_sk_load_f16_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,15,111,4,248                    ; movdqu        (%rax,%rdi,8),%xmm0
-  DB  243,15,111,76,248,16                ; movdqu        0x10(%rax,%rdi,8),%xmm1
-  DB  102,15,111,208                      ; movdqa        %xmm0,%xmm2
-  DB  102,15,97,209                       ; punpcklwd     %xmm1,%xmm2
-  DB  102,15,105,193                      ; punpckhwd     %xmm1,%xmm0
-  DB  102,68,15,111,194                   ; movdqa        %xmm2,%xmm8
-  DB  102,68,15,97,192                    ; punpcklwd     %xmm0,%xmm8
-  DB  102,15,105,208                      ; punpckhwd     %xmm0,%xmm2
-  DB  102,15,110,66,100                   ; movd          0x64(%rdx),%xmm0
-  DB  102,15,112,216,0                    ; pshufd        $0x0,%xmm0,%xmm3
-  DB  102,15,111,203                      ; movdqa        %xmm3,%xmm1
-  DB  102,65,15,101,200                   ; pcmpgtw       %xmm8,%xmm1
-  DB  102,65,15,223,200                   ; pandn         %xmm8,%xmm1
-  DB  102,15,101,218                      ; pcmpgtw       %xmm2,%xmm3
-  DB  102,15,223,218                      ; pandn         %xmm2,%xmm3
-  DB  102,15,56,51,193                    ; pmovzxwd      %xmm1,%xmm0
-  DB  102,15,114,240,13                   ; pslld         $0xd,%xmm0
-  DB  102,15,110,82,92                    ; movd          0x5c(%rdx),%xmm2
-  DB  102,68,15,112,194,0                 ; pshufd        $0x0,%xmm2,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  102,69,15,239,201                   ; pxor          %xmm9,%xmm9
-  DB  102,65,15,105,201                   ; punpckhwd     %xmm9,%xmm1
-  DB  102,15,114,241,13                   ; pslld         $0xd,%xmm1
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  102,15,56,51,211                    ; pmovzxwd      %xmm3,%xmm2
-  DB  102,15,114,242,13                   ; pslld         $0xd,%xmm2
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  102,65,15,105,217                   ; punpckhwd     %xmm9,%xmm3
-  DB  102,15,114,243,13                   ; pslld         $0xd,%xmm3
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_f16_sse41
-_sk_store_f16_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,110,66,96                 ; movd          0x60(%rdx),%xmm8
-  DB  102,69,15,112,192,0                 ; pshufd        $0x0,%xmm8,%xmm8
-  DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
-  DB  68,15,89,200                        ; mulps         %xmm0,%xmm9
-  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
-  DB  102,69,15,111,208                   ; movdqa        %xmm8,%xmm10
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  102,65,15,114,210,13                ; psrld         $0xd,%xmm10
-  DB  102,69,15,111,216                   ; movdqa        %xmm8,%xmm11
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  102,65,15,114,211,13                ; psrld         $0xd,%xmm11
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
-  DB  102,65,15,115,250,2                 ; pslldq        $0x2,%xmm10
-  DB  102,69,15,235,209                   ; por           %xmm9,%xmm10
-  DB  102,65,15,115,248,2                 ; pslldq        $0x2,%xmm8
-  DB  102,69,15,235,195                   ; por           %xmm11,%xmm8
-  DB  102,69,15,111,202                   ; movdqa        %xmm10,%xmm9
-  DB  102,69,15,98,200                    ; punpckldq     %xmm8,%xmm9
-  DB  243,68,15,127,12,248                ; movdqu        %xmm9,(%rax,%rdi,8)
-  DB  102,69,15,106,208                   ; punpckhdq     %xmm8,%xmm10
-  DB  243,68,15,127,84,248,16             ; movdqu        %xmm10,0x10(%rax,%rdi,8)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_f32_sse41
-_sk_store_f32_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,137,249                          ; mov           %rdi,%rcx
-  DB  72,193,225,4                        ; shl           $0x4,%rcx
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  68,15,40,200                        ; movaps        %xmm0,%xmm9
-  DB  68,15,20,201                        ; unpcklps      %xmm1,%xmm9
-  DB  68,15,40,210                        ; movaps        %xmm2,%xmm10
-  DB  68,15,40,218                        ; movaps        %xmm2,%xmm11
-  DB  68,15,20,219                        ; unpcklps      %xmm3,%xmm11
-  DB  68,15,21,193                        ; unpckhps      %xmm1,%xmm8
-  DB  68,15,21,211                        ; unpckhps      %xmm3,%xmm10
-  DB  69,15,40,225                        ; movaps        %xmm9,%xmm12
-  DB  102,69,15,20,227                    ; unpcklpd      %xmm11,%xmm12
-  DB  102,69,15,21,203                    ; unpckhpd      %xmm11,%xmm9
-  DB  69,15,40,216                        ; movaps        %xmm8,%xmm11
-  DB  102,69,15,20,218                    ; unpcklpd      %xmm10,%xmm11
-  DB  102,69,15,21,194                    ; unpckhpd      %xmm10,%xmm8
-  DB  102,68,15,17,36,8                   ; movupd        %xmm12,(%rax,%rcx,1)
-  DB  102,68,15,17,76,8,16                ; movupd        %xmm9,0x10(%rax,%rcx,1)
-  DB  102,68,15,17,92,8,32                ; movupd        %xmm11,0x20(%rax,%rcx,1)
-  DB  102,68,15,17,68,8,48                ; movupd        %xmm8,0x30(%rax,%rcx,1)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_x_sse41
-_sk_clamp_x_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  68,15,95,192                        ; maxps         %xmm0,%xmm8
-  DB  243,68,15,16,8                      ; movss         (%rax),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  102,15,118,192                      ; pcmpeqd       %xmm0,%xmm0
-  DB  102,65,15,254,193                   ; paddd         %xmm9,%xmm0
-  DB  68,15,93,192                        ; minps         %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_y_sse41
-_sk_clamp_y_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  68,15,95,193                        ; maxps         %xmm1,%xmm8
-  DB  243,68,15,16,8                      ; movss         (%rax),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  102,15,118,201                      ; pcmpeqd       %xmm1,%xmm1
-  DB  102,65,15,254,201                   ; paddd         %xmm9,%xmm1
-  DB  68,15,93,193                        ; minps         %xmm1,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,200                        ; movaps        %xmm8,%xmm1
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_x_sse41
-_sk_repeat_x_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,40,200                        ; movaps        %xmm0,%xmm9
-  DB  69,15,94,200                        ; divps         %xmm8,%xmm9
-  DB  102,69,15,58,8,201,1                ; roundps       $0x1,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  65,15,92,193                        ; subps         %xmm9,%xmm0
-  DB  102,69,15,118,201                   ; pcmpeqd       %xmm9,%xmm9
-  DB  102,69,15,254,200                   ; paddd         %xmm8,%xmm9
-  DB  65,15,93,193                        ; minps         %xmm9,%xmm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_y_sse41
-_sk_repeat_y_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,40,201                        ; movaps        %xmm1,%xmm9
-  DB  69,15,94,200                        ; divps         %xmm8,%xmm9
-  DB  102,69,15,58,8,201,1                ; roundps       $0x1,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  65,15,92,201                        ; subps         %xmm9,%xmm1
-  DB  102,69,15,118,201                   ; pcmpeqd       %xmm9,%xmm9
-  DB  102,69,15,254,200                   ; paddd         %xmm8,%xmm9
-  DB  65,15,93,201                        ; minps         %xmm9,%xmm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_x_sse41
-_sk_mirror_x_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  65,15,92,193                        ; subps         %xmm9,%xmm0
-  DB  243,69,15,88,192                    ; addss         %xmm8,%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,40,208                        ; movaps        %xmm0,%xmm10
-  DB  69,15,94,208                        ; divps         %xmm8,%xmm10
-  DB  102,69,15,58,8,210,1                ; roundps       $0x1,%xmm10,%xmm10
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  65,15,92,194                        ; subps         %xmm10,%xmm0
-  DB  65,15,92,193                        ; subps         %xmm9,%xmm0
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  68,15,92,192                        ; subps         %xmm0,%xmm8
-  DB  65,15,84,192                        ; andps         %xmm8,%xmm0
-  DB  102,69,15,118,192                   ; pcmpeqd       %xmm8,%xmm8
-  DB  102,69,15,254,193                   ; paddd         %xmm9,%xmm8
-  DB  65,15,93,192                        ; minps         %xmm8,%xmm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_y_sse41
-_sk_mirror_y_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  65,15,92,201                        ; subps         %xmm9,%xmm1
-  DB  243,69,15,88,192                    ; addss         %xmm8,%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,40,209                        ; movaps        %xmm1,%xmm10
-  DB  69,15,94,208                        ; divps         %xmm8,%xmm10
-  DB  102,69,15,58,8,210,1                ; roundps       $0x1,%xmm10,%xmm10
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  65,15,92,202                        ; subps         %xmm10,%xmm1
-  DB  65,15,92,201                        ; subps         %xmm9,%xmm1
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  68,15,92,193                        ; subps         %xmm1,%xmm8
-  DB  65,15,84,200                        ; andps         %xmm8,%xmm1
-  DB  102,69,15,118,192                   ; pcmpeqd       %xmm8,%xmm8
-  DB  102,69,15,254,193                   ; paddd         %xmm9,%xmm8
-  DB  65,15,93,200                        ; minps         %xmm8,%xmm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_2x3_sse41
-_sk_matrix_2x3_sse41 LABEL PROC
-  DB  68,15,40,201                        ; movaps        %xmm1,%xmm9
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,15,16,72,4                      ; movss         0x4(%rax),%xmm1
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,16                  ; movss         0x10(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,88,194                        ; addps         %xmm10,%xmm0
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  243,68,15,16,80,12                  ; movss         0xc(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  65,15,88,202                        ; addps         %xmm10,%xmm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_3x4_sse41
-_sk_matrix_3x4_sse41 LABEL PROC
-  DB  68,15,40,201                        ; movaps        %xmm1,%xmm9
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,15,16,72,4                      ; movss         0x4(%rax),%xmm1
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  243,68,15,16,80,12                  ; movss         0xc(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,24                  ; movss         0x18(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,36                  ; movss         0x24(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,88,194                        ; addps         %xmm10,%xmm0
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  243,68,15,16,80,16                  ; movss         0x10(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,28                  ; movss         0x1c(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,40                  ; movss         0x28(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  65,15,88,202                        ; addps         %xmm10,%xmm1
-  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,32                  ; movss         0x20(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  243,68,15,16,104,44                 ; movss         0x2c(%rax),%xmm13
-  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
-  DB  68,15,89,226                        ; mulps         %xmm2,%xmm12
-  DB  69,15,88,229                        ; addps         %xmm13,%xmm12
-  DB  69,15,89,217                        ; mulps         %xmm9,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,210                        ; movaps        %xmm10,%xmm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_perspective_sse41
-_sk_matrix_perspective_sse41 LABEL PROC
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,68,15,16,72,4                   ; movss         0x4(%rax),%xmm9
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  69,15,88,202                        ; addps         %xmm10,%xmm9
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
-  DB  243,68,15,16,72,12                  ; movss         0xc(%rax),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  243,68,15,16,80,16                  ; movss         0x10(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  69,15,88,202                        ; addps         %xmm10,%xmm9
-  DB  243,68,15,16,80,24                  ; movss         0x18(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,28                  ; movss         0x1c(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,32                  ; movss         0x20(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  68,15,89,217                        ; mulps         %xmm1,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,83,202                        ; rcpps         %xmm10,%xmm1
-  DB  15,89,193                           ; mulps         %xmm1,%xmm0
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,201                        ; movaps        %xmm9,%xmm1
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_linear_gradient_2stops_sse41
-_sk_linear_gradient_2stops_sse41 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  68,15,16,8                          ; movups        (%rax),%xmm9
-  DB  15,16,88,16                         ; movups        0x10(%rax),%xmm3
-  DB  68,15,40,195                        ; movaps        %xmm3,%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,40,201                        ; movaps        %xmm9,%xmm1
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
-  DB  68,15,88,193                        ; addps         %xmm1,%xmm8
-  DB  15,40,203                           ; movaps        %xmm3,%xmm1
-  DB  15,198,201,85                       ; shufps        $0x55,%xmm1,%xmm1
-  DB  65,15,40,209                        ; movaps        %xmm9,%xmm2
-  DB  15,198,210,85                       ; shufps        $0x55,%xmm2,%xmm2
-  DB  15,89,200                           ; mulps         %xmm0,%xmm1
-  DB  15,88,202                           ; addps         %xmm2,%xmm1
-  DB  15,40,211                           ; movaps        %xmm3,%xmm2
-  DB  15,198,210,170                      ; shufps        $0xaa,%xmm2,%xmm2
-  DB  69,15,40,209                        ; movaps        %xmm9,%xmm10
-  DB  69,15,198,210,170                   ; shufps        $0xaa,%xmm10,%xmm10
-  DB  15,89,208                           ; mulps         %xmm0,%xmm2
-  DB  65,15,88,210                        ; addps         %xmm10,%xmm2
-  DB  15,198,219,255                      ; shufps        $0xff,%xmm3,%xmm3
-  DB  69,15,198,201,255                   ; shufps        $0xff,%xmm9,%xmm9
-  DB  15,89,216                           ; mulps         %xmm0,%xmm3
-  DB  65,15,88,217                        ; addps         %xmm9,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_start_pipeline_sse2
-_sk_start_pipeline_sse2 LABEL PROC
-  DB  65,87                               ; push          %r15
-  DB  65,86                               ; push          %r14
-  DB  65,85                               ; push          %r13
-  DB  65,84                               ; push          %r12
-  DB  86                                  ; push          %rsi
-  DB  87                                  ; push          %rdi
-  DB  83                                  ; push          %rbx
-  DB  72,129,236,160,0,0,0                ; sub           $0xa0,%rsp
-  DB  68,15,41,188,36,144,0,0,0           ; movaps        %xmm15,0x90(%rsp)
-  DB  68,15,41,180,36,128,0,0,0           ; movaps        %xmm14,0x80(%rsp)
-  DB  68,15,41,108,36,112                 ; movaps        %xmm13,0x70(%rsp)
-  DB  68,15,41,100,36,96                  ; movaps        %xmm12,0x60(%rsp)
-  DB  68,15,41,92,36,80                   ; movaps        %xmm11,0x50(%rsp)
-  DB  68,15,41,84,36,64                   ; movaps        %xmm10,0x40(%rsp)
-  DB  68,15,41,76,36,48                   ; movaps        %xmm9,0x30(%rsp)
-  DB  68,15,41,68,36,32                   ; movaps        %xmm8,0x20(%rsp)
-  DB  15,41,124,36,16                     ; movaps        %xmm7,0x10(%rsp)
-  DB  15,41,52,36                         ; movaps        %xmm6,(%rsp)
-  DB  77,137,207                          ; mov           %r9,%r15
-  DB  77,137,198                          ; mov           %r8,%r14
-  DB  72,137,203                          ; mov           %rcx,%rbx
-  DB  72,137,214                          ; mov           %rdx,%rsi
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  73,137,196                          ; mov           %rax,%r12
-  DB  73,137,245                          ; mov           %rsi,%r13
-  DB  72,141,67,4                         ; lea           0x4(%rbx),%rax
-  DB  76,57,248                           ; cmp           %r15,%rax
-  DB  118,5                               ; jbe           73 <_sk_start_pipeline_sse2+0x73>
-  DB  72,137,216                          ; mov           %rbx,%rax
-  DB  235,52                              ; jmp           a7 <_sk_start_pipeline_sse2+0xa7>
-  DB  15,87,192                           ; xorps         %xmm0,%xmm0
-  DB  15,87,201                           ; xorps         %xmm1,%xmm1
-  DB  15,87,210                           ; xorps         %xmm2,%xmm2
-  DB  15,87,219                           ; xorps         %xmm3,%xmm3
-  DB  15,87,228                           ; xorps         %xmm4,%xmm4
-  DB  15,87,237                           ; xorps         %xmm5,%xmm5
-  DB  15,87,246                           ; xorps         %xmm6,%xmm6
-  DB  15,87,255                           ; xorps         %xmm7,%xmm7
-  DB  72,137,223                          ; mov           %rbx,%rdi
-  DB  76,137,238                          ; mov           %r13,%rsi
-  DB  76,137,242                          ; mov           %r14,%rdx
-  DB  65,255,212                          ; callq         *%r12
-  DB  72,141,67,4                         ; lea           0x4(%rbx),%rax
-  DB  72,131,195,8                        ; add           $0x8,%rbx
-  DB  76,57,251                           ; cmp           %r15,%rbx
-  DB  72,137,195                          ; mov           %rax,%rbx
-  DB  118,204                             ; jbe           73 <_sk_start_pipeline_sse2+0x73>
-  DB  15,40,52,36                         ; movaps        (%rsp),%xmm6
-  DB  15,40,124,36,16                     ; movaps        0x10(%rsp),%xmm7
-  DB  68,15,40,68,36,32                   ; movaps        0x20(%rsp),%xmm8
-  DB  68,15,40,76,36,48                   ; movaps        0x30(%rsp),%xmm9
-  DB  68,15,40,84,36,64                   ; movaps        0x40(%rsp),%xmm10
-  DB  68,15,40,92,36,80                   ; movaps        0x50(%rsp),%xmm11
-  DB  68,15,40,100,36,96                  ; movaps        0x60(%rsp),%xmm12
-  DB  68,15,40,108,36,112                 ; movaps        0x70(%rsp),%xmm13
-  DB  68,15,40,180,36,128,0,0,0           ; movaps        0x80(%rsp),%xmm14
-  DB  68,15,40,188,36,144,0,0,0           ; movaps        0x90(%rsp),%xmm15
-  DB  72,129,196,160,0,0,0                ; add           $0xa0,%rsp
-  DB  91                                  ; pop           %rbx
-  DB  95                                  ; pop           %rdi
-  DB  94                                  ; pop           %rsi
-  DB  65,92                               ; pop           %r12
-  DB  65,93                               ; pop           %r13
-  DB  65,94                               ; pop           %r14
-  DB  65,95                               ; pop           %r15
-  DB  195                                 ; retq
-
-PUBLIC _sk_just_return_sse2
-_sk_just_return_sse2 LABEL PROC
-  DB  195                                 ; retq
-
-PUBLIC _sk_seed_shader_sse2
-_sk_seed_shader_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  102,15,110,199                      ; movd          %edi,%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  15,91,200                           ; cvtdq2ps      %xmm0,%xmm1
-  DB  243,15,16,18                        ; movss         (%rdx),%xmm2
-  DB  243,15,16,90,4                      ; movss         0x4(%rdx),%xmm3
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  15,88,203                           ; addps         %xmm3,%xmm1
-  DB  15,16,66,20                         ; movups        0x14(%rdx),%xmm0
-  DB  15,88,193                           ; addps         %xmm1,%xmm0
-  DB  102,15,110,8                        ; movd          (%rax),%xmm1
-  DB  102,15,112,201,0                    ; pshufd        $0x0,%xmm1,%xmm1
-  DB  15,91,201                           ; cvtdq2ps      %xmm1,%xmm1
-  DB  15,88,203                           ; addps         %xmm3,%xmm1
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,87,219                           ; xorps         %xmm3,%xmm3
-  DB  15,87,228                           ; xorps         %xmm4,%xmm4
-  DB  15,87,237                           ; xorps         %xmm5,%xmm5
-  DB  15,87,246                           ; xorps         %xmm6,%xmm6
-  DB  15,87,255                           ; xorps         %xmm7,%xmm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_constant_color_sse2
-_sk_constant_color_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,16,24                            ; movups        (%rax),%xmm3
-  DB  15,40,195                           ; movaps        %xmm3,%xmm0
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  15,40,203                           ; movaps        %xmm3,%xmm1
-  DB  15,198,201,85                       ; shufps        $0x55,%xmm1,%xmm1
-  DB  15,40,211                           ; movaps        %xmm3,%xmm2
-  DB  15,198,210,170                      ; shufps        $0xaa,%xmm2,%xmm2
-  DB  15,198,219,255                      ; shufps        $0xff,%xmm3,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clear_sse2
-_sk_clear_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,87,192                           ; xorps         %xmm0,%xmm0
-  DB  15,87,201                           ; xorps         %xmm1,%xmm1
-  DB  15,87,210                           ; xorps         %xmm2,%xmm2
-  DB  15,87,219                           ; xorps         %xmm3,%xmm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_plus__sse2
-_sk_plus__sse2 LABEL PROC
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_srcover_sse2
-_sk_srcover_sse2 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,92,195                        ; subps         %xmm3,%xmm8
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,204                        ; mulps         %xmm4,%xmm9
-  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,205                        ; mulps         %xmm5,%xmm9
-  DB  65,15,88,201                        ; addps         %xmm9,%xmm1
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,206                        ; mulps         %xmm6,%xmm9
-  DB  65,15,88,209                        ; addps         %xmm9,%xmm2
-  DB  68,15,89,199                        ; mulps         %xmm7,%xmm8
-  DB  65,15,88,216                        ; addps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_dstover_sse2
-_sk_dstover_sse2 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,92,199                        ; subps         %xmm7,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_0_sse2
-_sk_clamp_0_sse2 LABEL PROC
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  65,15,95,192                        ; maxps         %xmm8,%xmm0
-  DB  65,15,95,200                        ; maxps         %xmm8,%xmm1
-  DB  65,15,95,208                        ; maxps         %xmm8,%xmm2
-  DB  65,15,95,216                        ; maxps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_1_sse2
-_sk_clamp_1_sse2 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,93,192                        ; minps         %xmm8,%xmm0
-  DB  65,15,93,200                        ; minps         %xmm8,%xmm1
-  DB  65,15,93,208                        ; minps         %xmm8,%xmm2
-  DB  65,15,93,216                        ; minps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_a_sse2
-_sk_clamp_a_sse2 LABEL PROC
-  DB  243,68,15,16,2                      ; movss         (%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,93,216                        ; minps         %xmm8,%xmm3
-  DB  15,93,195                           ; minps         %xmm3,%xmm0
-  DB  15,93,203                           ; minps         %xmm3,%xmm1
-  DB  15,93,211                           ; minps         %xmm3,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_set_rgb_sse2
-_sk_set_rgb_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,15,16,72,4                      ; movss         0x4(%rax),%xmm1
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  243,15,16,80,8                      ; movss         0x8(%rax),%xmm2
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_rb_sse2
-_sk_swap_rb_sse2 LABEL PROC
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,194                           ; movaps        %xmm2,%xmm0
-  DB  65,15,40,208                        ; movaps        %xmm8,%xmm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_swap_sse2
-_sk_swap_sse2 LABEL PROC
-  DB  68,15,40,195                        ; movaps        %xmm3,%xmm8
-  DB  68,15,40,202                        ; movaps        %xmm2,%xmm9
-  DB  68,15,40,209                        ; movaps        %xmm1,%xmm10
-  DB  68,15,40,216                        ; movaps        %xmm0,%xmm11
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,196                           ; movaps        %xmm4,%xmm0
-  DB  15,40,205                           ; movaps        %xmm5,%xmm1
-  DB  15,40,214                           ; movaps        %xmm6,%xmm2
-  DB  15,40,223                           ; movaps        %xmm7,%xmm3
-  DB  65,15,40,227                        ; movaps        %xmm11,%xmm4
-  DB  65,15,40,234                        ; movaps        %xmm10,%xmm5
-  DB  65,15,40,241                        ; movaps        %xmm9,%xmm6
-  DB  65,15,40,248                        ; movaps        %xmm8,%xmm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_src_dst_sse2
-_sk_move_src_dst_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,224                           ; movaps        %xmm0,%xmm4
-  DB  15,40,233                           ; movaps        %xmm1,%xmm5
-  DB  15,40,242                           ; movaps        %xmm2,%xmm6
-  DB  15,40,251                           ; movaps        %xmm3,%xmm7
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_move_dst_src_sse2
-_sk_move_dst_src_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,196                           ; movaps        %xmm4,%xmm0
-  DB  15,40,205                           ; movaps        %xmm5,%xmm1
-  DB  15,40,214                           ; movaps        %xmm6,%xmm2
-  DB  15,40,223                           ; movaps        %xmm7,%xmm3
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_premul_sse2
-_sk_premul_sse2 LABEL PROC
-  DB  15,89,195                           ; mulps         %xmm3,%xmm0
-  DB  15,89,203                           ; mulps         %xmm3,%xmm1
-  DB  15,89,211                           ; mulps         %xmm3,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_unpremul_sse2
-_sk_unpremul_sse2 LABEL PROC
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  68,15,194,195,0                     ; cmpeqps       %xmm3,%xmm8
-  DB  243,68,15,16,10                     ; movss         (%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  68,15,94,203                        ; divps         %xmm3,%xmm9
-  DB  69,15,85,193                        ; andnps        %xmm9,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_from_srgb_sse2
-_sk_from_srgb_sse2 LABEL PROC
-  DB  243,68,15,16,66,64                  ; movss         0x40(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  69,15,40,232                        ; movaps        %xmm8,%xmm13
-  DB  68,15,89,232                        ; mulps         %xmm0,%xmm13
-  DB  68,15,40,224                        ; movaps        %xmm0,%xmm12
-  DB  69,15,89,228                        ; mulps         %xmm12,%xmm12
-  DB  243,68,15,16,74,60                  ; movss         0x3c(%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  243,68,15,16,82,52                  ; movss         0x34(%rdx),%xmm10
-  DB  243,68,15,16,90,56                  ; movss         0x38(%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,40,241                        ; movaps        %xmm9,%xmm14
-  DB  68,15,89,240                        ; mulps         %xmm0,%xmm14
-  DB  69,15,88,243                        ; addps         %xmm11,%xmm14
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  69,15,89,244                        ; mulps         %xmm12,%xmm14
-  DB  69,15,88,242                        ; addps         %xmm10,%xmm14
-  DB  243,68,15,16,98,68                  ; movss         0x44(%rdx),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  65,15,194,196,1                     ; cmpltps       %xmm12,%xmm0
-  DB  68,15,84,232                        ; andps         %xmm0,%xmm13
-  DB  65,15,85,198                        ; andnps        %xmm14,%xmm0
-  DB  65,15,86,197                        ; orps          %xmm13,%xmm0
-  DB  69,15,40,232                        ; movaps        %xmm8,%xmm13
-  DB  68,15,89,233                        ; mulps         %xmm1,%xmm13
-  DB  68,15,40,241                        ; movaps        %xmm1,%xmm14
-  DB  69,15,89,246                        ; mulps         %xmm14,%xmm14
-  DB  69,15,40,249                        ; movaps        %xmm9,%xmm15
-  DB  68,15,89,249                        ; mulps         %xmm1,%xmm15
-  DB  69,15,88,251                        ; addps         %xmm11,%xmm15
-  DB  69,15,89,254                        ; mulps         %xmm14,%xmm15
-  DB  69,15,88,250                        ; addps         %xmm10,%xmm15
-  DB  65,15,194,204,1                     ; cmpltps       %xmm12,%xmm1
-  DB  68,15,84,233                        ; andps         %xmm1,%xmm13
-  DB  65,15,85,207                        ; andnps        %xmm15,%xmm1
-  DB  65,15,86,205                        ; orps          %xmm13,%xmm1
-  DB  68,15,89,194                        ; mulps         %xmm2,%xmm8
-  DB  68,15,40,234                        ; movaps        %xmm2,%xmm13
-  DB  69,15,89,237                        ; mulps         %xmm13,%xmm13
-  DB  68,15,89,202                        ; mulps         %xmm2,%xmm9
-  DB  69,15,88,203                        ; addps         %xmm11,%xmm9
-  DB  69,15,89,205                        ; mulps         %xmm13,%xmm9
-  DB  69,15,88,202                        ; addps         %xmm10,%xmm9
-  DB  65,15,194,212,1                     ; cmpltps       %xmm12,%xmm2
-  DB  68,15,84,194                        ; andps         %xmm2,%xmm8
-  DB  65,15,85,209                        ; andnps        %xmm9,%xmm2
-  DB  65,15,86,208                        ; orps          %xmm8,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_to_srgb_sse2
-_sk_to_srgb_sse2 LABEL PROC
-  DB  72,131,236,40                       ; sub           $0x28,%rsp
-  DB  15,41,124,36,16                     ; movaps        %xmm7,0x10(%rsp)
-  DB  15,41,52,36                         ; movaps        %xmm6,(%rsp)
-  DB  15,40,245                           ; movaps        %xmm5,%xmm6
-  DB  15,40,236                           ; movaps        %xmm4,%xmm5
-  DB  15,40,227                           ; movaps        %xmm3,%xmm4
-  DB  68,15,82,192                        ; rsqrtps       %xmm0,%xmm8
-  DB  69,15,83,232                        ; rcpps         %xmm8,%xmm13
-  DB  69,15,82,248                        ; rsqrtps       %xmm8,%xmm15
-  DB  243,15,16,26                        ; movss         (%rdx),%xmm3
-  DB  243,68,15,16,66,72                  ; movss         0x48(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  69,15,40,240                        ; movaps        %xmm8,%xmm14
-  DB  68,15,89,240                        ; mulps         %xmm0,%xmm14
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  243,68,15,16,82,76                  ; movss         0x4c(%rdx),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,90,80                  ; movss         0x50(%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,98,84                  ; movss         0x54(%rdx),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  69,15,89,235                        ; mulps         %xmm11,%xmm13
-  DB  69,15,88,236                        ; addps         %xmm12,%xmm13
-  DB  69,15,89,250                        ; mulps         %xmm10,%xmm15
-  DB  69,15,88,253                        ; addps         %xmm13,%xmm15
-  DB  68,15,40,203                        ; movaps        %xmm3,%xmm9
-  DB  69,15,93,207                        ; minps         %xmm15,%xmm9
-  DB  243,68,15,16,106,88                 ; movss         0x58(%rdx),%xmm13
-  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
-  DB  65,15,194,197,1                     ; cmpltps       %xmm13,%xmm0
-  DB  68,15,84,240                        ; andps         %xmm0,%xmm14
-  DB  65,15,85,193                        ; andnps        %xmm9,%xmm0
-  DB  65,15,86,198                        ; orps          %xmm14,%xmm0
-  DB  68,15,82,201                        ; rsqrtps       %xmm1,%xmm9
-  DB  69,15,83,241                        ; rcpps         %xmm9,%xmm14
-  DB  69,15,82,201                        ; rsqrtps       %xmm9,%xmm9
-  DB  69,15,89,243                        ; mulps         %xmm11,%xmm14
-  DB  69,15,88,244                        ; addps         %xmm12,%xmm14
-  DB  69,15,89,202                        ; mulps         %xmm10,%xmm9
-  DB  69,15,88,206                        ; addps         %xmm14,%xmm9
-  DB  68,15,40,243                        ; movaps        %xmm3,%xmm14
-  DB  69,15,93,241                        ; minps         %xmm9,%xmm14
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  65,15,194,205,1                     ; cmpltps       %xmm13,%xmm1
-  DB  68,15,84,201                        ; andps         %xmm1,%xmm9
-  DB  65,15,85,206                        ; andnps        %xmm14,%xmm1
-  DB  65,15,86,201                        ; orps          %xmm9,%xmm1
-  DB  68,15,82,202                        ; rsqrtps       %xmm2,%xmm9
-  DB  69,15,83,241                        ; rcpps         %xmm9,%xmm14
-  DB  69,15,89,243                        ; mulps         %xmm11,%xmm14
-  DB  69,15,88,244                        ; addps         %xmm12,%xmm14
-  DB  65,15,82,249                        ; rsqrtps       %xmm9,%xmm7
-  DB  65,15,89,250                        ; mulps         %xmm10,%xmm7
-  DB  65,15,88,254                        ; addps         %xmm14,%xmm7
-  DB  15,93,223                           ; minps         %xmm7,%xmm3
-  DB  68,15,89,194                        ; mulps         %xmm2,%xmm8
-  DB  65,15,194,213,1                     ; cmpltps       %xmm13,%xmm2
-  DB  68,15,84,194                        ; andps         %xmm2,%xmm8
-  DB  15,85,211                           ; andnps        %xmm3,%xmm2
-  DB  65,15,86,208                        ; orps          %xmm8,%xmm2
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,40,220                           ; movaps        %xmm4,%xmm3
-  DB  15,40,229                           ; movaps        %xmm5,%xmm4
-  DB  15,40,238                           ; movaps        %xmm6,%xmm5
-  DB  15,40,52,36                         ; movaps        (%rsp),%xmm6
-  DB  15,40,124,36,16                     ; movaps        0x10(%rsp),%xmm7
-  DB  72,131,196,40                       ; add           $0x28,%rsp
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_1_float_sse2
-_sk_scale_1_float_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_scale_u8_sse2
-_sk_scale_u8_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,110,4,56                  ; movd          (%rax,%rdi,1),%xmm8
-  DB  102,69,15,239,201                   ; pxor          %xmm9,%xmm9
-  DB  102,69,15,96,193                    ; punpcklbw     %xmm9,%xmm8
-  DB  102,69,15,97,193                    ; punpcklwd     %xmm9,%xmm8
-  DB  69,15,91,192                        ; cvtdq2ps      %xmm8,%xmm8
-  DB  243,68,15,16,74,12                  ; movss         0xc(%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  65,15,89,217                        ; mulps         %xmm9,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_1_float_sse2
-_sk_lerp_1_float_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  15,92,196                           ; subps         %xmm4,%xmm0
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,92,205                           ; subps         %xmm5,%xmm1
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,92,214                           ; subps         %xmm6,%xmm2
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,92,223                           ; subps         %xmm7,%xmm3
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_u8_sse2
-_sk_lerp_u8_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,110,4,56                  ; movd          (%rax,%rdi,1),%xmm8
-  DB  102,69,15,239,201                   ; pxor          %xmm9,%xmm9
-  DB  102,69,15,96,193                    ; punpcklbw     %xmm9,%xmm8
-  DB  102,69,15,97,193                    ; punpcklwd     %xmm9,%xmm8
-  DB  69,15,91,192                        ; cvtdq2ps      %xmm8,%xmm8
-  DB  243,68,15,16,74,12                  ; movss         0xc(%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  15,92,196                           ; subps         %xmm4,%xmm0
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,92,205                           ; subps         %xmm5,%xmm1
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,92,214                           ; subps         %xmm6,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,92,223                           ; subps         %xmm7,%xmm3
-  DB  65,15,89,217                        ; mulps         %xmm9,%xmm3
-  DB  15,88,223                           ; addps         %xmm7,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_lerp_565_sse2
-_sk_lerp_565_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,126,4,120                 ; movq          (%rax,%rdi,2),%xmm8
-  DB  102,15,239,219                      ; pxor          %xmm3,%xmm3
-  DB  102,68,15,97,195                    ; punpcklwd     %xmm3,%xmm8
-  DB  102,15,110,90,104                   ; movd          0x68(%rdx),%xmm3
-  DB  102,15,112,219,0                    ; pshufd        $0x0,%xmm3,%xmm3
-  DB  102,65,15,219,216                   ; pand          %xmm8,%xmm3
-  DB  68,15,91,203                        ; cvtdq2ps      %xmm3,%xmm9
-  DB  243,15,16,26                        ; movss         (%rdx),%xmm3
-  DB  243,68,15,16,82,116                 ; movss         0x74(%rdx),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  102,68,15,110,74,108                ; movd          0x6c(%rdx),%xmm9
-  DB  102,69,15,112,201,0                 ; pshufd        $0x0,%xmm9,%xmm9
-  DB  102,69,15,219,200                   ; pand          %xmm8,%xmm9
-  DB  69,15,91,201                        ; cvtdq2ps      %xmm9,%xmm9
-  DB  243,68,15,16,90,120                 ; movss         0x78(%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,89,217                        ; mulps         %xmm9,%xmm11
-  DB  102,68,15,110,74,112                ; movd          0x70(%rdx),%xmm9
-  DB  102,69,15,112,201,0                 ; pshufd        $0x0,%xmm9,%xmm9
-  DB  102,69,15,219,200                   ; pand          %xmm8,%xmm9
-  DB  69,15,91,193                        ; cvtdq2ps      %xmm9,%xmm8
-  DB  243,68,15,16,74,124                 ; movss         0x7c(%rdx),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  15,92,196                           ; subps         %xmm4,%xmm0
-  DB  65,15,89,194                        ; mulps         %xmm10,%xmm0
-  DB  15,88,196                           ; addps         %xmm4,%xmm0
-  DB  15,92,205                           ; subps         %xmm5,%xmm1
-  DB  65,15,89,203                        ; mulps         %xmm11,%xmm1
-  DB  15,88,205                           ; addps         %xmm5,%xmm1
-  DB  15,92,214                           ; subps         %xmm6,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  15,88,214                           ; addps         %xmm6,%xmm2
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_tables_sse2
-_sk_load_tables_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,8                            ; mov           (%rax),%rcx
-  DB  76,139,64,8                         ; mov           0x8(%rax),%r8
-  DB  243,68,15,111,4,185                 ; movdqu        (%rcx,%rdi,4),%xmm8
-  DB  102,15,110,66,16                    ; movd          0x10(%rdx),%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
-  DB  102,65,15,114,209,8                 ; psrld         $0x8,%xmm9
-  DB  102,68,15,219,200                   ; pand          %xmm0,%xmm9
-  DB  102,69,15,111,208                   ; movdqa        %xmm8,%xmm10
-  DB  102,65,15,114,210,16                ; psrld         $0x10,%xmm10
-  DB  102,68,15,219,208                   ; pand          %xmm0,%xmm10
-  DB  102,65,15,219,192                   ; pand          %xmm8,%xmm0
-  DB  102,15,112,216,78                   ; pshufd        $0x4e,%xmm0,%xmm3
-  DB  102,72,15,126,217                   ; movq          %xmm3,%rcx
-  DB  65,137,201                          ; mov           %ecx,%r9d
-  DB  72,193,233,32                       ; shr           $0x20,%rcx
-  DB  102,73,15,126,194                   ; movq          %xmm0,%r10
-  DB  69,137,211                          ; mov           %r10d,%r11d
-  DB  73,193,234,32                       ; shr           $0x20,%r10
-  DB  243,67,15,16,28,144                 ; movss         (%r8,%r10,4),%xmm3
-  DB  243,65,15,16,4,136                  ; movss         (%r8,%rcx,4),%xmm0
-  DB  15,20,216                           ; unpcklps      %xmm0,%xmm3
-  DB  243,67,15,16,4,152                  ; movss         (%r8,%r11,4),%xmm0
-  DB  243,67,15,16,12,136                 ; movss         (%r8,%r9,4),%xmm1
-  DB  15,20,193                           ; unpcklps      %xmm1,%xmm0
-  DB  15,20,195                           ; unpcklps      %xmm3,%xmm0
-  DB  72,139,72,16                        ; mov           0x10(%rax),%rcx
-  DB  102,65,15,112,201,78                ; pshufd        $0x4e,%xmm9,%xmm1
-  DB  102,73,15,126,200                   ; movq          %xmm1,%r8
-  DB  69,137,193                          ; mov           %r8d,%r9d
-  DB  73,193,232,32                       ; shr           $0x20,%r8
-  DB  102,77,15,126,202                   ; movq          %xmm9,%r10
-  DB  69,137,211                          ; mov           %r10d,%r11d
-  DB  73,193,234,32                       ; shr           $0x20,%r10
-  DB  243,66,15,16,28,145                 ; movss         (%rcx,%r10,4),%xmm3
-  DB  243,66,15,16,12,129                 ; movss         (%rcx,%r8,4),%xmm1
-  DB  15,20,217                           ; unpcklps      %xmm1,%xmm3
-  DB  243,66,15,16,12,153                 ; movss         (%rcx,%r11,4),%xmm1
-  DB  243,66,15,16,20,137                 ; movss         (%rcx,%r9,4),%xmm2
-  DB  15,20,202                           ; unpcklps      %xmm2,%xmm1
-  DB  15,20,203                           ; unpcklps      %xmm3,%xmm1
-  DB  72,139,64,24                        ; mov           0x18(%rax),%rax
-  DB  102,65,15,112,210,78                ; pshufd        $0x4e,%xmm10,%xmm2
-  DB  102,72,15,126,209                   ; movq          %xmm2,%rcx
-  DB  65,137,200                          ; mov           %ecx,%r8d
-  DB  72,193,233,32                       ; shr           $0x20,%rcx
-  DB  102,77,15,126,209                   ; movq          %xmm10,%r9
-  DB  69,137,202                          ; mov           %r9d,%r10d
-  DB  73,193,233,32                       ; shr           $0x20,%r9
-  DB  243,70,15,16,12,136                 ; movss         (%rax,%r9,4),%xmm9
-  DB  243,15,16,20,136                    ; movss         (%rax,%rcx,4),%xmm2
-  DB  68,15,20,202                        ; unpcklps      %xmm2,%xmm9
-  DB  243,66,15,16,20,144                 ; movss         (%rax,%r10,4),%xmm2
-  DB  243,66,15,16,28,128                 ; movss         (%rax,%r8,4),%xmm3
-  DB  15,20,211                           ; unpcklps      %xmm3,%xmm2
-  DB  65,15,20,209                        ; unpcklps      %xmm9,%xmm2
-  DB  102,65,15,114,208,24                ; psrld         $0x18,%xmm8
-  DB  69,15,91,192                        ; cvtdq2ps      %xmm8,%xmm8
-  DB  243,15,16,90,12                     ; movss         0xc(%rdx),%xmm3
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_a8_sse2
-_sk_load_a8_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,15,110,4,56                     ; movd          (%rax,%rdi,1),%xmm0
-  DB  102,15,239,201                      ; pxor          %xmm1,%xmm1
-  DB  102,15,96,193                       ; punpcklbw     %xmm1,%xmm0
-  DB  102,15,97,193                       ; punpcklwd     %xmm1,%xmm0
-  DB  15,91,192                           ; cvtdq2ps      %xmm0,%xmm0
-  DB  243,15,16,90,12                     ; movss         0xc(%rdx),%xmm3
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  15,89,216                           ; mulps         %xmm0,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  15,87,192                           ; xorps         %xmm0,%xmm0
-  DB  102,15,239,201                      ; pxor          %xmm1,%xmm1
-  DB  15,87,210                           ; xorps         %xmm2,%xmm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_a8_sse2
-_sk_store_a8_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,16,66,8                   ; movss         0x8(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,69,15,91,192                    ; cvtps2dq      %xmm8,%xmm8
-  DB  102,65,15,114,240,16                ; pslld         $0x10,%xmm8
-  DB  102,65,15,114,224,16                ; psrad         $0x10,%xmm8
-  DB  102,69,15,107,192                   ; packssdw      %xmm8,%xmm8
-  DB  102,69,15,103,192                   ; packuswb      %xmm8,%xmm8
-  DB  102,68,15,126,4,56                  ; movd          %xmm8,(%rax,%rdi,1)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_565_sse2
-_sk_load_565_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,126,12,120                ; movq          (%rax,%rdi,2),%xmm9
-  DB  102,15,239,192                      ; pxor          %xmm0,%xmm0
-  DB  102,68,15,97,200                    ; punpcklwd     %xmm0,%xmm9
-  DB  102,15,110,66,104                   ; movd          0x68(%rdx),%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  102,65,15,219,193                   ; pand          %xmm9,%xmm0
-  DB  15,91,200                           ; cvtdq2ps      %xmm0,%xmm1
-  DB  243,15,16,26                        ; movss         (%rdx),%xmm3
-  DB  243,15,16,66,116                    ; movss         0x74(%rdx),%xmm0
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  15,89,193                           ; mulps         %xmm1,%xmm0
-  DB  102,15,110,74,108                   ; movd          0x6c(%rdx),%xmm1
-  DB  102,15,112,201,0                    ; pshufd        $0x0,%xmm1,%xmm1
-  DB  102,65,15,219,201                   ; pand          %xmm9,%xmm1
-  DB  68,15,91,193                        ; cvtdq2ps      %xmm1,%xmm8
-  DB  243,15,16,74,120                    ; movss         0x78(%rdx),%xmm1
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  102,15,110,82,112                   ; movd          0x70(%rdx),%xmm2
-  DB  102,15,112,210,0                    ; pshufd        $0x0,%xmm2,%xmm2
-  DB  102,65,15,219,209                   ; pand          %xmm9,%xmm2
-  DB  68,15,91,194                        ; cvtdq2ps      %xmm2,%xmm8
-  DB  243,15,16,82,124                    ; movss         0x7c(%rdx),%xmm2
-  DB  15,198,210,0                        ; shufps        $0x0,%xmm2,%xmm2
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  15,198,219,0                        ; shufps        $0x0,%xmm3,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_565_sse2
-_sk_store_565_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,16,130,128,0,0,0          ; movss         0x80(%rdx),%xmm8
-  DB  243,68,15,16,138,132,0,0,0          ; movss         0x84(%rdx),%xmm9
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  69,15,40,208                        ; movaps        %xmm8,%xmm10
-  DB  68,15,89,208                        ; mulps         %xmm0,%xmm10
-  DB  102,69,15,91,210                    ; cvtps2dq      %xmm10,%xmm10
-  DB  102,65,15,114,242,11                ; pslld         $0xb,%xmm10
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  102,69,15,91,201                    ; cvtps2dq      %xmm9,%xmm9
-  DB  102,65,15,114,241,5                 ; pslld         $0x5,%xmm9
-  DB  102,69,15,235,202                   ; por           %xmm10,%xmm9
-  DB  68,15,89,194                        ; mulps         %xmm2,%xmm8
-  DB  102,69,15,91,192                    ; cvtps2dq      %xmm8,%xmm8
-  DB  102,69,15,86,193                    ; orpd          %xmm9,%xmm8
-  DB  102,65,15,114,240,16                ; pslld         $0x10,%xmm8
-  DB  102,65,15,114,224,16                ; psrad         $0x10,%xmm8
-  DB  102,69,15,107,192                   ; packssdw      %xmm8,%xmm8
-  DB  102,68,15,214,4,120                 ; movq          %xmm8,(%rax,%rdi,2)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_8888_sse2
-_sk_load_8888_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,15,111,28,184                   ; movdqu        (%rax,%rdi,4),%xmm3
-  DB  102,15,110,66,16                    ; movd          0x10(%rdx),%xmm0
-  DB  102,15,112,192,0                    ; pshufd        $0x0,%xmm0,%xmm0
-  DB  102,15,111,203                      ; movdqa        %xmm3,%xmm1
-  DB  102,15,114,209,8                    ; psrld         $0x8,%xmm1
-  DB  102,15,219,200                      ; pand          %xmm0,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,15,114,210,16                   ; psrld         $0x10,%xmm2
-  DB  102,15,219,208                      ; pand          %xmm0,%xmm2
-  DB  102,15,219,195                      ; pand          %xmm3,%xmm0
-  DB  15,91,192                           ; cvtdq2ps      %xmm0,%xmm0
-  DB  243,68,15,16,66,12                  ; movss         0xc(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  15,91,201                           ; cvtdq2ps      %xmm1,%xmm1
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  15,91,210                           ; cvtdq2ps      %xmm2,%xmm2
-  DB  65,15,89,208                        ; mulps         %xmm8,%xmm2
-  DB  102,15,114,211,24                   ; psrld         $0x18,%xmm3
-  DB  15,91,219                           ; cvtdq2ps      %xmm3,%xmm3
-  DB  65,15,89,216                        ; mulps         %xmm8,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_8888_sse2
-_sk_store_8888_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,68,15,16,66,8                   ; movss         0x8(%rdx),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,200                        ; mulps         %xmm0,%xmm9
-  DB  102,69,15,91,201                    ; cvtps2dq      %xmm9,%xmm9
-  DB  69,15,40,208                        ; movaps        %xmm8,%xmm10
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  102,69,15,91,210                    ; cvtps2dq      %xmm10,%xmm10
-  DB  102,65,15,114,242,8                 ; pslld         $0x8,%xmm10
-  DB  102,69,15,235,209                   ; por           %xmm9,%xmm10
-  DB  69,15,40,200                        ; movaps        %xmm8,%xmm9
-  DB  68,15,89,202                        ; mulps         %xmm2,%xmm9
-  DB  102,69,15,91,201                    ; cvtps2dq      %xmm9,%xmm9
-  DB  102,65,15,114,241,16                ; pslld         $0x10,%xmm9
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,69,15,91,192                    ; cvtps2dq      %xmm8,%xmm8
-  DB  102,65,15,114,240,24                ; pslld         $0x18,%xmm8
-  DB  102,69,15,235,193                   ; por           %xmm9,%xmm8
-  DB  102,69,15,235,194                   ; por           %xmm10,%xmm8
-  DB  243,68,15,127,4,184                 ; movdqu        %xmm8,(%rax,%rdi,4)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_load_f16_sse2
-_sk_load_f16_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  243,15,111,4,248                    ; movdqu        (%rax,%rdi,8),%xmm0
-  DB  243,15,111,76,248,16                ; movdqu        0x10(%rax,%rdi,8),%xmm1
-  DB  102,15,111,208                      ; movdqa        %xmm0,%xmm2
-  DB  102,15,97,209                       ; punpcklwd     %xmm1,%xmm2
-  DB  102,15,105,193                      ; punpckhwd     %xmm1,%xmm0
-  DB  102,68,15,111,194                   ; movdqa        %xmm2,%xmm8
-  DB  102,68,15,97,192                    ; punpcklwd     %xmm0,%xmm8
-  DB  102,15,105,208                      ; punpckhwd     %xmm0,%xmm2
-  DB  102,15,110,66,100                   ; movd          0x64(%rdx),%xmm0
-  DB  102,15,112,216,0                    ; pshufd        $0x0,%xmm0,%xmm3
-  DB  102,15,111,203                      ; movdqa        %xmm3,%xmm1
-  DB  102,65,15,101,200                   ; pcmpgtw       %xmm8,%xmm1
-  DB  102,65,15,223,200                   ; pandn         %xmm8,%xmm1
-  DB  102,15,101,218                      ; pcmpgtw       %xmm2,%xmm3
-  DB  102,15,223,218                      ; pandn         %xmm2,%xmm3
-  DB  102,69,15,239,192                   ; pxor          %xmm8,%xmm8
-  DB  102,15,111,193                      ; movdqa        %xmm1,%xmm0
-  DB  102,65,15,97,192                    ; punpcklwd     %xmm8,%xmm0
-  DB  102,15,114,240,13                   ; pslld         $0xd,%xmm0
-  DB  102,15,110,82,92                    ; movd          0x5c(%rdx),%xmm2
-  DB  102,68,15,112,202,0                 ; pshufd        $0x0,%xmm2,%xmm9
-  DB  65,15,89,193                        ; mulps         %xmm9,%xmm0
-  DB  102,65,15,105,200                   ; punpckhwd     %xmm8,%xmm1
-  DB  102,15,114,241,13                   ; pslld         $0xd,%xmm1
-  DB  65,15,89,201                        ; mulps         %xmm9,%xmm1
-  DB  102,15,111,211                      ; movdqa        %xmm3,%xmm2
-  DB  102,65,15,97,208                    ; punpcklwd     %xmm8,%xmm2
-  DB  102,15,114,242,13                   ; pslld         $0xd,%xmm2
-  DB  65,15,89,209                        ; mulps         %xmm9,%xmm2
-  DB  102,65,15,105,216                   ; punpckhwd     %xmm8,%xmm3
-  DB  102,15,114,243,13                   ; pslld         $0xd,%xmm3
-  DB  65,15,89,217                        ; mulps         %xmm9,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_f16_sse2
-_sk_store_f16_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  102,68,15,110,66,96                 ; movd          0x60(%rdx),%xmm8
-  DB  102,69,15,112,192,0                 ; pshufd        $0x0,%xmm8,%xmm8
-  DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
-  DB  68,15,89,200                        ; mulps         %xmm0,%xmm9
-  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
-  DB  102,69,15,111,208                   ; movdqa        %xmm8,%xmm10
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  102,65,15,114,210,13                ; psrld         $0xd,%xmm10
-  DB  102,69,15,111,216                   ; movdqa        %xmm8,%xmm11
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  102,65,15,114,211,13                ; psrld         $0xd,%xmm11
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
-  DB  102,65,15,115,250,2                 ; pslldq        $0x2,%xmm10
-  DB  102,69,15,235,209                   ; por           %xmm9,%xmm10
-  DB  102,65,15,115,248,2                 ; pslldq        $0x2,%xmm8
-  DB  102,69,15,235,195                   ; por           %xmm11,%xmm8
-  DB  102,69,15,111,202                   ; movdqa        %xmm10,%xmm9
-  DB  102,69,15,98,200                    ; punpckldq     %xmm8,%xmm9
-  DB  243,68,15,127,12,248                ; movdqu        %xmm9,(%rax,%rdi,8)
-  DB  102,69,15,106,208                   ; punpckhdq     %xmm8,%xmm10
-  DB  243,68,15,127,84,248,16             ; movdqu        %xmm10,0x10(%rax,%rdi,8)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_store_f32_sse2
-_sk_store_f32_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  72,139,0                            ; mov           (%rax),%rax
-  DB  72,137,249                          ; mov           %rdi,%rcx
-  DB  72,193,225,4                        ; shl           $0x4,%rcx
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  68,15,40,200                        ; movaps        %xmm0,%xmm9
-  DB  68,15,20,201                        ; unpcklps      %xmm1,%xmm9
-  DB  68,15,40,210                        ; movaps        %xmm2,%xmm10
-  DB  68,15,40,218                        ; movaps        %xmm2,%xmm11
-  DB  68,15,20,219                        ; unpcklps      %xmm3,%xmm11
-  DB  68,15,21,193                        ; unpckhps      %xmm1,%xmm8
-  DB  68,15,21,211                        ; unpckhps      %xmm3,%xmm10
-  DB  69,15,40,225                        ; movaps        %xmm9,%xmm12
-  DB  102,69,15,20,227                    ; unpcklpd      %xmm11,%xmm12
-  DB  102,69,15,21,203                    ; unpckhpd      %xmm11,%xmm9
-  DB  69,15,40,216                        ; movaps        %xmm8,%xmm11
-  DB  102,69,15,20,218                    ; unpcklpd      %xmm10,%xmm11
-  DB  102,69,15,21,194                    ; unpckhpd      %xmm10,%xmm8
-  DB  102,68,15,17,36,8                   ; movupd        %xmm12,(%rax,%rcx,1)
-  DB  102,68,15,17,76,8,16                ; movupd        %xmm9,0x10(%rax,%rcx,1)
-  DB  102,68,15,17,92,8,32                ; movupd        %xmm11,0x20(%rax,%rcx,1)
-  DB  102,68,15,17,68,8,48                ; movupd        %xmm8,0x30(%rax,%rcx,1)
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_x_sse2
-_sk_clamp_x_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  68,15,95,192                        ; maxps         %xmm0,%xmm8
-  DB  243,68,15,16,8                      ; movss         (%rax),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  102,15,118,192                      ; pcmpeqd       %xmm0,%xmm0
-  DB  102,65,15,254,193                   ; paddd         %xmm9,%xmm0
-  DB  68,15,93,192                        ; minps         %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_clamp_y_sse2
-_sk_clamp_y_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  69,15,87,192                        ; xorps         %xmm8,%xmm8
-  DB  68,15,95,193                        ; maxps         %xmm1,%xmm8
-  DB  243,68,15,16,8                      ; movss         (%rax),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  102,15,118,201                      ; pcmpeqd       %xmm1,%xmm1
-  DB  102,65,15,254,201                   ; paddd         %xmm9,%xmm1
-  DB  68,15,93,193                        ; minps         %xmm1,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,200                        ; movaps        %xmm8,%xmm1
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_x_sse2
-_sk_repeat_x_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,40,200                        ; movaps        %xmm0,%xmm9
-  DB  69,15,94,200                        ; divps         %xmm8,%xmm9
-  DB  243,69,15,91,209                    ; cvttps2dq     %xmm9,%xmm10
-  DB  69,15,91,210                        ; cvtdq2ps      %xmm10,%xmm10
-  DB  69,15,194,202,1                     ; cmpltps       %xmm10,%xmm9
-  DB  243,68,15,16,26                     ; movss         (%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,84,217                        ; andps         %xmm9,%xmm11
-  DB  69,15,92,211                        ; subps         %xmm11,%xmm10
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  65,15,92,194                        ; subps         %xmm10,%xmm0
-  DB  102,69,15,118,201                   ; pcmpeqd       %xmm9,%xmm9
-  DB  102,69,15,254,200                   ; paddd         %xmm8,%xmm9
-  DB  65,15,93,193                        ; minps         %xmm9,%xmm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_repeat_y_sse2
-_sk_repeat_y_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,0                      ; movss         (%rax),%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  68,15,40,201                        ; movaps        %xmm1,%xmm9
-  DB  69,15,94,200                        ; divps         %xmm8,%xmm9
-  DB  243,69,15,91,209                    ; cvttps2dq     %xmm9,%xmm10
-  DB  69,15,91,210                        ; cvtdq2ps      %xmm10,%xmm10
-  DB  69,15,194,202,1                     ; cmpltps       %xmm10,%xmm9
-  DB  243,68,15,16,26                     ; movss         (%rdx),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,84,217                        ; andps         %xmm9,%xmm11
-  DB  69,15,92,211                        ; subps         %xmm11,%xmm10
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  65,15,92,202                        ; subps         %xmm10,%xmm1
-  DB  102,69,15,118,201                   ; pcmpeqd       %xmm9,%xmm9
-  DB  102,69,15,254,200                   ; paddd         %xmm8,%xmm9
-  DB  65,15,93,201                        ; minps         %xmm9,%xmm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_x_sse2
-_sk_mirror_x_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,8                      ; movss         (%rax),%xmm9
-  DB  69,15,40,193                        ; movaps        %xmm9,%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,92,192                        ; subps         %xmm8,%xmm0
-  DB  243,69,15,88,201                    ; addss         %xmm9,%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  68,15,40,208                        ; movaps        %xmm0,%xmm10
-  DB  69,15,94,209                        ; divps         %xmm9,%xmm10
-  DB  243,69,15,91,218                    ; cvttps2dq     %xmm10,%xmm11
-  DB  69,15,91,219                        ; cvtdq2ps      %xmm11,%xmm11
-  DB  69,15,194,211,1                     ; cmpltps       %xmm11,%xmm10
-  DB  243,68,15,16,34                     ; movss         (%rdx),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  69,15,84,226                        ; andps         %xmm10,%xmm12
-  DB  69,15,87,210                        ; xorps         %xmm10,%xmm10
-  DB  69,15,92,220                        ; subps         %xmm12,%xmm11
-  DB  69,15,89,217                        ; mulps         %xmm9,%xmm11
-  DB  65,15,92,195                        ; subps         %xmm11,%xmm0
-  DB  65,15,92,192                        ; subps         %xmm8,%xmm0
-  DB  68,15,92,208                        ; subps         %xmm0,%xmm10
-  DB  65,15,84,194                        ; andps         %xmm10,%xmm0
-  DB  102,69,15,118,201                   ; pcmpeqd       %xmm9,%xmm9
-  DB  102,69,15,254,200                   ; paddd         %xmm8,%xmm9
-  DB  65,15,93,193                        ; minps         %xmm9,%xmm0
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_mirror_y_sse2
-_sk_mirror_y_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,68,15,16,8                      ; movss         (%rax),%xmm9
-  DB  69,15,40,193                        ; movaps        %xmm9,%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,92,200                        ; subps         %xmm8,%xmm1
-  DB  243,69,15,88,201                    ; addss         %xmm9,%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  68,15,40,209                        ; movaps        %xmm1,%xmm10
-  DB  69,15,94,209                        ; divps         %xmm9,%xmm10
-  DB  243,69,15,91,218                    ; cvttps2dq     %xmm10,%xmm11
-  DB  69,15,91,219                        ; cvtdq2ps      %xmm11,%xmm11
-  DB  69,15,194,211,1                     ; cmpltps       %xmm11,%xmm10
-  DB  243,68,15,16,34                     ; movss         (%rdx),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  69,15,84,226                        ; andps         %xmm10,%xmm12
-  DB  69,15,87,210                        ; xorps         %xmm10,%xmm10
-  DB  69,15,92,220                        ; subps         %xmm12,%xmm11
-  DB  69,15,89,217                        ; mulps         %xmm9,%xmm11
-  DB  65,15,92,203                        ; subps         %xmm11,%xmm1
-  DB  65,15,92,200                        ; subps         %xmm8,%xmm1
-  DB  68,15,92,209                        ; subps         %xmm1,%xmm10
-  DB  65,15,84,202                        ; andps         %xmm10,%xmm1
-  DB  102,69,15,118,201                   ; pcmpeqd       %xmm9,%xmm9
-  DB  102,69,15,254,200                   ; paddd         %xmm8,%xmm9
-  DB  65,15,93,201                        ; minps         %xmm9,%xmm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_2x3_sse2
-_sk_matrix_2x3_sse2 LABEL PROC
-  DB  68,15,40,201                        ; movaps        %xmm1,%xmm9
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,15,16,72,4                      ; movss         0x4(%rax),%xmm1
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,16                  ; movss         0x10(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,88,194                        ; addps         %xmm10,%xmm0
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  243,68,15,16,80,12                  ; movss         0xc(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  65,15,88,202                        ; addps         %xmm10,%xmm1
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_3x4_sse2
-_sk_matrix_3x4_sse2 LABEL PROC
-  DB  68,15,40,201                        ; movaps        %xmm1,%xmm9
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,15,16,72,4                      ; movss         0x4(%rax),%xmm1
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  243,68,15,16,80,12                  ; movss         0xc(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,24                  ; movss         0x18(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,36                  ; movss         0x24(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,88,194                        ; addps         %xmm10,%xmm0
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  243,68,15,16,80,16                  ; movss         0x10(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,28                  ; movss         0x1c(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,40                  ; movss         0x28(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,209                        ; mulps         %xmm9,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,89,200                        ; mulps         %xmm8,%xmm1
-  DB  65,15,88,202                        ; addps         %xmm10,%xmm1
-  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,32                  ; movss         0x20(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  243,68,15,16,104,44                 ; movss         0x2c(%rax),%xmm13
-  DB  69,15,198,237,0                     ; shufps        $0x0,%xmm13,%xmm13
-  DB  68,15,89,226                        ; mulps         %xmm2,%xmm12
-  DB  69,15,88,229                        ; addps         %xmm13,%xmm12
-  DB  69,15,89,217                        ; mulps         %xmm9,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,210                        ; movaps        %xmm10,%xmm2
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_matrix_perspective_sse2
-_sk_matrix_perspective_sse2 LABEL PROC
-  DB  68,15,40,192                        ; movaps        %xmm0,%xmm8
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  243,15,16,0                         ; movss         (%rax),%xmm0
-  DB  243,68,15,16,72,4                   ; movss         0x4(%rax),%xmm9
-  DB  15,198,192,0                        ; shufps        $0x0,%xmm0,%xmm0
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  243,68,15,16,80,8                   ; movss         0x8(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  69,15,88,202                        ; addps         %xmm10,%xmm9
-  DB  65,15,89,192                        ; mulps         %xmm8,%xmm0
-  DB  65,15,88,193                        ; addps         %xmm9,%xmm0
-  DB  243,68,15,16,72,12                  ; movss         0xc(%rax),%xmm9
-  DB  69,15,198,201,0                     ; shufps        $0x0,%xmm9,%xmm9
-  DB  243,68,15,16,80,16                  ; movss         0x10(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,20                  ; movss         0x14(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  69,15,89,200                        ; mulps         %xmm8,%xmm9
-  DB  69,15,88,202                        ; addps         %xmm10,%xmm9
-  DB  243,68,15,16,80,24                  ; movss         0x18(%rax),%xmm10
-  DB  69,15,198,210,0                     ; shufps        $0x0,%xmm10,%xmm10
-  DB  243,68,15,16,88,28                  ; movss         0x1c(%rax),%xmm11
-  DB  69,15,198,219,0                     ; shufps        $0x0,%xmm11,%xmm11
-  DB  243,68,15,16,96,32                  ; movss         0x20(%rax),%xmm12
-  DB  69,15,198,228,0                     ; shufps        $0x0,%xmm12,%xmm12
-  DB  68,15,89,217                        ; mulps         %xmm1,%xmm11
-  DB  69,15,88,220                        ; addps         %xmm12,%xmm11
-  DB  69,15,89,208                        ; mulps         %xmm8,%xmm10
-  DB  69,15,88,211                        ; addps         %xmm11,%xmm10
-  DB  65,15,83,202                        ; rcpps         %xmm10,%xmm1
-  DB  15,89,193                           ; mulps         %xmm1,%xmm0
-  DB  68,15,89,201                        ; mulps         %xmm1,%xmm9
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,201                        ; movaps        %xmm9,%xmm1
-  DB  255,224                             ; jmpq          *%rax
-
-PUBLIC _sk_linear_gradient_2stops_sse2
-_sk_linear_gradient_2stops_sse2 LABEL PROC
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  68,15,16,8                          ; movups        (%rax),%xmm9
-  DB  15,16,88,16                         ; movups        0x10(%rax),%xmm3
-  DB  68,15,40,195                        ; movaps        %xmm3,%xmm8
-  DB  69,15,198,192,0                     ; shufps        $0x0,%xmm8,%xmm8
-  DB  65,15,40,201                        ; movaps        %xmm9,%xmm1
-  DB  15,198,201,0                        ; shufps        $0x0,%xmm1,%xmm1
-  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
-  DB  68,15,88,193                        ; addps         %xmm1,%xmm8
-  DB  15,40,203                           ; movaps        %xmm3,%xmm1
-  DB  15,198,201,85                       ; shufps        $0x55,%xmm1,%xmm1
-  DB  65,15,40,209                        ; movaps        %xmm9,%xmm2
-  DB  15,198,210,85                       ; shufps        $0x55,%xmm2,%xmm2
-  DB  15,89,200                           ; mulps         %xmm0,%xmm1
-  DB  15,88,202                           ; addps         %xmm2,%xmm1
-  DB  15,40,211                           ; movaps        %xmm3,%xmm2
-  DB  15,198,210,170                      ; shufps        $0xaa,%xmm2,%xmm2
-  DB  69,15,40,209                        ; movaps        %xmm9,%xmm10
-  DB  69,15,198,210,170                   ; shufps        $0xaa,%xmm10,%xmm10
-  DB  15,89,208                           ; mulps         %xmm0,%xmm2
-  DB  65,15,88,210                        ; addps         %xmm10,%xmm2
-  DB  15,198,219,255                      ; shufps        $0xff,%xmm3,%xmm3
-  DB  69,15,198,201,255                   ; shufps        $0xff,%xmm9,%xmm9
-  DB  15,89,216                           ; mulps         %xmm0,%xmm3
-  DB  65,15,88,217                        ; addps         %xmm9,%xmm3
-  DB  72,173                              ; lods          %ds:(%rsi),%rax
-  DB  65,15,40,192                        ; movaps        %xmm8,%xmm0
-  DB  255,224                             ; jmpq          *%rax
-END
index 4a04779..118291d 100755 (executable)
@@ -71,15 +71,7 @@ subprocess.check_call(clang + cflags + vfp4 +
                       ['-c', 'src/jumper/SkJumper_stages.cpp'] +
                       ['-o', 'vfp4.o'])
 
-def parse_object_file(dot_o, directive, target=None):
-  globl, label, comment = '.globl', ':', '// '
-  if 'win' in dot_o:
-    globl, label, comment = 'PUBLIC', ' LABEL PROC', '; '
-
-  dehex = lambda h: '0x'+h
-  if directive != '.long':
-    dehex = lambda h: str(int(h, 16))
-
+def parse_object_file(dot_o, array_type, target=None):
   cmd = [objdump]
   if target:
     cmd += ['--target', target]
@@ -92,6 +84,7 @@ def parse_object_file(dot_o, directive, target=None):
       assert section not in section_headers
 
   # Ok.  Let's disassemble.
+  active = False
   disassemble = ['-d', '--insn-width=9', dot_o]
   for line in subprocess.check_output(cmd + disassemble).split('\n'):
     line = line.strip()
@@ -102,9 +95,11 @@ def parse_object_file(dot_o, directive, target=None):
     # E.g. 00000000000003a4 <_load_f16>:
     m = re.match('''[0-9a-f]+ <_?(.*)>:''', line)
     if m:
+      if active:
+        print '};'
       print
-      print globl + ' _' + m.group(1)
-      print '_' + m.group(1) + label
+      print 'CODE const', array_type, m.group(1) + '[] = {'
+      active = True
       continue
 
     columns = line.split('\t')
@@ -118,50 +113,54 @@ def parse_object_file(dot_o, directive, target=None):
         inst, args = columns[2].split(' ', 1)
     code, inst, args = code.strip(), inst.strip(), args.strip()
 
-    hexed = ','.join(dehex(x) for x in code.split(' '))
-    print '  ' + directive + '  ' + hexed + ' '*(36-len(hexed)) + \
-          comment + inst  + (' '*(14-len(inst)) + args if args else '')
-
-sys.stdout = open('src/jumper/SkJumper_generated.S', 'w')
-
-print '''# Copyright 2017 Google Inc.
-#
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-# This file is generated semi-automatically with this command:
-#   $ src/jumper/build_stages.py
+    dehex = lambda x: '0x'+x
+    if array_type == 'uint8_t':
+      dehex = lambda x: str(int(x, 16))
+
+    hexed = ''.join(dehex(x) + ',' for x in code.split(' '))
+    print '  ' + hexed + ' '*(40-len(hexed)) + \
+          '//' + inst  + (' '*(14-len(inst)) + args if args else '')
+  print '};'
+
+sys.stdout = open('src/jumper/SkJumper_generated.cpp', 'w')
+
+print '''/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+// This file is generated semi-automatically with this command:
+//   $ src/jumper/build_stages.py
+
+#include <stdint.h>
+
+#if defined(_MSC_VER)
+    #pragma section("code", read,execute)
+    #define CODE extern "C" __declspec(allocate("code"))
+#elif defined(__MACH__)
+    #define CODE extern "C" __attribute__((section("__TEXT,__text")))
+#else
+    #define CODE extern "C" __attribute__((section(".text")))
+#endif
 '''
-print '.text'
-
 print '#if defined(__aarch64__)'
-print '.balign 4'
-parse_object_file('aarch64.o', '.long')
+parse_object_file('aarch64.o', 'uint32_t')
 
 print '#elif defined(__arm__)'
-print '.balign 4'
-parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
+parse_object_file('vfp4.o', 'uint32_t', target='elf32-littlearm')
 
 print '#elif defined(__x86_64__)'
-parse_object_file('hsw.o',   '.byte')
-parse_object_file('avx.o',   '.byte')
-parse_object_file('sse41.o', '.byte')
-parse_object_file('sse2.o',  '.byte')
-print '#endif'
+parse_object_file('hsw.o',   'uint8_t')
+parse_object_file('avx.o',   'uint8_t')
+parse_object_file('sse41.o', 'uint8_t')
+parse_object_file('sse2.o',  'uint8_t')
 
-sys.stdout = open('src/jumper/SkJumper_generated_win.S', 'w')
+print '#elif defined(_M_X64)'
+parse_object_file('win_hsw.o',   'uint8_t')
+parse_object_file('win_avx.o',   'uint8_t')
+parse_object_file('win_sse41.o', 'uint8_t')
+parse_object_file('win_sse2.o',  'uint8_t')
 
-print '''; Copyright 2017 Google Inc.
-;
-; Use of this source code is governed by a BSD-style license that can be
-; found in the LICENSE file.
-
-; This file is generated semi-automatically with this command:
-;   $ src/jumper/build_stages.py
-'''
-print '_text SEGMENT'
-parse_object_file('win_hsw.o',   'DB')
-parse_object_file('win_avx.o',   'DB')
-parse_object_file('win_sse41.o', 'DB')
-parse_object_file('win_sse2.o',  'DB')
-print 'END'
+print '#endif'