Add AVX to the SkJumper mix.
author Mike Klein <mtklein@chromium.org>
Thu, 23 Feb 2017 13:04:49 +0000 (08:04 -0500)
committer Mike Klein <mtklein@chromium.org>
Thu, 23 Feb 2017 13:37:39 +0000 (13:37 +0000)
AVX is a nice little halfway point between SSE4.1 and HSW in terms
of available instructions, performance, and hardware availability.

Intel chips have had AVX since ~2011, compared to ~2013 for HSW and
~2007 for SSE4.1.  Like HSW it's got 8-wide 256-bit float vectors,
but integer (and double) operations are essentially still only 128-bit.
It also doesn't have F16 conversion or FMA instructions.
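
Concretely, a float op is one 8-wide instruction, but an integer op
has to run as two 128-bit halves stitched back together with
vinsertf128, as the generated code below does constantly.  A rough
intrinsics sketch of the difference (illustrative only, not code from
this CL):

   #include <immintrin.h>

   // Floats: one 8-wide instruction on AVX.
   __m256 scale(__m256 v, __m256 k) {
       return _mm256_mul_ps(v, k);                    // vmulps
   }

   // Integers: no 256-bit vpsrld until AVX2, so shift each 128-bit
   // half separately and reassemble.
   __m256i shift_right_8(__m256i v) {
       __m128i lo = _mm_srli_epi32(_mm256_castsi256_si128  (v   ), 8),
               hi = _mm_srli_epi32(_mm256_extractf128_si256(v, 1), 8);
       return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
   }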

It doesn't look like this is going to be a burden to maintain, and only
adds a few KB of code size.  In exchange, we now run 8x wide on 45% to
70% of x86 machines, depending on the OS.

In my brief testing, speed eerily resembles an exact geometric progression:
   SSE4.1:        1x speed (baseline)
      AVX: ~sqrt(2)x speed
      HSW:       ~2x speed

This adds all the basic plumbing for AVX but leaves it disabled.
I'll flip it on once I've implemented the f16 TODOs.
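
The f16 TODOs are nontrivial precisely because AVX has no
vcvtph2ps/vcvtps2ph: each half float has to be widened by hand.  As a
rough illustration of the bit manipulation involved, here's a scalar
sketch that flushes subnormals to zero and ignores NaN and infinity;
the real stages will differ:

   #include <cstdint>
   #include <cstring>

   // binary16: 1 sign | 5 exponent (bias 15)  | 10 mantissa
   // binary32: 1 sign | 8 exponent (bias 127) | 23 mantissa
   static float half_to_float(uint16_t h) {
       uint32_t sign = uint32_t(h >> 15) << 31,
                em   = h & 0x7fff;            // exponent and mantissa bits
       if (em < 0x0400) { em = 0; }           // flush +/-0 and subnormals
       uint32_t bits = em ? sign | ((em << 13) + ((127 - 15) << 23))
                          : sign;
       float f;
       std::memcpy(&f, &bits, sizeof f);
       return f;
   }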

Change-Id: I1c378dabb8a06386646371bf78ade9e9432b006f
Reviewed-on: https://skia-review.googlesource.com/8898
Reviewed-by: Mike Klein <mtklein@chromium.org>
Commit-Queue: Mike Klein <mtklein@chromium.org>

src/jumper/SkJumper.cpp
src/jumper/SkJumper_generated.S
src/jumper/SkJumper_generated_win.S
src/jumper/SkJumper_stages.cpp
src/jumper/build_stages.py

index 18a5f02..b5271a6 100644
@@ -106,16 +106,21 @@ extern "C" {
 
 #elif defined(__x86_64__) || defined(_M_X64)
     size_t ASM(start_pipeline,hsw  )(size_t, void**, K*, size_t);
+    size_t ASM(start_pipeline,avx  )(size_t, void**, K*, size_t);
     size_t ASM(start_pipeline,sse41)(size_t, void**, K*, size_t);
     size_t ASM(start_pipeline,sse2 )(size_t, void**, K*, size_t);
 
     StageFn ASM(just_return,hsw),
+            ASM(just_return,avx),
             ASM(just_return,sse41),
             ASM(just_return,sse2);
 
     #define M(st) StageFn ASM(st,hsw);
         STAGES(M)
     #undef M
+    #define M(st) StageFn ASM(st,avx);
+        STAGES(M)
+    #undef M
     #define M(st) StageFn ASM(st,sse41);
         STAGES(M)
     #undef M
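
Those #define M(st) ... STAGES(M) ... #undef M blocks are an X-macro:
STAGES invokes its argument once per stage name, so each block stamps
out one declaration per stage for a given instruction set, and the
lookup functions below reuse the same trick to stamp out switch cases.
A stripped-down sketch of the pattern, using a hypothetical
three-stage list rather than Skia's real one:

   #define STAGES(M) M(seed_shader) M(srcover) M(store_8888)

   // One stub definition per stage for the "avx" instruction set.
   #define M(st) static void sk_##st##_avx(void) { /* stage body */ }
       STAGES(M)
   #undef M

   // Map an enum to the matching function pointer, one case per stage.
   enum class Stage { seed_shader, srcover, store_8888 };
   using StageFn = void(void);
   static StageFn* lookup_avx(Stage st) {
       switch (st) {
           default: return nullptr;
       #define M(st) case Stage::st: return sk_##st##_avx;
           STAGES(M)
       #undef M
       }
   }
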
@@ -170,6 +175,18 @@ extern "C" {
         #undef M
         }
     }
+    static StageFn* lookup_avx(SkRasterPipeline::StockStage st) {
+        switch (st) {
+            default:
+        #ifdef WHATS_NEXT
+                gMissing[st]++;
+        #endif
+                return nullptr;
+        #define M(st) case SkRasterPipeline::st: return ASM(st,avx);
+            STAGES(M)
+        #undef M
+        }
+    }
     static StageFn* lookup_sse41(SkRasterPipeline::StockStage st) {
         switch (st) {
             default:
@@ -259,6 +276,11 @@ bool SkRasterPipeline::run_with_jumper(size_t x, size_t n) const {
             return false;
         }
     }
+    if (0 && SkCpu::Supports(SkCpu::AVX)) {
+        if (!build_and_run(8, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) {
+            return false;
+        }
+    }
     if (1 && SkCpu::Supports(SkCpu::SSE41)) {
         if (!build_and_run(4, lookup_sse41, ASM(just_return,sse41), ASM(start_pipeline,sse41))) {
             return false;
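
run_with_jumper tries instruction sets from widest to narrowest and
takes the first one the CPU supports.  The 0 && above is the off
switch: the AVX code still compiles and links, but the branch is dead
until I flip it on, which is a one-character change:

   if (1 && SkCpu::Supports(SkCpu::AVX)) {   // was: 0 &&
       if (!build_and_run(8, lookup_avx, ASM(just_return,avx), ASM(start_pipeline,avx))) {
           return false;
       }
   }
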
index 5d7ec00..25bfc1b 100644
@@ -1854,6 +1854,674 @@ _sk_linear_gradient_2stops_hsw:
   .byte  0xc5,0x7c,0x29,0xc0                             // vmovaps       %ymm8,%ymm0
   .byte  0xff,0xe0                                       // jmpq          *%rax
 
+.globl _sk_start_pipeline_avx
+_sk_start_pipeline_avx:
+  .byte  0x41,0x57                                       // push          %r15
+  .byte  0x41,0x56                                       // push          %r14
+  .byte  0x41,0x55                                       // push          %r13
+  .byte  0x41,0x54                                       // push          %r12
+  .byte  0x53                                            // push          %rbx
+  .byte  0x49,0x89,0xcf                                  // mov           %rcx,%r15
+  .byte  0x49,0x89,0xd6                                  // mov           %rdx,%r14
+  .byte  0x48,0x89,0xfb                                  // mov           %rdi,%rbx
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x49,0x89,0xc4                                  // mov           %rax,%r12
+  .byte  0x49,0x89,0xf5                                  // mov           %rsi,%r13
+  .byte  0x48,0x8d,0x43,0x08                             // lea           0x8(%rbx),%rax
+  .byte  0x4c,0x39,0xf8                                  // cmp           %r15,%rax
+  .byte  0x76,0x05                                       // jbe           28 <_sk_start_pipeline_avx+0x28>
+  .byte  0x48,0x89,0xd8                                  // mov           %rbx,%rax
+  .byte  0xeb,0x3c                                       // jmp           64 <_sk_start_pipeline_avx+0x64>
+  .byte  0xc5,0xfc,0x57,0xc0                             // vxorps        %ymm0,%ymm0,%ymm0
+  .byte  0xc5,0xf4,0x57,0xc9                             // vxorps        %ymm1,%ymm1,%ymm1
+  .byte  0xc5,0xec,0x57,0xd2                             // vxorps        %ymm2,%ymm2,%ymm2
+  .byte  0xc5,0xe4,0x57,0xdb                             // vxorps        %ymm3,%ymm3,%ymm3
+  .byte  0xc5,0xdc,0x57,0xe4                             // vxorps        %ymm4,%ymm4,%ymm4
+  .byte  0xc5,0xd4,0x57,0xed                             // vxorps        %ymm5,%ymm5,%ymm5
+  .byte  0xc5,0xcc,0x57,0xf6                             // vxorps        %ymm6,%ymm6,%ymm6
+  .byte  0xc5,0xc4,0x57,0xff                             // vxorps        %ymm7,%ymm7,%ymm7
+  .byte  0x48,0x89,0xdf                                  // mov           %rbx,%rdi
+  .byte  0x4c,0x89,0xee                                  // mov           %r13,%rsi
+  .byte  0x4c,0x89,0xf2                                  // mov           %r14,%rdx
+  .byte  0x41,0xff,0xd4                                  // callq         *%r12
+  .byte  0x48,0x8d,0x43,0x08                             // lea           0x8(%rbx),%rax
+  .byte  0x48,0x83,0xc3,0x10                             // add           $0x10,%rbx
+  .byte  0x4c,0x39,0xfb                                  // cmp           %r15,%rbx
+  .byte  0x48,0x89,0xc3                                  // mov           %rax,%rbx
+  .byte  0x76,0xc4                                       // jbe           28 <_sk_start_pipeline_avx+0x28>
+  .byte  0x5b                                            // pop           %rbx
+  .byte  0x41,0x5c                                       // pop           %r12
+  .byte  0x41,0x5d                                       // pop           %r13
+  .byte  0x41,0x5e                                       // pop           %r14
+  .byte  0x41,0x5f                                       // pop           %r15
+  .byte  0xc5,0xf8,0x77                                  // vzeroupper
+  .byte  0xc3                                            // retq
+
+.globl _sk_just_return_avx
+_sk_just_return_avx:
+  .byte  0xc3                                            // retq
+
+.globl _sk_seed_shader_avx
+_sk_seed_shader_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xf9,0x6e,0xc7                             // vmovd         %edi,%xmm0
+  .byte  0xc4,0xe3,0x79,0x04,0xc0,0x00                   // vpermilps     $0x0,%xmm0,%xmm0
+  .byte  0xc4,0xe3,0x7d,0x18,0xc0,0x01                   // vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  .byte  0xc5,0xfc,0x5b,0xc0                             // vcvtdq2ps     %ymm0,%ymm0
+  .byte  0xc4,0xe2,0x7d,0x18,0x4a,0x04                   // vbroadcastss  0x4(%rdx),%ymm1
+  .byte  0xc5,0xfc,0x58,0xc1                             // vaddps        %ymm1,%ymm0,%ymm0
+  .byte  0xc5,0xfc,0x58,0x42,0x14                        // vaddps        0x14(%rdx),%ymm0,%ymm0
+  .byte  0xc5,0xf9,0x6e,0x10                             // vmovd         (%rax),%xmm2
+  .byte  0xc4,0xe3,0x79,0x04,0xd2,0x00                   // vpermilps     $0x0,%xmm2,%xmm2
+  .byte  0xc4,0xe3,0x6d,0x18,0xd2,0x01                   // vinsertf128   $0x1,%xmm2,%ymm2,%ymm2
+  .byte  0xc5,0xfc,0x5b,0xd2                             // vcvtdq2ps     %ymm2,%ymm2
+  .byte  0xc5,0xec,0x58,0xc9                             // vaddps        %ymm1,%ymm2,%ymm1
+  .byte  0xc4,0xe2,0x7d,0x18,0x12                        // vbroadcastss  (%rdx),%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xe4,0x57,0xdb                             // vxorps        %ymm3,%ymm3,%ymm3
+  .byte  0xc5,0xdc,0x57,0xe4                             // vxorps        %ymm4,%ymm4,%ymm4
+  .byte  0xc5,0xd4,0x57,0xed                             // vxorps        %ymm5,%ymm5,%ymm5
+  .byte  0xc5,0xcc,0x57,0xf6                             // vxorps        %ymm6,%ymm6,%ymm6
+  .byte  0xc5,0xc4,0x57,0xff                             // vxorps        %ymm7,%ymm7,%ymm7
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_constant_color_avx
+_sk_constant_color_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0xe2,0x7d,0x18,0x00                        // vbroadcastss  (%rax),%ymm0
+  .byte  0xc4,0xe2,0x7d,0x18,0x48,0x04                   // vbroadcastss  0x4(%rax),%ymm1
+  .byte  0xc4,0xe2,0x7d,0x18,0x50,0x08                   // vbroadcastss  0x8(%rax),%ymm2
+  .byte  0xc4,0xe2,0x7d,0x18,0x58,0x0c                   // vbroadcastss  0xc(%rax),%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_clear_avx
+_sk_clear_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xfc,0x57,0xc0                             // vxorps        %ymm0,%ymm0,%ymm0
+  .byte  0xc5,0xf4,0x57,0xc9                             // vxorps        %ymm1,%ymm1,%ymm1
+  .byte  0xc5,0xec,0x57,0xd2                             // vxorps        %ymm2,%ymm2,%ymm2
+  .byte  0xc5,0xe4,0x57,0xdb                             // vxorps        %ymm3,%ymm3,%ymm3
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_plus__avx
+_sk_plus__avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xfc,0x58,0xc4                             // vaddps        %ymm4,%ymm0,%ymm0
+  .byte  0xc5,0xf4,0x58,0xcd                             // vaddps        %ymm5,%ymm1,%ymm1
+  .byte  0xc5,0xec,0x58,0xd6                             // vaddps        %ymm6,%ymm2,%ymm2
+  .byte  0xc5,0xe4,0x58,0xdf                             // vaddps        %ymm7,%ymm3,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_srcover_avx
+_sk_srcover_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x02                        // vbroadcastss  (%rdx),%ymm8
+  .byte  0xc5,0x3c,0x5c,0xc3                             // vsubps        %ymm3,%ymm8,%ymm8
+  .byte  0xc5,0x3c,0x59,0xcc                             // vmulps        %ymm4,%ymm8,%ymm9
+  .byte  0xc5,0xb4,0x58,0xc0                             // vaddps        %ymm0,%ymm9,%ymm0
+  .byte  0xc5,0x3c,0x59,0xcd                             // vmulps        %ymm5,%ymm8,%ymm9
+  .byte  0xc5,0xb4,0x58,0xc9                             // vaddps        %ymm1,%ymm9,%ymm1
+  .byte  0xc5,0x3c,0x59,0xce                             // vmulps        %ymm6,%ymm8,%ymm9
+  .byte  0xc5,0xb4,0x58,0xd2                             // vaddps        %ymm2,%ymm9,%ymm2
+  .byte  0xc5,0x3c,0x59,0xc7                             // vmulps        %ymm7,%ymm8,%ymm8
+  .byte  0xc5,0xbc,0x58,0xdb                             // vaddps        %ymm3,%ymm8,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_dstover_avx
+_sk_dstover_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x02                        // vbroadcastss  (%rdx),%ymm8
+  .byte  0xc5,0x3c,0x5c,0xc7                             // vsubps        %ymm7,%ymm8,%ymm8
+  .byte  0xc5,0xbc,0x59,0xc0                             // vmulps        %ymm0,%ymm8,%ymm0
+  .byte  0xc5,0xfc,0x58,0xc4                             // vaddps        %ymm4,%ymm0,%ymm0
+  .byte  0xc5,0xbc,0x59,0xc9                             // vmulps        %ymm1,%ymm8,%ymm1
+  .byte  0xc5,0xf4,0x58,0xcd                             // vaddps        %ymm5,%ymm1,%ymm1
+  .byte  0xc5,0xbc,0x59,0xd2                             // vmulps        %ymm2,%ymm8,%ymm2
+  .byte  0xc5,0xec,0x58,0xd6                             // vaddps        %ymm6,%ymm2,%ymm2
+  .byte  0xc5,0xbc,0x59,0xdb                             // vmulps        %ymm3,%ymm8,%ymm3
+  .byte  0xc5,0xe4,0x58,0xdf                             // vaddps        %ymm7,%ymm3,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_clamp_0_avx
+_sk_clamp_0_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x41,0x3c,0x57,0xc0                        // vxorps        %ymm8,%ymm8,%ymm8
+  .byte  0xc4,0xc1,0x7c,0x5f,0xc0                        // vmaxps        %ymm8,%ymm0,%ymm0
+  .byte  0xc4,0xc1,0x74,0x5f,0xc8                        // vmaxps        %ymm8,%ymm1,%ymm1
+  .byte  0xc4,0xc1,0x6c,0x5f,0xd0                        // vmaxps        %ymm8,%ymm2,%ymm2
+  .byte  0xc4,0xc1,0x64,0x5f,0xd8                        // vmaxps        %ymm8,%ymm3,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_clamp_1_avx
+_sk_clamp_1_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x02                        // vbroadcastss  (%rdx),%ymm8
+  .byte  0xc4,0xc1,0x7c,0x5d,0xc0                        // vminps        %ymm8,%ymm0,%ymm0
+  .byte  0xc4,0xc1,0x74,0x5d,0xc8                        // vminps        %ymm8,%ymm1,%ymm1
+  .byte  0xc4,0xc1,0x6c,0x5d,0xd0                        // vminps        %ymm8,%ymm2,%ymm2
+  .byte  0xc4,0xc1,0x64,0x5d,0xd8                        // vminps        %ymm8,%ymm3,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_clamp_a_avx
+_sk_clamp_a_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x02                        // vbroadcastss  (%rdx),%ymm8
+  .byte  0xc4,0xc1,0x64,0x5d,0xd8                        // vminps        %ymm8,%ymm3,%ymm3
+  .byte  0xc5,0xfc,0x5d,0xc3                             // vminps        %ymm3,%ymm0,%ymm0
+  .byte  0xc5,0xf4,0x5d,0xcb                             // vminps        %ymm3,%ymm1,%ymm1
+  .byte  0xc5,0xec,0x5d,0xd3                             // vminps        %ymm3,%ymm2,%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_set_rgb_avx
+_sk_set_rgb_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0xe2,0x7d,0x18,0x00                        // vbroadcastss  (%rax),%ymm0
+  .byte  0xc4,0xe2,0x7d,0x18,0x48,0x04                   // vbroadcastss  0x4(%rax),%ymm1
+  .byte  0xc4,0xe2,0x7d,0x18,0x50,0x08                   // vbroadcastss  0x8(%rax),%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_swap_rb_avx
+_sk_swap_rb_avx:
+  .byte  0xc5,0x7c,0x28,0xc0                             // vmovaps       %ymm0,%ymm8
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xfc,0x28,0xc2                             // vmovaps       %ymm2,%ymm0
+  .byte  0xc5,0x7c,0x29,0xc2                             // vmovaps       %ymm8,%ymm2
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_swap_avx
+_sk_swap_avx:
+  .byte  0xc5,0x7c,0x28,0xc3                             // vmovaps       %ymm3,%ymm8
+  .byte  0xc5,0x7c,0x28,0xca                             // vmovaps       %ymm2,%ymm9
+  .byte  0xc5,0x7c,0x28,0xd1                             // vmovaps       %ymm1,%ymm10
+  .byte  0xc5,0x7c,0x28,0xd8                             // vmovaps       %ymm0,%ymm11
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xfc,0x28,0xc4                             // vmovaps       %ymm4,%ymm0
+  .byte  0xc5,0xfc,0x28,0xcd                             // vmovaps       %ymm5,%ymm1
+  .byte  0xc5,0xfc,0x28,0xd6                             // vmovaps       %ymm6,%ymm2
+  .byte  0xc5,0xfc,0x28,0xdf                             // vmovaps       %ymm7,%ymm3
+  .byte  0xc5,0x7c,0x29,0xdc                             // vmovaps       %ymm11,%ymm4
+  .byte  0xc5,0x7c,0x29,0xd5                             // vmovaps       %ymm10,%ymm5
+  .byte  0xc5,0x7c,0x29,0xce                             // vmovaps       %ymm9,%ymm6
+  .byte  0xc5,0x7c,0x29,0xc7                             // vmovaps       %ymm8,%ymm7
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_move_src_dst_avx
+_sk_move_src_dst_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xfc,0x28,0xe0                             // vmovaps       %ymm0,%ymm4
+  .byte  0xc5,0xfc,0x28,0xe9                             // vmovaps       %ymm1,%ymm5
+  .byte  0xc5,0xfc,0x28,0xf2                             // vmovaps       %ymm2,%ymm6
+  .byte  0xc5,0xfc,0x28,0xfb                             // vmovaps       %ymm3,%ymm7
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_move_dst_src_avx
+_sk_move_dst_src_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xfc,0x28,0xc4                             // vmovaps       %ymm4,%ymm0
+  .byte  0xc5,0xfc,0x28,0xcd                             // vmovaps       %ymm5,%ymm1
+  .byte  0xc5,0xfc,0x28,0xd6                             // vmovaps       %ymm6,%ymm2
+  .byte  0xc5,0xfc,0x28,0xdf                             // vmovaps       %ymm7,%ymm3
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_premul_avx
+_sk_premul_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0xfc,0x59,0xc3                             // vmulps        %ymm3,%ymm0,%ymm0
+  .byte  0xc5,0xf4,0x59,0xcb                             // vmulps        %ymm3,%ymm1,%ymm1
+  .byte  0xc5,0xec,0x59,0xd3                             // vmulps        %ymm3,%ymm2,%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_unpremul_avx
+_sk_unpremul_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x41,0x3c,0x57,0xc0                        // vxorps        %ymm8,%ymm8,%ymm8
+  .byte  0xc4,0x41,0x64,0xc2,0xc8,0x00                   // vcmpeqps      %ymm8,%ymm3,%ymm9
+  .byte  0xc4,0x62,0x7d,0x18,0x12                        // vbroadcastss  (%rdx),%ymm10
+  .byte  0xc5,0x2c,0x5e,0xd3                             // vdivps        %ymm3,%ymm10,%ymm10
+  .byte  0xc4,0x43,0x2d,0x4a,0xc0,0x90                   // vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
+  .byte  0xc5,0xbc,0x59,0xc0                             // vmulps        %ymm0,%ymm8,%ymm0
+  .byte  0xc5,0xbc,0x59,0xc9                             // vmulps        %ymm1,%ymm8,%ymm1
+  .byte  0xc5,0xbc,0x59,0xd2                             // vmulps        %ymm2,%ymm8,%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_from_srgb_avx
+_sk_from_srgb_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x42,0x40                   // vbroadcastss  0x40(%rdx),%ymm8
+  .byte  0xc5,0x3c,0x59,0xc8                             // vmulps        %ymm0,%ymm8,%ymm9
+  .byte  0xc5,0x7c,0x59,0xd0                             // vmulps        %ymm0,%ymm0,%ymm10
+  .byte  0xc4,0x62,0x7d,0x18,0x5a,0x3c                   // vbroadcastss  0x3c(%rdx),%ymm11
+  .byte  0xc4,0x62,0x7d,0x18,0x62,0x38                   // vbroadcastss  0x38(%rdx),%ymm12
+  .byte  0xc5,0x24,0x59,0xe8                             // vmulps        %ymm0,%ymm11,%ymm13
+  .byte  0xc4,0x41,0x14,0x58,0xec                        // vaddps        %ymm12,%ymm13,%ymm13
+  .byte  0xc4,0x62,0x7d,0x18,0x72,0x34                   // vbroadcastss  0x34(%rdx),%ymm14
+  .byte  0xc4,0x41,0x2c,0x59,0xd5                        // vmulps        %ymm13,%ymm10,%ymm10
+  .byte  0xc4,0x41,0x0c,0x58,0xd2                        // vaddps        %ymm10,%ymm14,%ymm10
+  .byte  0xc4,0x62,0x7d,0x18,0x6a,0x44                   // vbroadcastss  0x44(%rdx),%ymm13
+  .byte  0xc4,0xc1,0x7c,0xc2,0xc5,0x01                   // vcmpltps      %ymm13,%ymm0,%ymm0
+  .byte  0xc4,0xc3,0x2d,0x4a,0xc1,0x00                   // vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
+  .byte  0xc5,0x3c,0x59,0xc9                             // vmulps        %ymm1,%ymm8,%ymm9
+  .byte  0xc5,0x74,0x59,0xd1                             // vmulps        %ymm1,%ymm1,%ymm10
+  .byte  0xc5,0x24,0x59,0xf9                             // vmulps        %ymm1,%ymm11,%ymm15
+  .byte  0xc4,0x41,0x04,0x58,0xfc                        // vaddps        %ymm12,%ymm15,%ymm15
+  .byte  0xc4,0x41,0x2c,0x59,0xd7                        // vmulps        %ymm15,%ymm10,%ymm10
+  .byte  0xc4,0x41,0x0c,0x58,0xd2                        // vaddps        %ymm10,%ymm14,%ymm10
+  .byte  0xc4,0xc1,0x74,0xc2,0xcd,0x01                   // vcmpltps      %ymm13,%ymm1,%ymm1
+  .byte  0xc4,0xc3,0x2d,0x4a,0xc9,0x10                   // vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
+  .byte  0xc5,0x3c,0x59,0xc2                             // vmulps        %ymm2,%ymm8,%ymm8
+  .byte  0xc5,0x6c,0x59,0xca                             // vmulps        %ymm2,%ymm2,%ymm9
+  .byte  0xc5,0x24,0x59,0xd2                             // vmulps        %ymm2,%ymm11,%ymm10
+  .byte  0xc4,0x41,0x2c,0x58,0xd4                        // vaddps        %ymm12,%ymm10,%ymm10
+  .byte  0xc4,0x41,0x34,0x59,0xca                        // vmulps        %ymm10,%ymm9,%ymm9
+  .byte  0xc4,0x41,0x0c,0x58,0xc9                        // vaddps        %ymm9,%ymm14,%ymm9
+  .byte  0xc4,0xc1,0x6c,0xc2,0xd5,0x01                   // vcmpltps      %ymm13,%ymm2,%ymm2
+  .byte  0xc4,0xc3,0x35,0x4a,0xd0,0x20                   // vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_to_srgb_avx
+_sk_to_srgb_avx:
+  .byte  0xc5,0x7c,0x52,0xc0                             // vrsqrtps      %ymm0,%ymm8
+  .byte  0xc4,0x41,0x7c,0x53,0xc8                        // vrcpps        %ymm8,%ymm9
+  .byte  0xc4,0x41,0x7c,0x52,0xd0                        // vrsqrtps      %ymm8,%ymm10
+  .byte  0xc4,0x62,0x7d,0x18,0x42,0x48                   // vbroadcastss  0x48(%rdx),%ymm8
+  .byte  0xc5,0x3c,0x59,0xd8                             // vmulps        %ymm0,%ymm8,%ymm11
+  .byte  0xc4,0x62,0x7d,0x18,0x22                        // vbroadcastss  (%rdx),%ymm12
+  .byte  0xc4,0x62,0x7d,0x18,0x6a,0x4c                   // vbroadcastss  0x4c(%rdx),%ymm13
+  .byte  0xc4,0x62,0x7d,0x18,0x72,0x50                   // vbroadcastss  0x50(%rdx),%ymm14
+  .byte  0xc4,0x62,0x7d,0x18,0x7a,0x54                   // vbroadcastss  0x54(%rdx),%ymm15
+  .byte  0xc4,0x41,0x34,0x59,0xce                        // vmulps        %ymm14,%ymm9,%ymm9
+  .byte  0xc4,0x41,0x34,0x58,0xcf                        // vaddps        %ymm15,%ymm9,%ymm9
+  .byte  0xc4,0x41,0x2c,0x59,0xd5                        // vmulps        %ymm13,%ymm10,%ymm10
+  .byte  0xc4,0x41,0x2c,0x58,0xc9                        // vaddps        %ymm9,%ymm10,%ymm9
+  .byte  0xc4,0x41,0x1c,0x5d,0xc9                        // vminps        %ymm9,%ymm12,%ymm9
+  .byte  0xc4,0x62,0x7d,0x18,0x52,0x58                   // vbroadcastss  0x58(%rdx),%ymm10
+  .byte  0xc4,0xc1,0x7c,0xc2,0xc2,0x01                   // vcmpltps      %ymm10,%ymm0,%ymm0
+  .byte  0xc4,0xc3,0x35,0x4a,0xc3,0x00                   // vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
+  .byte  0xc5,0x7c,0x52,0xc9                             // vrsqrtps      %ymm1,%ymm9
+  .byte  0xc4,0x41,0x7c,0x53,0xd9                        // vrcpps        %ymm9,%ymm11
+  .byte  0xc4,0x41,0x7c,0x52,0xc9                        // vrsqrtps      %ymm9,%ymm9
+  .byte  0xc4,0x41,0x0c,0x59,0xdb                        // vmulps        %ymm11,%ymm14,%ymm11
+  .byte  0xc4,0x41,0x04,0x58,0xdb                        // vaddps        %ymm11,%ymm15,%ymm11
+  .byte  0xc4,0x41,0x14,0x59,0xc9                        // vmulps        %ymm9,%ymm13,%ymm9
+  .byte  0xc4,0x41,0x34,0x58,0xcb                        // vaddps        %ymm11,%ymm9,%ymm9
+  .byte  0xc5,0x3c,0x59,0xd9                             // vmulps        %ymm1,%ymm8,%ymm11
+  .byte  0xc4,0x41,0x1c,0x5d,0xc9                        // vminps        %ymm9,%ymm12,%ymm9
+  .byte  0xc4,0xc1,0x74,0xc2,0xca,0x01                   // vcmpltps      %ymm10,%ymm1,%ymm1
+  .byte  0xc4,0xc3,0x35,0x4a,0xcb,0x10                   // vblendvps     %ymm1,%ymm11,%ymm9,%ymm1
+  .byte  0xc5,0x7c,0x52,0xca                             // vrsqrtps      %ymm2,%ymm9
+  .byte  0xc4,0x41,0x7c,0x53,0xd9                        // vrcpps        %ymm9,%ymm11
+  .byte  0xc4,0x41,0x0c,0x59,0xdb                        // vmulps        %ymm11,%ymm14,%ymm11
+  .byte  0xc4,0x41,0x04,0x58,0xdb                        // vaddps        %ymm11,%ymm15,%ymm11
+  .byte  0xc4,0x41,0x7c,0x52,0xc9                        // vrsqrtps      %ymm9,%ymm9
+  .byte  0xc4,0x41,0x14,0x59,0xc9                        // vmulps        %ymm9,%ymm13,%ymm9
+  .byte  0xc4,0x41,0x34,0x58,0xcb                        // vaddps        %ymm11,%ymm9,%ymm9
+  .byte  0xc4,0x41,0x1c,0x5d,0xc9                        // vminps        %ymm9,%ymm12,%ymm9
+  .byte  0xc5,0x3c,0x59,0xc2                             // vmulps        %ymm2,%ymm8,%ymm8
+  .byte  0xc4,0xc1,0x6c,0xc2,0xd2,0x01                   // vcmpltps      %ymm10,%ymm2,%ymm2
+  .byte  0xc4,0xc3,0x35,0x4a,0xd0,0x20                   // vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_scale_u8_avx
+_sk_scale_u8_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0x8b,0x00                                  // mov           (%rax),%rax
+  .byte  0xc4,0x62,0x79,0x31,0x44,0x38,0x04              // vpmovzxbd     0x4(%rax,%rdi,1),%xmm8
+  .byte  0xc4,0x62,0x79,0x31,0x0c,0x38                   // vpmovzxbd     (%rax,%rdi,1),%xmm9
+  .byte  0xc4,0x43,0x35,0x18,0xc0,0x01                   // vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  .byte  0xc4,0x41,0x7c,0x5b,0xc0                        // vcvtdq2ps     %ymm8,%ymm8
+  .byte  0xc4,0x62,0x7d,0x18,0x4a,0x0c                   // vbroadcastss  0xc(%rdx),%ymm9
+  .byte  0xc4,0x41,0x3c,0x59,0xc1                        // vmulps        %ymm9,%ymm8,%ymm8
+  .byte  0xc5,0xbc,0x59,0xc0                             // vmulps        %ymm0,%ymm8,%ymm0
+  .byte  0xc5,0xbc,0x59,0xc9                             // vmulps        %ymm1,%ymm8,%ymm1
+  .byte  0xc5,0xbc,0x59,0xd2                             // vmulps        %ymm2,%ymm8,%ymm2
+  .byte  0xc5,0xbc,0x59,0xdb                             // vmulps        %ymm3,%ymm8,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_lerp_u8_avx
+_sk_lerp_u8_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0x8b,0x00                                  // mov           (%rax),%rax
+  .byte  0xc4,0x62,0x79,0x31,0x44,0x38,0x04              // vpmovzxbd     0x4(%rax,%rdi,1),%xmm8
+  .byte  0xc4,0x62,0x79,0x31,0x0c,0x38                   // vpmovzxbd     (%rax,%rdi,1),%xmm9
+  .byte  0xc4,0x43,0x35,0x18,0xc0,0x01                   // vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  .byte  0xc4,0x41,0x7c,0x5b,0xc0                        // vcvtdq2ps     %ymm8,%ymm8
+  .byte  0xc4,0x62,0x7d,0x18,0x4a,0x0c                   // vbroadcastss  0xc(%rdx),%ymm9
+  .byte  0xc4,0x41,0x3c,0x59,0xc1                        // vmulps        %ymm9,%ymm8,%ymm8
+  .byte  0xc5,0xfc,0x5c,0xc4                             // vsubps        %ymm4,%ymm0,%ymm0
+  .byte  0xc4,0xc1,0x7c,0x59,0xc0                        // vmulps        %ymm8,%ymm0,%ymm0
+  .byte  0xc5,0xfc,0x58,0xc4                             // vaddps        %ymm4,%ymm0,%ymm0
+  .byte  0xc5,0xf4,0x5c,0xcd                             // vsubps        %ymm5,%ymm1,%ymm1
+  .byte  0xc4,0xc1,0x74,0x59,0xc8                        // vmulps        %ymm8,%ymm1,%ymm1
+  .byte  0xc5,0xf4,0x58,0xcd                             // vaddps        %ymm5,%ymm1,%ymm1
+  .byte  0xc5,0xec,0x5c,0xd6                             // vsubps        %ymm6,%ymm2,%ymm2
+  .byte  0xc4,0xc1,0x6c,0x59,0xd0                        // vmulps        %ymm8,%ymm2,%ymm2
+  .byte  0xc5,0xec,0x58,0xd6                             // vaddps        %ymm6,%ymm2,%ymm2
+  .byte  0xc5,0xe4,0x5c,0xdf                             // vsubps        %ymm7,%ymm3,%ymm3
+  .byte  0xc4,0xc1,0x64,0x59,0xd8                        // vmulps        %ymm8,%ymm3,%ymm3
+  .byte  0xc5,0xe4,0x58,0xdf                             // vaddps        %ymm7,%ymm3,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_load_tables_avx
+_sk_load_tables_avx:
+  .byte  0x41,0x57                                       // push          %r15
+  .byte  0x41,0x56                                       // push          %r14
+  .byte  0x41,0x54                                       // push          %r12
+  .byte  0x53                                            // push          %rbx
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x4c,0x8b,0x00                                  // mov           (%rax),%r8
+  .byte  0x48,0x8b,0x48,0x08                             // mov           0x8(%rax),%rcx
+  .byte  0xc4,0x41,0x7c,0x10,0x14,0xb8                   // vmovups       (%r8,%rdi,4),%ymm10
+  .byte  0xc5,0xf9,0x6e,0x42,0x10                        // vmovd         0x10(%rdx),%xmm0
+  .byte  0xc4,0xe3,0x79,0x04,0xc0,0x00                   // vpermilps     $0x0,%xmm0,%xmm0
+  .byte  0xc4,0x63,0x7d,0x18,0xc8,0x01                   // vinsertf128   $0x1,%xmm0,%ymm0,%ymm9
+  .byte  0xc4,0xc1,0x34,0x54,0xc2                        // vandps        %ymm10,%ymm9,%ymm0
+  .byte  0xc4,0xc1,0xf9,0x7e,0xc0                        // vmovq         %xmm0,%r8
+  .byte  0x45,0x89,0xc1                                  // mov           %r8d,%r9d
+  .byte  0xc4,0xc3,0xf9,0x16,0xc2,0x01                   // vpextrq       $0x1,%xmm0,%r10
+  .byte  0x45,0x89,0xd3                                  // mov           %r10d,%r11d
+  .byte  0x49,0xc1,0xea,0x20                             // shr           $0x20,%r10
+  .byte  0x49,0xc1,0xe8,0x20                             // shr           $0x20,%r8
+  .byte  0xc4,0xe3,0x7d,0x19,0xc0,0x01                   // vextractf128  $0x1,%ymm0,%xmm0
+  .byte  0xc4,0xc1,0xf9,0x7e,0xc7                        // vmovq         %xmm0,%r15
+  .byte  0x45,0x89,0xfe                                  // mov           %r15d,%r14d
+  .byte  0xc4,0xe3,0xf9,0x16,0xc3,0x01                   // vpextrq       $0x1,%xmm0,%rbx
+  .byte  0x41,0x89,0xdc                                  // mov           %ebx,%r12d
+  .byte  0x48,0xc1,0xeb,0x20                             // shr           $0x20,%rbx
+  .byte  0x49,0xc1,0xef,0x20                             // shr           $0x20,%r15
+  .byte  0xc4,0xa1,0x7a,0x10,0x04,0xb1                   // vmovss        (%rcx,%r14,4),%xmm0
+  .byte  0xc4,0xa3,0x79,0x21,0x04,0xb9,0x10              // vinsertps     $0x10,(%rcx,%r15,4),%xmm0,%xmm0
+  .byte  0xc4,0xa3,0x79,0x21,0x04,0xa1,0x20              // vinsertps     $0x20,(%rcx,%r12,4),%xmm0,%xmm0
+  .byte  0xc4,0xe3,0x79,0x21,0x04,0x99,0x30              // vinsertps     $0x30,(%rcx,%rbx,4),%xmm0,%xmm0
+  .byte  0xc4,0xa1,0x7a,0x10,0x0c,0x89                   // vmovss        (%rcx,%r9,4),%xmm1
+  .byte  0xc4,0xa3,0x71,0x21,0x0c,0x81,0x10              // vinsertps     $0x10,(%rcx,%r8,4),%xmm1,%xmm1
+  .byte  0xc4,0xa3,0x71,0x21,0x0c,0x99,0x20              // vinsertps     $0x20,(%rcx,%r11,4),%xmm1,%xmm1
+  .byte  0xc4,0xa3,0x71,0x21,0x0c,0x91,0x30              // vinsertps     $0x30,(%rcx,%r10,4),%xmm1,%xmm1
+  .byte  0xc4,0xe3,0x75,0x18,0xc0,0x01                   // vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
+  .byte  0x4c,0x8b,0x78,0x10                             // mov           0x10(%rax),%r15
+  .byte  0xc4,0xc1,0x71,0x72,0xd2,0x08                   // vpsrld        $0x8,%xmm10,%xmm1
+  .byte  0xc4,0x43,0x7d,0x19,0xd0,0x01                   // vextractf128  $0x1,%ymm10,%xmm8
+  .byte  0xc4,0xc1,0x69,0x72,0xd0,0x08                   // vpsrld        $0x8,%xmm8,%xmm2
+  .byte  0xc4,0xe3,0x75,0x18,0xca,0x01                   // vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  .byte  0xc5,0xb4,0x54,0xc9                             // vandps        %ymm1,%ymm9,%ymm1
+  .byte  0xc4,0xc1,0xf9,0x7e,0xc8                        // vmovq         %xmm1,%r8
+  .byte  0x45,0x89,0xc2                                  // mov           %r8d,%r10d
+  .byte  0xc4,0xc3,0xf9,0x16,0xc9,0x01                   // vpextrq       $0x1,%xmm1,%r9
+  .byte  0x45,0x89,0xcb                                  // mov           %r9d,%r11d
+  .byte  0x49,0xc1,0xe9,0x20                             // shr           $0x20,%r9
+  .byte  0x49,0xc1,0xe8,0x20                             // shr           $0x20,%r8
+  .byte  0xc4,0xe3,0x7d,0x19,0xc9,0x01                   // vextractf128  $0x1,%ymm1,%xmm1
+  .byte  0xc4,0xe1,0xf9,0x7e,0xcb                        // vmovq         %xmm1,%rbx
+  .byte  0x41,0x89,0xde                                  // mov           %ebx,%r14d
+  .byte  0xc4,0xe3,0xf9,0x16,0xc9,0x01                   // vpextrq       $0x1,%xmm1,%rcx
+  .byte  0x41,0x89,0xcc                                  // mov           %ecx,%r12d
+  .byte  0x48,0xc1,0xe9,0x20                             // shr           $0x20,%rcx
+  .byte  0x48,0xc1,0xeb,0x20                             // shr           $0x20,%rbx
+  .byte  0xc4,0x81,0x7a,0x10,0x0c,0xb7                   // vmovss        (%r15,%r14,4),%xmm1
+  .byte  0xc4,0xc3,0x71,0x21,0x0c,0x9f,0x10              // vinsertps     $0x10,(%r15,%rbx,4),%xmm1,%xmm1
+  .byte  0xc4,0x81,0x7a,0x10,0x14,0xa7                   // vmovss        (%r15,%r12,4),%xmm2
+  .byte  0xc4,0xe3,0x71,0x21,0xca,0x20                   // vinsertps     $0x20,%xmm2,%xmm1,%xmm1
+  .byte  0xc4,0xc1,0x7a,0x10,0x14,0x8f                   // vmovss        (%r15,%rcx,4),%xmm2
+  .byte  0xc4,0xe3,0x71,0x21,0xca,0x30                   // vinsertps     $0x30,%xmm2,%xmm1,%xmm1
+  .byte  0xc4,0x81,0x7a,0x10,0x14,0x97                   // vmovss        (%r15,%r10,4),%xmm2
+  .byte  0xc4,0x83,0x69,0x21,0x14,0x87,0x10              // vinsertps     $0x10,(%r15,%r8,4),%xmm2,%xmm2
+  .byte  0xc4,0x81,0x7a,0x10,0x1c,0x9f                   // vmovss        (%r15,%r11,4),%xmm3
+  .byte  0xc4,0xe3,0x69,0x21,0xd3,0x20                   // vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  .byte  0xc4,0x81,0x7a,0x10,0x1c,0x8f                   // vmovss        (%r15,%r9,4),%xmm3
+  .byte  0xc4,0xe3,0x69,0x21,0xd3,0x30                   // vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  .byte  0xc4,0xe3,0x6d,0x18,0xc9,0x01                   // vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
+  .byte  0x48,0x8b,0x40,0x18                             // mov           0x18(%rax),%rax
+  .byte  0xc4,0xc1,0x69,0x72,0xd2,0x10                   // vpsrld        $0x10,%xmm10,%xmm2
+  .byte  0xc4,0xc1,0x61,0x72,0xd0,0x10                   // vpsrld        $0x10,%xmm8,%xmm3
+  .byte  0xc4,0xe3,0x6d,0x18,0xd3,0x01                   // vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
+  .byte  0xc5,0xb4,0x54,0xd2                             // vandps        %ymm2,%ymm9,%ymm2
+  .byte  0xc4,0xc1,0xf9,0x7e,0xd0                        // vmovq         %xmm2,%r8
+  .byte  0x45,0x89,0xc1                                  // mov           %r8d,%r9d
+  .byte  0xc4,0xc3,0xf9,0x16,0xd6,0x01                   // vpextrq       $0x1,%xmm2,%r14
+  .byte  0x45,0x89,0xf2                                  // mov           %r14d,%r10d
+  .byte  0x49,0xc1,0xee,0x20                             // shr           $0x20,%r14
+  .byte  0x49,0xc1,0xe8,0x20                             // shr           $0x20,%r8
+  .byte  0xc4,0xe3,0x7d,0x19,0xd2,0x01                   // vextractf128  $0x1,%ymm2,%xmm2
+  .byte  0xc4,0xe1,0xf9,0x7e,0xd3                        // vmovq         %xmm2,%rbx
+  .byte  0x41,0x89,0xdb                                  // mov           %ebx,%r11d
+  .byte  0xc4,0xe3,0xf9,0x16,0xd1,0x01                   // vpextrq       $0x1,%xmm2,%rcx
+  .byte  0x41,0x89,0xcf                                  // mov           %ecx,%r15d
+  .byte  0x48,0xc1,0xe9,0x20                             // shr           $0x20,%rcx
+  .byte  0x48,0xc1,0xeb,0x20                             // shr           $0x20,%rbx
+  .byte  0xc4,0xa1,0x7a,0x10,0x14,0x98                   // vmovss        (%rax,%r11,4),%xmm2
+  .byte  0xc4,0xe3,0x69,0x21,0x14,0x98,0x10              // vinsertps     $0x10,(%rax,%rbx,4),%xmm2,%xmm2
+  .byte  0xc4,0xa1,0x7a,0x10,0x1c,0xb8                   // vmovss        (%rax,%r15,4),%xmm3
+  .byte  0xc4,0xe3,0x69,0x21,0xd3,0x20                   // vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  .byte  0xc5,0xfa,0x10,0x1c,0x88                        // vmovss        (%rax,%rcx,4),%xmm3
+  .byte  0xc4,0x63,0x69,0x21,0xcb,0x30                   // vinsertps     $0x30,%xmm3,%xmm2,%xmm9
+  .byte  0xc4,0xa1,0x7a,0x10,0x1c,0x88                   // vmovss        (%rax,%r9,4),%xmm3
+  .byte  0xc4,0xa3,0x61,0x21,0x1c,0x80,0x10              // vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
+  .byte  0xc4,0xa1,0x7a,0x10,0x14,0x90                   // vmovss        (%rax,%r10,4),%xmm2
+  .byte  0xc4,0xe3,0x61,0x21,0xd2,0x20                   // vinsertps     $0x20,%xmm2,%xmm3,%xmm2
+  .byte  0xc4,0xa1,0x7a,0x10,0x1c,0xb0                   // vmovss        (%rax,%r14,4),%xmm3
+  .byte  0xc4,0xe3,0x69,0x21,0xd3,0x30                   // vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  .byte  0xc4,0xc3,0x6d,0x18,0xd1,0x01                   // vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
+  .byte  0xc4,0xc1,0x31,0x72,0xd2,0x18                   // vpsrld        $0x18,%xmm10,%xmm9
+  .byte  0xc4,0xc1,0x61,0x72,0xd0,0x18                   // vpsrld        $0x18,%xmm8,%xmm3
+  .byte  0xc4,0xe3,0x35,0x18,0xdb,0x01                   // vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
+  .byte  0xc5,0xfc,0x5b,0xdb                             // vcvtdq2ps     %ymm3,%ymm3
+  .byte  0xc4,0x62,0x7d,0x18,0x42,0x0c                   // vbroadcastss  0xc(%rdx),%ymm8
+  .byte  0xc4,0xc1,0x64,0x59,0xd8                        // vmulps        %ymm8,%ymm3,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x5b                                            // pop           %rbx
+  .byte  0x41,0x5c                                       // pop           %r12
+  .byte  0x41,0x5e                                       // pop           %r14
+  .byte  0x41,0x5f                                       // pop           %r15
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_load_8888_avx
+_sk_load_8888_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0x8b,0x00                                  // mov           (%rax),%rax
+  .byte  0xc5,0xfc,0x10,0x1c,0xb8                        // vmovups       (%rax,%rdi,4),%ymm3
+  .byte  0xc5,0xf9,0x6e,0x42,0x10                        // vmovd         0x10(%rdx),%xmm0
+  .byte  0xc4,0xe3,0x79,0x04,0xc0,0x00                   // vpermilps     $0x0,%xmm0,%xmm0
+  .byte  0xc4,0x63,0x7d,0x18,0xd8,0x01                   // vinsertf128   $0x1,%xmm0,%ymm0,%ymm11
+  .byte  0xc5,0xa4,0x54,0xc3                             // vandps        %ymm3,%ymm11,%ymm0
+  .byte  0xc5,0xfc,0x5b,0xc0                             // vcvtdq2ps     %ymm0,%ymm0
+  .byte  0xc4,0x62,0x7d,0x18,0x42,0x0c                   // vbroadcastss  0xc(%rdx),%ymm8
+  .byte  0xc5,0xbc,0x59,0xc0                             // vmulps        %ymm0,%ymm8,%ymm0
+  .byte  0xc5,0xa9,0x72,0xd3,0x08                        // vpsrld        $0x8,%xmm3,%xmm10
+  .byte  0xc4,0xc3,0x7d,0x19,0xd9,0x01                   // vextractf128  $0x1,%ymm3,%xmm9
+  .byte  0xc4,0xc1,0x71,0x72,0xd1,0x08                   // vpsrld        $0x8,%xmm9,%xmm1
+  .byte  0xc4,0xe3,0x2d,0x18,0xc9,0x01                   // vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
+  .byte  0xc5,0xa4,0x54,0xc9                             // vandps        %ymm1,%ymm11,%ymm1
+  .byte  0xc5,0xfc,0x5b,0xc9                             // vcvtdq2ps     %ymm1,%ymm1
+  .byte  0xc5,0xbc,0x59,0xc9                             // vmulps        %ymm1,%ymm8,%ymm1
+  .byte  0xc5,0xa9,0x72,0xd3,0x10                        // vpsrld        $0x10,%xmm3,%xmm10
+  .byte  0xc4,0xc1,0x69,0x72,0xd1,0x10                   // vpsrld        $0x10,%xmm9,%xmm2
+  .byte  0xc4,0xe3,0x2d,0x18,0xd2,0x01                   // vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
+  .byte  0xc5,0xa4,0x54,0xd2                             // vandps        %ymm2,%ymm11,%ymm2
+  .byte  0xc5,0xfc,0x5b,0xd2                             // vcvtdq2ps     %ymm2,%ymm2
+  .byte  0xc5,0xbc,0x59,0xd2                             // vmulps        %ymm2,%ymm8,%ymm2
+  .byte  0xc5,0xa9,0x72,0xd3,0x18                        // vpsrld        $0x18,%xmm3,%xmm10
+  .byte  0xc4,0xc1,0x61,0x72,0xd1,0x18                   // vpsrld        $0x18,%xmm9,%xmm3
+  .byte  0xc4,0xe3,0x2d,0x18,0xdb,0x01                   // vinsertf128   $0x1,%xmm3,%ymm10,%ymm3
+  .byte  0xc5,0xfc,0x5b,0xdb                             // vcvtdq2ps     %ymm3,%ymm3
+  .byte  0xc4,0xc1,0x64,0x59,0xd8                        // vmulps        %ymm8,%ymm3,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_store_8888_avx
+_sk_store_8888_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0x8b,0x00                                  // mov           (%rax),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x42,0x08                   // vbroadcastss  0x8(%rdx),%ymm8
+  .byte  0xc5,0x3c,0x59,0xc8                             // vmulps        %ymm0,%ymm8,%ymm9
+  .byte  0xc4,0x41,0x7d,0x5b,0xc9                        // vcvtps2dq     %ymm9,%ymm9
+  .byte  0xc5,0x3c,0x59,0xd1                             // vmulps        %ymm1,%ymm8,%ymm10
+  .byte  0xc4,0x41,0x7d,0x5b,0xd2                        // vcvtps2dq     %ymm10,%ymm10
+  .byte  0xc4,0xc1,0x21,0x72,0xf2,0x08                   // vpslld        $0x8,%xmm10,%xmm11
+  .byte  0xc4,0x43,0x7d,0x19,0xd2,0x01                   // vextractf128  $0x1,%ymm10,%xmm10
+  .byte  0xc4,0xc1,0x29,0x72,0xf2,0x08                   // vpslld        $0x8,%xmm10,%xmm10
+  .byte  0xc4,0x43,0x25,0x18,0xd2,0x01                   // vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  .byte  0xc4,0x41,0x2d,0x56,0xc9                        // vorpd         %ymm9,%ymm10,%ymm9
+  .byte  0xc5,0x3c,0x59,0xd2                             // vmulps        %ymm2,%ymm8,%ymm10
+  .byte  0xc4,0x41,0x7d,0x5b,0xd2                        // vcvtps2dq     %ymm10,%ymm10
+  .byte  0xc4,0xc1,0x21,0x72,0xf2,0x10                   // vpslld        $0x10,%xmm10,%xmm11
+  .byte  0xc4,0x43,0x7d,0x19,0xd2,0x01                   // vextractf128  $0x1,%ymm10,%xmm10
+  .byte  0xc4,0xc1,0x29,0x72,0xf2,0x10                   // vpslld        $0x10,%xmm10,%xmm10
+  .byte  0xc4,0x43,0x25,0x18,0xd2,0x01                   // vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  .byte  0xc4,0x41,0x35,0x56,0xca                        // vorpd         %ymm10,%ymm9,%ymm9
+  .byte  0xc5,0x3c,0x59,0xc3                             // vmulps        %ymm3,%ymm8,%ymm8
+  .byte  0xc4,0x41,0x7d,0x5b,0xc0                        // vcvtps2dq     %ymm8,%ymm8
+  .byte  0xc4,0xc1,0x29,0x72,0xf0,0x18                   // vpslld        $0x18,%xmm8,%xmm10
+  .byte  0xc4,0x43,0x7d,0x19,0xc0,0x01                   // vextractf128  $0x1,%ymm8,%xmm8
+  .byte  0xc4,0xc1,0x39,0x72,0xf0,0x18                   // vpslld        $0x18,%xmm8,%xmm8
+  .byte  0xc4,0x43,0x2d,0x18,0xc0,0x01                   // vinsertf128   $0x1,%xmm8,%ymm10,%ymm8
+  .byte  0xc4,0x41,0x35,0x56,0xc0                        // vorpd         %ymm8,%ymm9,%ymm8
+  .byte  0xc5,0x7d,0x11,0x04,0xb8                        // vmovupd       %ymm8,(%rax,%rdi,4)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_load_f16_avx
+_sk_load_f16_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_store_f16_avx
+_sk_store_f16_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_clamp_x_avx
+_sk_clamp_x_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x00                        // vbroadcastss  (%rax),%ymm8
+  .byte  0xc4,0x43,0x7d,0x19,0xc1,0x01                   // vextractf128  $0x1,%ymm8,%xmm9
+  .byte  0xc4,0x41,0x29,0x76,0xd2                        // vpcmpeqd      %xmm10,%xmm10,%xmm10
+  .byte  0xc4,0x41,0x31,0xfe,0xca                        // vpaddd        %xmm10,%xmm9,%xmm9
+  .byte  0xc4,0x41,0x39,0xfe,0xc2                        // vpaddd        %xmm10,%xmm8,%xmm8
+  .byte  0xc4,0x43,0x3d,0x18,0xc1,0x01                   // vinsertf128   $0x1,%xmm9,%ymm8,%ymm8
+  .byte  0xc4,0xc1,0x7c,0x5d,0xc0                        // vminps        %ymm8,%ymm0,%ymm0
+  .byte  0xc4,0x41,0x3c,0x57,0xc0                        // vxorps        %ymm8,%ymm8,%ymm8
+  .byte  0xc5,0xbc,0x5f,0xc0                             // vmaxps        %ymm0,%ymm8,%ymm0
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_clamp_y_avx
+_sk_clamp_y_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x00                        // vbroadcastss  (%rax),%ymm8
+  .byte  0xc4,0x43,0x7d,0x19,0xc1,0x01                   // vextractf128  $0x1,%ymm8,%xmm9
+  .byte  0xc4,0x41,0x29,0x76,0xd2                        // vpcmpeqd      %xmm10,%xmm10,%xmm10
+  .byte  0xc4,0x41,0x31,0xfe,0xca                        // vpaddd        %xmm10,%xmm9,%xmm9
+  .byte  0xc4,0x41,0x39,0xfe,0xc2                        // vpaddd        %xmm10,%xmm8,%xmm8
+  .byte  0xc4,0x43,0x3d,0x18,0xc1,0x01                   // vinsertf128   $0x1,%xmm9,%ymm8,%ymm8
+  .byte  0xc4,0xc1,0x74,0x5d,0xc8                        // vminps        %ymm8,%ymm1,%ymm1
+  .byte  0xc4,0x41,0x3c,0x57,0xc0                        // vxorps        %ymm8,%ymm8,%ymm8
+  .byte  0xc5,0xbc,0x5f,0xc9                             // vmaxps        %ymm1,%ymm8,%ymm1
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_matrix_2x3_avx
+_sk_matrix_2x3_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x00                        // vbroadcastss  (%rax),%ymm8
+  .byte  0xc4,0x62,0x7d,0x18,0x48,0x08                   // vbroadcastss  0x8(%rax),%ymm9
+  .byte  0xc4,0x62,0x7d,0x18,0x50,0x10                   // vbroadcastss  0x10(%rax),%ymm10
+  .byte  0xc5,0x34,0x59,0xc9                             // vmulps        %ymm1,%ymm9,%ymm9
+  .byte  0xc4,0x41,0x34,0x58,0xca                        // vaddps        %ymm10,%ymm9,%ymm9
+  .byte  0xc5,0x3c,0x59,0xc0                             // vmulps        %ymm0,%ymm8,%ymm8
+  .byte  0xc4,0x41,0x3c,0x58,0xc1                        // vaddps        %ymm9,%ymm8,%ymm8
+  .byte  0xc4,0x62,0x7d,0x18,0x48,0x04                   // vbroadcastss  0x4(%rax),%ymm9
+  .byte  0xc4,0x62,0x7d,0x18,0x50,0x0c                   // vbroadcastss  0xc(%rax),%ymm10
+  .byte  0xc4,0x62,0x7d,0x18,0x58,0x14                   // vbroadcastss  0x14(%rax),%ymm11
+  .byte  0xc5,0xac,0x59,0xc9                             // vmulps        %ymm1,%ymm10,%ymm1
+  .byte  0xc4,0xc1,0x74,0x58,0xcb                        // vaddps        %ymm11,%ymm1,%ymm1
+  .byte  0xc5,0xb4,0x59,0xc0                             // vmulps        %ymm0,%ymm9,%ymm0
+  .byte  0xc5,0xfc,0x58,0xc9                             // vaddps        %ymm1,%ymm0,%ymm1
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0x7c,0x29,0xc0                             // vmovaps       %ymm8,%ymm0
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_matrix_3x4_avx
+_sk_matrix_3x4_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0x62,0x7d,0x18,0x00                        // vbroadcastss  (%rax),%ymm8
+  .byte  0xc4,0x62,0x7d,0x18,0x48,0x0c                   // vbroadcastss  0xc(%rax),%ymm9
+  .byte  0xc4,0x62,0x7d,0x18,0x50,0x18                   // vbroadcastss  0x18(%rax),%ymm10
+  .byte  0xc4,0x62,0x7d,0x18,0x58,0x24                   // vbroadcastss  0x24(%rax),%ymm11
+  .byte  0xc5,0x2c,0x59,0xd2                             // vmulps        %ymm2,%ymm10,%ymm10
+  .byte  0xc4,0x41,0x2c,0x58,0xd3                        // vaddps        %ymm11,%ymm10,%ymm10
+  .byte  0xc5,0x34,0x59,0xc9                             // vmulps        %ymm1,%ymm9,%ymm9
+  .byte  0xc4,0x41,0x34,0x58,0xca                        // vaddps        %ymm10,%ymm9,%ymm9
+  .byte  0xc5,0x3c,0x59,0xc0                             // vmulps        %ymm0,%ymm8,%ymm8
+  .byte  0xc4,0x41,0x3c,0x58,0xc1                        // vaddps        %ymm9,%ymm8,%ymm8
+  .byte  0xc4,0x62,0x7d,0x18,0x48,0x04                   // vbroadcastss  0x4(%rax),%ymm9
+  .byte  0xc4,0x62,0x7d,0x18,0x50,0x10                   // vbroadcastss  0x10(%rax),%ymm10
+  .byte  0xc4,0x62,0x7d,0x18,0x58,0x1c                   // vbroadcastss  0x1c(%rax),%ymm11
+  .byte  0xc4,0x62,0x7d,0x18,0x60,0x28                   // vbroadcastss  0x28(%rax),%ymm12
+  .byte  0xc5,0x24,0x59,0xda                             // vmulps        %ymm2,%ymm11,%ymm11
+  .byte  0xc4,0x41,0x24,0x58,0xdc                        // vaddps        %ymm12,%ymm11,%ymm11
+  .byte  0xc5,0x2c,0x59,0xd1                             // vmulps        %ymm1,%ymm10,%ymm10
+  .byte  0xc4,0x41,0x2c,0x58,0xd3                        // vaddps        %ymm11,%ymm10,%ymm10
+  .byte  0xc5,0x34,0x59,0xc8                             // vmulps        %ymm0,%ymm9,%ymm9
+  .byte  0xc4,0x41,0x34,0x58,0xca                        // vaddps        %ymm10,%ymm9,%ymm9
+  .byte  0xc4,0x62,0x7d,0x18,0x50,0x08                   // vbroadcastss  0x8(%rax),%ymm10
+  .byte  0xc4,0x62,0x7d,0x18,0x58,0x14                   // vbroadcastss  0x14(%rax),%ymm11
+  .byte  0xc4,0x62,0x7d,0x18,0x60,0x20                   // vbroadcastss  0x20(%rax),%ymm12
+  .byte  0xc4,0x62,0x7d,0x18,0x68,0x2c                   // vbroadcastss  0x2c(%rax),%ymm13
+  .byte  0xc5,0x9c,0x59,0xd2                             // vmulps        %ymm2,%ymm12,%ymm2
+  .byte  0xc4,0xc1,0x6c,0x58,0xd5                        // vaddps        %ymm13,%ymm2,%ymm2
+  .byte  0xc5,0xa4,0x59,0xc9                             // vmulps        %ymm1,%ymm11,%ymm1
+  .byte  0xc5,0xf4,0x58,0xca                             // vaddps        %ymm2,%ymm1,%ymm1
+  .byte  0xc5,0xac,0x59,0xc0                             // vmulps        %ymm0,%ymm10,%ymm0
+  .byte  0xc5,0xfc,0x58,0xd1                             // vaddps        %ymm1,%ymm0,%ymm2
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0x7c,0x29,0xc0                             // vmovaps       %ymm8,%ymm0
+  .byte  0xc5,0x7c,0x29,0xc9                             // vmovaps       %ymm9,%ymm1
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
+.globl _sk_linear_gradient_2stops_avx
+_sk_linear_gradient_2stops_avx:
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc4,0xe2,0x7d,0x18,0x48,0x10                   // vbroadcastss  0x10(%rax),%ymm1
+  .byte  0xc4,0xe2,0x7d,0x18,0x10                        // vbroadcastss  (%rax),%ymm2
+  .byte  0xc5,0xf4,0x59,0xc8                             // vmulps        %ymm0,%ymm1,%ymm1
+  .byte  0xc5,0x6c,0x58,0xc1                             // vaddps        %ymm1,%ymm2,%ymm8
+  .byte  0xc4,0xe2,0x7d,0x18,0x48,0x14                   // vbroadcastss  0x14(%rax),%ymm1
+  .byte  0xc4,0xe2,0x7d,0x18,0x50,0x04                   // vbroadcastss  0x4(%rax),%ymm2
+  .byte  0xc5,0xf4,0x59,0xc8                             // vmulps        %ymm0,%ymm1,%ymm1
+  .byte  0xc5,0xec,0x58,0xc9                             // vaddps        %ymm1,%ymm2,%ymm1
+  .byte  0xc4,0xe2,0x7d,0x18,0x50,0x18                   // vbroadcastss  0x18(%rax),%ymm2
+  .byte  0xc4,0xe2,0x7d,0x18,0x58,0x08                   // vbroadcastss  0x8(%rax),%ymm3
+  .byte  0xc5,0xec,0x59,0xd0                             // vmulps        %ymm0,%ymm2,%ymm2
+  .byte  0xc5,0xe4,0x58,0xd2                             // vaddps        %ymm2,%ymm3,%ymm2
+  .byte  0xc4,0xe2,0x7d,0x18,0x58,0x1c                   // vbroadcastss  0x1c(%rax),%ymm3
+  .byte  0xc4,0x62,0x7d,0x18,0x48,0x0c                   // vbroadcastss  0xc(%rax),%ymm9
+  .byte  0xc5,0xe4,0x59,0xc0                             // vmulps        %ymm0,%ymm3,%ymm0
+  .byte  0xc5,0xb4,0x58,0xd8                             // vaddps        %ymm0,%ymm9,%ymm3
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
+  .byte  0xc5,0x7c,0x29,0xc0                             // vmovaps       %ymm8,%ymm0
+  .byte  0xff,0xe0                                       // jmpq          *%rax
+
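A note on the shape of these listings: every stage ends with a lods %ds:(%rsi),%rax / jmpq *%rax pair, and each stage (except just_return) also begins with a lods. The program is a flat array of pointers walked by %rsi: a stage pops its context pointer, does its 8-wide work, then pops the next stage's address and tail-calls it. Roughly, in C terms (a sketch of the convention as it reads from the listings above, not the project's real declarations):

    #include <stddef.h>

    typedef void (*StageFn)(size_t x, void** program);

    static void some_stage(size_t x, void** program) {
        void* ctx = *program++;              // the opening lods: this stage's context
        (void)ctx;                           // some stages, like sk_clear, ignore it
        // ... operate on the 8-wide r,g,b,a and dr,dg,db,da registers ...
        StageFn next = (StageFn)*program++;  // the closing lods: the next stage's address
        next(x, program);                    // jmpq *%rax, i.e. a tail call
    }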
 .globl _sk_start_pipeline_sse41
 _sk_start_pipeline_sse41:
   .byte  0x41,0x57                                       // push          %r15
index d2078b6..1409d03 100644 (file)
@@ -589,6 +589,701 @@ _sk_linear_gradient_2stops_hsw LABEL PROC
   DB  197,124,41,192                                  ; vmovaps       %ymm8,%ymm0
   DB  255,224                                         ; jmpq          *%rax
 
+PUBLIC _sk_start_pipeline_avx
+_sk_start_pipeline_avx LABEL PROC
+  DB  65,87                                           ; push          %r15
+  DB  65,86                                           ; push          %r14
+  DB  65,85                                           ; push          %r13
+  DB  65,84                                           ; push          %r12
+  DB  86                                              ; push          %rsi
+  DB  87                                              ; push          %rdi
+  DB  83                                              ; push          %rbx
+  DB  72,129,236,160,0,0,0                            ; sub           $0xa0,%rsp
+  DB  197,120,41,188,36,144,0,0,0                     ; vmovaps       %xmm15,0x90(%rsp)
+  DB  197,120,41,180,36,128,0,0,0                     ; vmovaps       %xmm14,0x80(%rsp)
+  DB  197,120,41,108,36,112                           ; vmovaps       %xmm13,0x70(%rsp)
+  DB  197,120,41,100,36,96                            ; vmovaps       %xmm12,0x60(%rsp)
+  DB  197,120,41,92,36,80                             ; vmovaps       %xmm11,0x50(%rsp)
+  DB  197,120,41,84,36,64                             ; vmovaps       %xmm10,0x40(%rsp)
+  DB  197,120,41,76,36,48                             ; vmovaps       %xmm9,0x30(%rsp)
+  DB  197,120,41,68,36,32                             ; vmovaps       %xmm8,0x20(%rsp)
+  DB  197,248,41,124,36,16                            ; vmovaps       %xmm7,0x10(%rsp)
+  DB  197,248,41,52,36                                ; vmovaps       %xmm6,(%rsp)
+  DB  77,137,207                                      ; mov           %r9,%r15
+  DB  77,137,198                                      ; mov           %r8,%r14
+  DB  72,137,203                                      ; mov           %rcx,%rbx
+  DB  72,137,214                                      ; mov           %rdx,%rsi
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  73,137,196                                      ; mov           %rax,%r12
+  DB  73,137,245                                      ; mov           %rsi,%r13
+  DB  72,141,67,8                                     ; lea           0x8(%rbx),%rax
+  DB  76,57,248                                       ; cmp           %r15,%rax
+  DB  118,5                                           ; jbe           75 <_sk_start_pipeline_avx+0x75>
+  DB  72,137,216                                      ; mov           %rbx,%rax
+  DB  235,60                                          ; jmp           b1 <_sk_start_pipeline_avx+0xb1>
+  DB  197,252,87,192                                  ; vxorps        %ymm0,%ymm0,%ymm0
+  DB  197,244,87,201                                  ; vxorps        %ymm1,%ymm1,%ymm1
+  DB  197,236,87,210                                  ; vxorps        %ymm2,%ymm2,%ymm2
+  DB  197,228,87,219                                  ; vxorps        %ymm3,%ymm3,%ymm3
+  DB  197,220,87,228                                  ; vxorps        %ymm4,%ymm4,%ymm4
+  DB  197,212,87,237                                  ; vxorps        %ymm5,%ymm5,%ymm5
+  DB  197,204,87,246                                  ; vxorps        %ymm6,%ymm6,%ymm6
+  DB  197,196,87,255                                  ; vxorps        %ymm7,%ymm7,%ymm7
+  DB  72,137,223                                      ; mov           %rbx,%rdi
+  DB  76,137,238                                      ; mov           %r13,%rsi
+  DB  76,137,242                                      ; mov           %r14,%rdx
+  DB  65,255,212                                      ; callq         *%r12
+  DB  72,141,67,8                                     ; lea           0x8(%rbx),%rax
+  DB  72,131,195,16                                   ; add           $0x10,%rbx
+  DB  76,57,251                                       ; cmp           %r15,%rbx
+  DB  72,137,195                                      ; mov           %rax,%rbx
+  DB  118,196                                         ; jbe           75 <_sk_start_pipeline_avx+0x75>
+  DB  197,248,40,52,36                                ; vmovaps       (%rsp),%xmm6
+  DB  197,248,40,124,36,16                            ; vmovaps       0x10(%rsp),%xmm7
+  DB  197,120,40,68,36,32                             ; vmovaps       0x20(%rsp),%xmm8
+  DB  197,120,40,76,36,48                             ; vmovaps       0x30(%rsp),%xmm9
+  DB  197,120,40,84,36,64                             ; vmovaps       0x40(%rsp),%xmm10
+  DB  197,120,40,92,36,80                             ; vmovaps       0x50(%rsp),%xmm11
+  DB  197,120,40,100,36,96                            ; vmovaps       0x60(%rsp),%xmm12
+  DB  197,120,40,108,36,112                           ; vmovaps       0x70(%rsp),%xmm13
+  DB  197,120,40,180,36,128,0,0,0                     ; vmovaps       0x80(%rsp),%xmm14
+  DB  197,120,40,188,36,144,0,0,0                     ; vmovaps       0x90(%rsp),%xmm15
+  DB  72,129,196,160,0,0,0                            ; add           $0xa0,%rsp
+  DB  91                                              ; pop           %rbx
+  DB  95                                              ; pop           %rdi
+  DB  94                                              ; pop           %rsi
+  DB  65,92                                           ; pop           %r12
+  DB  65,93                                           ; pop           %r13
+  DB  65,94                                           ; pop           %r14
+  DB  65,95                                           ; pop           %r15
+  DB  197,248,119                                     ; vzeroupper
+  DB  195                                             ; retq
+
+PUBLIC _sk_just_return_avx
+_sk_just_return_avx LABEL PROC
+  DB  195                                             ; retq
+
+PUBLIC _sk_seed_shader_avx
+_sk_seed_shader_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,249,110,199                                 ; vmovd         %edi,%xmm0
+  DB  196,227,121,4,192,0                             ; vpermilps     $0x0,%xmm0,%xmm0
+  DB  196,227,125,24,192,1                            ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm0
+  DB  197,252,91,192                                  ; vcvtdq2ps     %ymm0,%ymm0
+  DB  196,226,125,24,74,4                             ; vbroadcastss  0x4(%rdx),%ymm1
+  DB  197,252,88,193                                  ; vaddps        %ymm1,%ymm0,%ymm0
+  DB  197,252,88,66,20                                ; vaddps        0x14(%rdx),%ymm0,%ymm0
+  DB  197,249,110,16                                  ; vmovd         (%rax),%xmm2
+  DB  196,227,121,4,210,0                             ; vpermilps     $0x0,%xmm2,%xmm2
+  DB  196,227,109,24,210,1                            ; vinsertf128   $0x1,%xmm2,%ymm2,%ymm2
+  DB  197,252,91,210                                  ; vcvtdq2ps     %ymm2,%ymm2
+  DB  197,236,88,201                                  ; vaddps        %ymm1,%ymm2,%ymm1
+  DB  196,226,125,24,18                               ; vbroadcastss  (%rdx),%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,228,87,219                                  ; vxorps        %ymm3,%ymm3,%ymm3
+  DB  197,220,87,228                                  ; vxorps        %ymm4,%ymm4,%ymm4
+  DB  197,212,87,237                                  ; vxorps        %ymm5,%ymm5,%ymm5
+  DB  197,204,87,246                                  ; vxorps        %ymm6,%ymm6,%ymm6
+  DB  197,196,87,255                                  ; vxorps        %ymm7,%ymm7,%ymm7
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_constant_color_avx
+_sk_constant_color_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,226,125,24,0                                ; vbroadcastss  (%rax),%ymm0
+  DB  196,226,125,24,72,4                             ; vbroadcastss  0x4(%rax),%ymm1
+  DB  196,226,125,24,80,8                             ; vbroadcastss  0x8(%rax),%ymm2
+  DB  196,226,125,24,88,12                            ; vbroadcastss  0xc(%rax),%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_clear_avx
+_sk_clear_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,252,87,192                                  ; vxorps        %ymm0,%ymm0,%ymm0
+  DB  197,244,87,201                                  ; vxorps        %ymm1,%ymm1,%ymm1
+  DB  197,236,87,210                                  ; vxorps        %ymm2,%ymm2,%ymm2
+  DB  197,228,87,219                                  ; vxorps        %ymm3,%ymm3,%ymm3
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_plus__avx
+_sk_plus__avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,252,88,196                                  ; vaddps        %ymm4,%ymm0,%ymm0
+  DB  197,244,88,205                                  ; vaddps        %ymm5,%ymm1,%ymm1
+  DB  197,236,88,214                                  ; vaddps        %ymm6,%ymm2,%ymm2
+  DB  197,228,88,223                                  ; vaddps        %ymm7,%ymm3,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_srcover_avx
+_sk_srcover_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,2                                 ; vbroadcastss  (%rdx),%ymm8
+  DB  197,60,92,195                                   ; vsubps        %ymm3,%ymm8,%ymm8
+  DB  197,60,89,204                                   ; vmulps        %ymm4,%ymm8,%ymm9
+  DB  197,180,88,192                                  ; vaddps        %ymm0,%ymm9,%ymm0
+  DB  197,60,89,205                                   ; vmulps        %ymm5,%ymm8,%ymm9
+  DB  197,180,88,201                                  ; vaddps        %ymm1,%ymm9,%ymm1
+  DB  197,60,89,206                                   ; vmulps        %ymm6,%ymm8,%ymm9
+  DB  197,180,88,210                                  ; vaddps        %ymm2,%ymm9,%ymm2
+  DB  197,60,89,199                                   ; vmulps        %ymm7,%ymm8,%ymm8
+  DB  197,188,88,219                                  ; vaddps        %ymm3,%ymm8,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_dstover_avx
+_sk_dstover_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,2                                 ; vbroadcastss  (%rdx),%ymm8
+  DB  197,60,92,199                                   ; vsubps        %ymm7,%ymm8,%ymm8
+  DB  197,188,89,192                                  ; vmulps        %ymm0,%ymm8,%ymm0
+  DB  197,252,88,196                                  ; vaddps        %ymm4,%ymm0,%ymm0
+  DB  197,188,89,201                                  ; vmulps        %ymm1,%ymm8,%ymm1
+  DB  197,244,88,205                                  ; vaddps        %ymm5,%ymm1,%ymm1
+  DB  197,188,89,210                                  ; vmulps        %ymm2,%ymm8,%ymm2
+  DB  197,236,88,214                                  ; vaddps        %ymm6,%ymm2,%ymm2
+  DB  197,188,89,219                                  ; vmulps        %ymm3,%ymm8,%ymm3
+  DB  197,228,88,223                                  ; vaddps        %ymm7,%ymm3,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_clamp_0_avx
+_sk_clamp_0_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,65,60,87,192                                ; vxorps        %ymm8,%ymm8,%ymm8
+  DB  196,193,124,95,192                              ; vmaxps        %ymm8,%ymm0,%ymm0
+  DB  196,193,116,95,200                              ; vmaxps        %ymm8,%ymm1,%ymm1
+  DB  196,193,108,95,208                              ; vmaxps        %ymm8,%ymm2,%ymm2
+  DB  196,193,100,95,216                              ; vmaxps        %ymm8,%ymm3,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_clamp_1_avx
+_sk_clamp_1_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,2                                 ; vbroadcastss  (%rdx),%ymm8
+  DB  196,193,124,93,192                              ; vminps        %ymm8,%ymm0,%ymm0
+  DB  196,193,116,93,200                              ; vminps        %ymm8,%ymm1,%ymm1
+  DB  196,193,108,93,208                              ; vminps        %ymm8,%ymm2,%ymm2
+  DB  196,193,100,93,216                              ; vminps        %ymm8,%ymm3,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_clamp_a_avx
+_sk_clamp_a_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,2                                 ; vbroadcastss  (%rdx),%ymm8
+  DB  196,193,100,93,216                              ; vminps        %ymm8,%ymm3,%ymm3
+  DB  197,252,93,195                                  ; vminps        %ymm3,%ymm0,%ymm0
+  DB  197,244,93,203                                  ; vminps        %ymm3,%ymm1,%ymm1
+  DB  197,236,93,211                                  ; vminps        %ymm3,%ymm2,%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_set_rgb_avx
+_sk_set_rgb_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,226,125,24,0                                ; vbroadcastss  (%rax),%ymm0
+  DB  196,226,125,24,72,4                             ; vbroadcastss  0x4(%rax),%ymm1
+  DB  196,226,125,24,80,8                             ; vbroadcastss  0x8(%rax),%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_swap_rb_avx
+_sk_swap_rb_avx LABEL PROC
+  DB  197,124,40,192                                  ; vmovaps       %ymm0,%ymm8
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,252,40,194                                  ; vmovaps       %ymm2,%ymm0
+  DB  197,124,41,194                                  ; vmovaps       %ymm8,%ymm2
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_swap_avx
+_sk_swap_avx LABEL PROC
+  DB  197,124,40,195                                  ; vmovaps       %ymm3,%ymm8
+  DB  197,124,40,202                                  ; vmovaps       %ymm2,%ymm9
+  DB  197,124,40,209                                  ; vmovaps       %ymm1,%ymm10
+  DB  197,124,40,216                                  ; vmovaps       %ymm0,%ymm11
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,252,40,196                                  ; vmovaps       %ymm4,%ymm0
+  DB  197,252,40,205                                  ; vmovaps       %ymm5,%ymm1
+  DB  197,252,40,214                                  ; vmovaps       %ymm6,%ymm2
+  DB  197,252,40,223                                  ; vmovaps       %ymm7,%ymm3
+  DB  197,124,41,220                                  ; vmovaps       %ymm11,%ymm4
+  DB  197,124,41,213                                  ; vmovaps       %ymm10,%ymm5
+  DB  197,124,41,206                                  ; vmovaps       %ymm9,%ymm6
+  DB  197,124,41,199                                  ; vmovaps       %ymm8,%ymm7
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_move_src_dst_avx
+_sk_move_src_dst_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,252,40,224                                  ; vmovaps       %ymm0,%ymm4
+  DB  197,252,40,233                                  ; vmovaps       %ymm1,%ymm5
+  DB  197,252,40,242                                  ; vmovaps       %ymm2,%ymm6
+  DB  197,252,40,251                                  ; vmovaps       %ymm3,%ymm7
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_move_dst_src_avx
+_sk_move_dst_src_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,252,40,196                                  ; vmovaps       %ymm4,%ymm0
+  DB  197,252,40,205                                  ; vmovaps       %ymm5,%ymm1
+  DB  197,252,40,214                                  ; vmovaps       %ymm6,%ymm2
+  DB  197,252,40,223                                  ; vmovaps       %ymm7,%ymm3
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_premul_avx
+_sk_premul_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,252,89,195                                  ; vmulps        %ymm3,%ymm0,%ymm0
+  DB  197,244,89,203                                  ; vmulps        %ymm3,%ymm1,%ymm1
+  DB  197,236,89,211                                  ; vmulps        %ymm3,%ymm2,%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_unpremul_avx
+_sk_unpremul_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,65,60,87,192                                ; vxorps        %ymm8,%ymm8,%ymm8
+  DB  196,65,100,194,200,0                            ; vcmpeqps      %ymm8,%ymm3,%ymm9
+  DB  196,98,125,24,18                                ; vbroadcastss  (%rdx),%ymm10
+  DB  197,44,94,211                                   ; vdivps        %ymm3,%ymm10,%ymm10
+  DB  196,67,45,74,192,144                            ; vblendvps     %ymm9,%ymm8,%ymm10,%ymm8
+  DB  197,188,89,192                                  ; vmulps        %ymm0,%ymm8,%ymm0
+  DB  197,188,89,201                                  ; vmulps        %ymm1,%ymm8,%ymm1
+  DB  197,188,89,210                                  ; vmulps        %ymm2,%ymm8,%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_from_srgb_avx
+_sk_from_srgb_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,66,64                             ; vbroadcastss  0x40(%rdx),%ymm8
+  DB  197,60,89,200                                   ; vmulps        %ymm0,%ymm8,%ymm9
+  DB  197,124,89,208                                  ; vmulps        %ymm0,%ymm0,%ymm10
+  DB  196,98,125,24,90,60                             ; vbroadcastss  0x3c(%rdx),%ymm11
+  DB  196,98,125,24,98,56                             ; vbroadcastss  0x38(%rdx),%ymm12
+  DB  197,36,89,232                                   ; vmulps        %ymm0,%ymm11,%ymm13
+  DB  196,65,20,88,236                                ; vaddps        %ymm12,%ymm13,%ymm13
+  DB  196,98,125,24,114,52                            ; vbroadcastss  0x34(%rdx),%ymm14
+  DB  196,65,44,89,213                                ; vmulps        %ymm13,%ymm10,%ymm10
+  DB  196,65,12,88,210                                ; vaddps        %ymm10,%ymm14,%ymm10
+  DB  196,98,125,24,106,68                            ; vbroadcastss  0x44(%rdx),%ymm13
+  DB  196,193,124,194,197,1                           ; vcmpltps      %ymm13,%ymm0,%ymm0
+  DB  196,195,45,74,193,0                             ; vblendvps     %ymm0,%ymm9,%ymm10,%ymm0
+  DB  197,60,89,201                                   ; vmulps        %ymm1,%ymm8,%ymm9
+  DB  197,116,89,209                                  ; vmulps        %ymm1,%ymm1,%ymm10
+  DB  197,36,89,249                                   ; vmulps        %ymm1,%ymm11,%ymm15
+  DB  196,65,4,88,252                                 ; vaddps        %ymm12,%ymm15,%ymm15
+  DB  196,65,44,89,215                                ; vmulps        %ymm15,%ymm10,%ymm10
+  DB  196,65,12,88,210                                ; vaddps        %ymm10,%ymm14,%ymm10
+  DB  196,193,116,194,205,1                           ; vcmpltps      %ymm13,%ymm1,%ymm1
+  DB  196,195,45,74,201,16                            ; vblendvps     %ymm1,%ymm9,%ymm10,%ymm1
+  DB  197,60,89,194                                   ; vmulps        %ymm2,%ymm8,%ymm8
+  DB  197,108,89,202                                  ; vmulps        %ymm2,%ymm2,%ymm9
+  DB  197,36,89,210                                   ; vmulps        %ymm2,%ymm11,%ymm10
+  DB  196,65,44,88,212                                ; vaddps        %ymm12,%ymm10,%ymm10
+  DB  196,65,52,89,202                                ; vmulps        %ymm10,%ymm9,%ymm9
+  DB  196,65,12,88,201                                ; vaddps        %ymm9,%ymm14,%ymm9
+  DB  196,193,108,194,213,1                           ; vcmpltps      %ymm13,%ymm2,%ymm2
+  DB  196,195,53,74,208,32                            ; vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_to_srgb_avx
+_sk_to_srgb_avx LABEL PROC
+  DB  197,124,82,192                                  ; vrsqrtps      %ymm0,%ymm8
+  DB  196,65,124,83,200                               ; vrcpps        %ymm8,%ymm9
+  DB  196,65,124,82,208                               ; vrsqrtps      %ymm8,%ymm10
+  DB  196,98,125,24,66,72                             ; vbroadcastss  0x48(%rdx),%ymm8
+  DB  197,60,89,216                                   ; vmulps        %ymm0,%ymm8,%ymm11
+  DB  196,98,125,24,34                                ; vbroadcastss  (%rdx),%ymm12
+  DB  196,98,125,24,106,76                            ; vbroadcastss  0x4c(%rdx),%ymm13
+  DB  196,98,125,24,114,80                            ; vbroadcastss  0x50(%rdx),%ymm14
+  DB  196,98,125,24,122,84                            ; vbroadcastss  0x54(%rdx),%ymm15
+  DB  196,65,52,89,206                                ; vmulps        %ymm14,%ymm9,%ymm9
+  DB  196,65,52,88,207                                ; vaddps        %ymm15,%ymm9,%ymm9
+  DB  196,65,44,89,213                                ; vmulps        %ymm13,%ymm10,%ymm10
+  DB  196,65,44,88,201                                ; vaddps        %ymm9,%ymm10,%ymm9
+  DB  196,65,28,93,201                                ; vminps        %ymm9,%ymm12,%ymm9
+  DB  196,98,125,24,82,88                             ; vbroadcastss  0x58(%rdx),%ymm10
+  DB  196,193,124,194,194,1                           ; vcmpltps      %ymm10,%ymm0,%ymm0
+  DB  196,195,53,74,195,0                             ; vblendvps     %ymm0,%ymm11,%ymm9,%ymm0
+  DB  197,124,82,201                                  ; vrsqrtps      %ymm1,%ymm9
+  DB  196,65,124,83,217                               ; vrcpps        %ymm9,%ymm11
+  DB  196,65,124,82,201                               ; vrsqrtps      %ymm9,%ymm9
+  DB  196,65,12,89,219                                ; vmulps        %ymm11,%ymm14,%ymm11
+  DB  196,65,4,88,219                                 ; vaddps        %ymm11,%ymm15,%ymm11
+  DB  196,65,20,89,201                                ; vmulps        %ymm9,%ymm13,%ymm9
+  DB  196,65,52,88,203                                ; vaddps        %ymm11,%ymm9,%ymm9
+  DB  197,60,89,217                                   ; vmulps        %ymm1,%ymm8,%ymm11
+  DB  196,65,28,93,201                                ; vminps        %ymm9,%ymm12,%ymm9
+  DB  196,193,116,194,202,1                           ; vcmpltps      %ymm10,%ymm1,%ymm1
+  DB  196,195,53,74,203,16                            ; vblendvps     %ymm1,%ymm11,%ymm9,%ymm1
+  DB  197,124,82,202                                  ; vrsqrtps      %ymm2,%ymm9
+  DB  196,65,124,83,217                               ; vrcpps        %ymm9,%ymm11
+  DB  196,65,12,89,219                                ; vmulps        %ymm11,%ymm14,%ymm11
+  DB  196,65,4,88,219                                 ; vaddps        %ymm11,%ymm15,%ymm11
+  DB  196,65,124,82,201                               ; vrsqrtps      %ymm9,%ymm9
+  DB  196,65,20,89,201                                ; vmulps        %ymm9,%ymm13,%ymm9
+  DB  196,65,52,88,203                                ; vaddps        %ymm11,%ymm9,%ymm9
+  DB  196,65,28,93,201                                ; vminps        %ymm9,%ymm12,%ymm9
+  DB  197,60,89,194                                   ; vmulps        %ymm2,%ymm8,%ymm8
+  DB  196,193,108,194,210,1                           ; vcmpltps      %ymm10,%ymm2,%ymm2
+  DB  196,195,53,74,208,32                            ; vblendvps     %ymm2,%ymm8,%ymm9,%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_scale_u8_avx
+_sk_scale_u8_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                                        ; mov           (%rax),%rax
+  DB  196,98,121,49,68,56,4                           ; vpmovzxbd     0x4(%rax,%rdi,1),%xmm8
+  DB  196,98,121,49,12,56                             ; vpmovzxbd     (%rax,%rdi,1),%xmm9
+  DB  196,67,53,24,192,1                              ; vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  DB  196,65,124,91,192                               ; vcvtdq2ps     %ymm8,%ymm8
+  DB  196,98,125,24,74,12                             ; vbroadcastss  0xc(%rdx),%ymm9
+  DB  196,65,60,89,193                                ; vmulps        %ymm9,%ymm8,%ymm8
+  DB  197,188,89,192                                  ; vmulps        %ymm0,%ymm8,%ymm0
+  DB  197,188,89,201                                  ; vmulps        %ymm1,%ymm8,%ymm1
+  DB  197,188,89,210                                  ; vmulps        %ymm2,%ymm8,%ymm2
+  DB  197,188,89,219                                  ; vmulps        %ymm3,%ymm8,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_lerp_u8_avx
+_sk_lerp_u8_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                                        ; mov           (%rax),%rax
+  DB  196,98,121,49,68,56,4                           ; vpmovzxbd     0x4(%rax,%rdi,1),%xmm8
+  DB  196,98,121,49,12,56                             ; vpmovzxbd     (%rax,%rdi,1),%xmm9
+  DB  196,67,53,24,192,1                              ; vinsertf128   $0x1,%xmm8,%ymm9,%ymm8
+  DB  196,65,124,91,192                               ; vcvtdq2ps     %ymm8,%ymm8
+  DB  196,98,125,24,74,12                             ; vbroadcastss  0xc(%rdx),%ymm9
+  DB  196,65,60,89,193                                ; vmulps        %ymm9,%ymm8,%ymm8
+  DB  197,252,92,196                                  ; vsubps        %ymm4,%ymm0,%ymm0
+  DB  196,193,124,89,192                              ; vmulps        %ymm8,%ymm0,%ymm0
+  DB  197,252,88,196                                  ; vaddps        %ymm4,%ymm0,%ymm0
+  DB  197,244,92,205                                  ; vsubps        %ymm5,%ymm1,%ymm1
+  DB  196,193,116,89,200                              ; vmulps        %ymm8,%ymm1,%ymm1
+  DB  197,244,88,205                                  ; vaddps        %ymm5,%ymm1,%ymm1
+  DB  197,236,92,214                                  ; vsubps        %ymm6,%ymm2,%ymm2
+  DB  196,193,108,89,208                              ; vmulps        %ymm8,%ymm2,%ymm2
+  DB  197,236,88,214                                  ; vaddps        %ymm6,%ymm2,%ymm2
+  DB  197,228,92,223                                  ; vsubps        %ymm7,%ymm3,%ymm3
+  DB  196,193,100,89,216                              ; vmulps        %ymm8,%ymm3,%ymm3
+  DB  197,228,88,223                                  ; vaddps        %ymm7,%ymm3,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_load_tables_avx
+_sk_load_tables_avx LABEL PROC
+  DB  65,87                                           ; push          %r15
+  DB  65,86                                           ; push          %r14
+  DB  65,84                                           ; push          %r12
+  DB  83                                              ; push          %rbx
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  76,139,0                                        ; mov           (%rax),%r8
+  DB  72,139,72,8                                     ; mov           0x8(%rax),%rcx
+  DB  196,65,124,16,20,184                            ; vmovups       (%r8,%rdi,4),%ymm10
+  DB  197,249,110,66,16                               ; vmovd         0x10(%rdx),%xmm0
+  DB  196,227,121,4,192,0                             ; vpermilps     $0x0,%xmm0,%xmm0
+  DB  196,99,125,24,200,1                             ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm9
+  DB  196,193,52,84,194                               ; vandps        %ymm10,%ymm9,%ymm0
+  DB  196,193,249,126,192                             ; vmovq         %xmm0,%r8
+  DB  69,137,193                                      ; mov           %r8d,%r9d
+  DB  196,195,249,22,194,1                            ; vpextrq       $0x1,%xmm0,%r10
+  DB  69,137,211                                      ; mov           %r10d,%r11d
+  DB  73,193,234,32                                   ; shr           $0x20,%r10
+  DB  73,193,232,32                                   ; shr           $0x20,%r8
+  DB  196,227,125,25,192,1                            ; vextractf128  $0x1,%ymm0,%xmm0
+  DB  196,193,249,126,199                             ; vmovq         %xmm0,%r15
+  DB  69,137,254                                      ; mov           %r15d,%r14d
+  DB  196,227,249,22,195,1                            ; vpextrq       $0x1,%xmm0,%rbx
+  DB  65,137,220                                      ; mov           %ebx,%r12d
+  DB  72,193,235,32                                   ; shr           $0x20,%rbx
+  DB  73,193,239,32                                   ; shr           $0x20,%r15
+  DB  196,161,122,16,4,177                            ; vmovss        (%rcx,%r14,4),%xmm0
+  DB  196,163,121,33,4,185,16                         ; vinsertps     $0x10,(%rcx,%r15,4),%xmm0,%xmm0
+  DB  196,163,121,33,4,161,32                         ; vinsertps     $0x20,(%rcx,%r12,4),%xmm0,%xmm0
+  DB  196,227,121,33,4,153,48                         ; vinsertps     $0x30,(%rcx,%rbx,4),%xmm0,%xmm0
+  DB  196,161,122,16,12,137                           ; vmovss        (%rcx,%r9,4),%xmm1
+  DB  196,163,113,33,12,129,16                        ; vinsertps     $0x10,(%rcx,%r8,4),%xmm1,%xmm1
+  DB  196,163,113,33,12,153,32                        ; vinsertps     $0x20,(%rcx,%r11,4),%xmm1,%xmm1
+  DB  196,163,113,33,12,145,48                        ; vinsertps     $0x30,(%rcx,%r10,4),%xmm1,%xmm1
+  DB  196,227,117,24,192,1                            ; vinsertf128   $0x1,%xmm0,%ymm1,%ymm0
+  DB  76,139,120,16                                   ; mov           0x10(%rax),%r15
+  DB  196,193,113,114,210,8                           ; vpsrld        $0x8,%xmm10,%xmm1
+  DB  196,67,125,25,208,1                             ; vextractf128  $0x1,%ymm10,%xmm8
+  DB  196,193,105,114,208,8                           ; vpsrld        $0x8,%xmm8,%xmm2
+  DB  196,227,117,24,202,1                            ; vinsertf128   $0x1,%xmm2,%ymm1,%ymm1
+  DB  197,180,84,201                                  ; vandps        %ymm1,%ymm9,%ymm1
+  DB  196,193,249,126,200                             ; vmovq         %xmm1,%r8
+  DB  69,137,194                                      ; mov           %r8d,%r10d
+  DB  196,195,249,22,201,1                            ; vpextrq       $0x1,%xmm1,%r9
+  DB  69,137,203                                      ; mov           %r9d,%r11d
+  DB  73,193,233,32                                   ; shr           $0x20,%r9
+  DB  73,193,232,32                                   ; shr           $0x20,%r8
+  DB  196,227,125,25,201,1                            ; vextractf128  $0x1,%ymm1,%xmm1
+  DB  196,225,249,126,203                             ; vmovq         %xmm1,%rbx
+  DB  65,137,222                                      ; mov           %ebx,%r14d
+  DB  196,227,249,22,201,1                            ; vpextrq       $0x1,%xmm1,%rcx
+  DB  65,137,204                                      ; mov           %ecx,%r12d
+  DB  72,193,233,32                                   ; shr           $0x20,%rcx
+  DB  72,193,235,32                                   ; shr           $0x20,%rbx
+  DB  196,129,122,16,12,183                           ; vmovss        (%r15,%r14,4),%xmm1
+  DB  196,195,113,33,12,159,16                        ; vinsertps     $0x10,(%r15,%rbx,4),%xmm1,%xmm1
+  DB  196,129,122,16,20,167                           ; vmovss        (%r15,%r12,4),%xmm2
+  DB  196,227,113,33,202,32                           ; vinsertps     $0x20,%xmm2,%xmm1,%xmm1
+  DB  196,193,122,16,20,143                           ; vmovss        (%r15,%rcx,4),%xmm2
+  DB  196,227,113,33,202,48                           ; vinsertps     $0x30,%xmm2,%xmm1,%xmm1
+  DB  196,129,122,16,20,151                           ; vmovss        (%r15,%r10,4),%xmm2
+  DB  196,131,105,33,20,135,16                        ; vinsertps     $0x10,(%r15,%r8,4),%xmm2,%xmm2
+  DB  196,129,122,16,28,159                           ; vmovss        (%r15,%r11,4),%xmm3
+  DB  196,227,105,33,211,32                           ; vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  DB  196,129,122,16,28,143                           ; vmovss        (%r15,%r9,4),%xmm3
+  DB  196,227,105,33,211,48                           ; vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  DB  196,227,109,24,201,1                            ; vinsertf128   $0x1,%xmm1,%ymm2,%ymm1
+  DB  72,139,64,24                                    ; mov           0x18(%rax),%rax
+  DB  196,193,105,114,210,16                          ; vpsrld        $0x10,%xmm10,%xmm2
+  DB  196,193,97,114,208,16                           ; vpsrld        $0x10,%xmm8,%xmm3
+  DB  196,227,109,24,211,1                            ; vinsertf128   $0x1,%xmm3,%ymm2,%ymm2
+  DB  197,180,84,210                                  ; vandps        %ymm2,%ymm9,%ymm2
+  DB  196,193,249,126,208                             ; vmovq         %xmm2,%r8
+  DB  69,137,193                                      ; mov           %r8d,%r9d
+  DB  196,195,249,22,214,1                            ; vpextrq       $0x1,%xmm2,%r14
+  DB  69,137,242                                      ; mov           %r14d,%r10d
+  DB  73,193,238,32                                   ; shr           $0x20,%r14
+  DB  73,193,232,32                                   ; shr           $0x20,%r8
+  DB  196,227,125,25,210,1                            ; vextractf128  $0x1,%ymm2,%xmm2
+  DB  196,225,249,126,211                             ; vmovq         %xmm2,%rbx
+  DB  65,137,219                                      ; mov           %ebx,%r11d
+  DB  196,227,249,22,209,1                            ; vpextrq       $0x1,%xmm2,%rcx
+  DB  65,137,207                                      ; mov           %ecx,%r15d
+  DB  72,193,233,32                                   ; shr           $0x20,%rcx
+  DB  72,193,235,32                                   ; shr           $0x20,%rbx
+  DB  196,161,122,16,20,152                           ; vmovss        (%rax,%r11,4),%xmm2
+  DB  196,227,105,33,20,152,16                        ; vinsertps     $0x10,(%rax,%rbx,4),%xmm2,%xmm2
+  DB  196,161,122,16,28,184                           ; vmovss        (%rax,%r15,4),%xmm3
+  DB  196,227,105,33,211,32                           ; vinsertps     $0x20,%xmm3,%xmm2,%xmm2
+  DB  197,250,16,28,136                               ; vmovss        (%rax,%rcx,4),%xmm3
+  DB  196,99,105,33,203,48                            ; vinsertps     $0x30,%xmm3,%xmm2,%xmm9
+  DB  196,161,122,16,28,136                           ; vmovss        (%rax,%r9,4),%xmm3
+  DB  196,163,97,33,28,128,16                         ; vinsertps     $0x10,(%rax,%r8,4),%xmm3,%xmm3
+  DB  196,161,122,16,20,144                           ; vmovss        (%rax,%r10,4),%xmm2
+  DB  196,227,97,33,210,32                            ; vinsertps     $0x20,%xmm2,%xmm3,%xmm2
+  DB  196,161,122,16,28,176                           ; vmovss        (%rax,%r14,4),%xmm3
+  DB  196,227,105,33,211,48                           ; vinsertps     $0x30,%xmm3,%xmm2,%xmm2
+  DB  196,195,109,24,209,1                            ; vinsertf128   $0x1,%xmm9,%ymm2,%ymm2
+  DB  196,193,49,114,210,24                           ; vpsrld        $0x18,%xmm10,%xmm9
+  DB  196,193,97,114,208,24                           ; vpsrld        $0x18,%xmm8,%xmm3
+  DB  196,227,53,24,219,1                             ; vinsertf128   $0x1,%xmm3,%ymm9,%ymm3
+  DB  197,252,91,219                                  ; vcvtdq2ps     %ymm3,%ymm3
+  DB  196,98,125,24,66,12                             ; vbroadcastss  0xc(%rdx),%ymm8
+  DB  196,193,100,89,216                              ; vmulps        %ymm8,%ymm3,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  91                                              ; pop           %rbx
+  DB  65,92                                           ; pop           %r12
+  DB  65,94                                           ; pop           %r14
+  DB  65,95                                           ; pop           %r15
+  DB  255,224                                         ; jmpq          *%rax
+
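Most of _sk_load_tables_avx above is a gather done by hand: AVX has no vgatherdps (that arrives with AVX2), so each of the eight table indices is moved out of the vector unit (vmovq, vpextrq), used for a scalar vmovss load, and the loaded floats are stitched back together with vinsertps and vinsertf128. The same pattern in intrinsics, as a minimal illustrative sketch (the listing above is what clang actually generates from the C gather() helper shown later in this change):

    #include <immintrin.h>

    // Gather 8 floats without AVX2: scalar loads knit back into one ymm.
    static __m256 gather8(const float* p, const int ix[8]) {
        __m128 lo = _mm_load_ss(p + ix[0]);
        lo = _mm_insert_ps(lo, _mm_load_ss(p + ix[1]), 0x10);
        lo = _mm_insert_ps(lo, _mm_load_ss(p + ix[2]), 0x20);
        lo = _mm_insert_ps(lo, _mm_load_ss(p + ix[3]), 0x30);
        __m128 hi = _mm_load_ss(p + ix[4]);
        hi = _mm_insert_ps(hi, _mm_load_ss(p + ix[5]), 0x10);
        hi = _mm_insert_ps(hi, _mm_load_ss(p + ix[6]), 0x20);
        hi = _mm_insert_ps(hi, _mm_load_ss(p + ix[7]), 0x30);
        return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
    }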
+PUBLIC _sk_load_8888_avx
+_sk_load_8888_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                                        ; mov           (%rax),%rax
+  DB  197,252,16,28,184                               ; vmovups       (%rax,%rdi,4),%ymm3
+  DB  197,249,110,66,16                               ; vmovd         0x10(%rdx),%xmm0
+  DB  196,227,121,4,192,0                             ; vpermilps     $0x0,%xmm0,%xmm0
+  DB  196,99,125,24,216,1                             ; vinsertf128   $0x1,%xmm0,%ymm0,%ymm11
+  DB  197,164,84,195                                  ; vandps        %ymm3,%ymm11,%ymm0
+  DB  197,252,91,192                                  ; vcvtdq2ps     %ymm0,%ymm0
+  DB  196,98,125,24,66,12                             ; vbroadcastss  0xc(%rdx),%ymm8
+  DB  197,188,89,192                                  ; vmulps        %ymm0,%ymm8,%ymm0
+  DB  197,169,114,211,8                               ; vpsrld        $0x8,%xmm3,%xmm10
+  DB  196,195,125,25,217,1                            ; vextractf128  $0x1,%ymm3,%xmm9
+  DB  196,193,113,114,209,8                           ; vpsrld        $0x8,%xmm9,%xmm1
+  DB  196,227,45,24,201,1                             ; vinsertf128   $0x1,%xmm1,%ymm10,%ymm1
+  DB  197,164,84,201                                  ; vandps        %ymm1,%ymm11,%ymm1
+  DB  197,252,91,201                                  ; vcvtdq2ps     %ymm1,%ymm1
+  DB  197,188,89,201                                  ; vmulps        %ymm1,%ymm8,%ymm1
+  DB  197,169,114,211,16                              ; vpsrld        $0x10,%xmm3,%xmm10
+  DB  196,193,105,114,209,16                          ; vpsrld        $0x10,%xmm9,%xmm2
+  DB  196,227,45,24,210,1                             ; vinsertf128   $0x1,%xmm2,%ymm10,%ymm2
+  DB  197,164,84,210                                  ; vandps        %ymm2,%ymm11,%ymm2
+  DB  197,252,91,210                                  ; vcvtdq2ps     %ymm2,%ymm2
+  DB  197,188,89,210                                  ; vmulps        %ymm2,%ymm8,%ymm2
+  DB  197,169,114,211,24                              ; vpsrld        $0x18,%xmm3,%xmm10
+  DB  196,193,97,114,209,24                           ; vpsrld        $0x18,%xmm9,%xmm3
+  DB  196,227,45,24,219,1                             ; vinsertf128   $0x1,%xmm3,%ymm10,%ymm3
+  DB  197,252,91,219                                  ; vcvtdq2ps     %ymm3,%ymm3
+  DB  196,193,100,89,216                              ; vmulps        %ymm8,%ymm3,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_store_8888_avx
+_sk_store_8888_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,139,0                                        ; mov           (%rax),%rax
+  DB  196,98,125,24,66,8                              ; vbroadcastss  0x8(%rdx),%ymm8
+  DB  197,60,89,200                                   ; vmulps        %ymm0,%ymm8,%ymm9
+  DB  196,65,125,91,201                               ; vcvtps2dq     %ymm9,%ymm9
+  DB  197,60,89,209                                   ; vmulps        %ymm1,%ymm8,%ymm10
+  DB  196,65,125,91,210                               ; vcvtps2dq     %ymm10,%ymm10
+  DB  196,193,33,114,242,8                            ; vpslld        $0x8,%xmm10,%xmm11
+  DB  196,67,125,25,210,1                             ; vextractf128  $0x1,%ymm10,%xmm10
+  DB  196,193,41,114,242,8                            ; vpslld        $0x8,%xmm10,%xmm10
+  DB  196,67,37,24,210,1                              ; vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  DB  196,65,45,86,201                                ; vorpd         %ymm9,%ymm10,%ymm9
+  DB  197,60,89,210                                   ; vmulps        %ymm2,%ymm8,%ymm10
+  DB  196,65,125,91,210                               ; vcvtps2dq     %ymm10,%ymm10
+  DB  196,193,33,114,242,16                           ; vpslld        $0x10,%xmm10,%xmm11
+  DB  196,67,125,25,210,1                             ; vextractf128  $0x1,%ymm10,%xmm10
+  DB  196,193,41,114,242,16                           ; vpslld        $0x10,%xmm10,%xmm10
+  DB  196,67,37,24,210,1                              ; vinsertf128   $0x1,%xmm10,%ymm11,%ymm10
+  DB  196,65,53,86,202                                ; vorpd         %ymm10,%ymm9,%ymm9
+  DB  197,60,89,195                                   ; vmulps        %ymm3,%ymm8,%ymm8
+  DB  196,65,125,91,192                               ; vcvtps2dq     %ymm8,%ymm8
+  DB  196,193,41,114,240,24                           ; vpslld        $0x18,%xmm8,%xmm10
+  DB  196,67,125,25,192,1                             ; vextractf128  $0x1,%ymm8,%xmm8
+  DB  196,193,57,114,240,24                           ; vpslld        $0x18,%xmm8,%xmm8
+  DB  196,67,45,24,192,1                              ; vinsertf128   $0x1,%xmm8,%ymm10,%ymm8
+  DB  196,65,53,86,192                                ; vorpd         %ymm8,%ymm9,%ymm8
+  DB  197,125,17,4,184                                ; vmovupd       %ymm8,(%rax,%rdi,4)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_load_f16_avx
+_sk_load_f16_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_store_f16_avx
+_sk_store_f16_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_clamp_x_avx
+_sk_clamp_x_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,0                                 ; vbroadcastss  (%rax),%ymm8
+  DB  196,67,125,25,193,1                             ; vextractf128  $0x1,%ymm8,%xmm9
+  DB  196,65,41,118,210                               ; vpcmpeqd      %xmm10,%xmm10,%xmm10
+  DB  196,65,49,254,202                               ; vpaddd        %xmm10,%xmm9,%xmm9
+  DB  196,65,57,254,194                               ; vpaddd        %xmm10,%xmm8,%xmm8
+  DB  196,67,61,24,193,1                              ; vinsertf128   $0x1,%xmm9,%ymm8,%ymm8
+  DB  196,193,124,93,192                              ; vminps        %ymm8,%ymm0,%ymm0
+  DB  196,65,60,87,192                                ; vxorps        %ymm8,%ymm8,%ymm8
+  DB  197,188,95,192                                  ; vmaxps        %ymm0,%ymm8,%ymm0
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_clamp_y_avx
+_sk_clamp_y_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,0                                 ; vbroadcastss  (%rax),%ymm8
+  DB  196,67,125,25,193,1                             ; vextractf128  $0x1,%ymm8,%xmm9
+  DB  196,65,41,118,210                               ; vpcmpeqd      %xmm10,%xmm10,%xmm10
+  DB  196,65,49,254,202                               ; vpaddd        %xmm10,%xmm9,%xmm9
+  DB  196,65,57,254,194                               ; vpaddd        %xmm10,%xmm8,%xmm8
+  DB  196,67,61,24,193,1                              ; vinsertf128   $0x1,%xmm9,%ymm8,%ymm8
+  DB  196,193,116,93,200                              ; vminps        %ymm8,%ymm1,%ymm1
+  DB  196,65,60,87,192                                ; vxorps        %ymm8,%ymm8,%ymm8
+  DB  197,188,95,201                                  ; vmaxps        %ymm1,%ymm8,%ymm1
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_matrix_2x3_avx
+_sk_matrix_2x3_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,0                                 ; vbroadcastss  (%rax),%ymm8
+  DB  196,98,125,24,72,8                              ; vbroadcastss  0x8(%rax),%ymm9
+  DB  196,98,125,24,80,16                             ; vbroadcastss  0x10(%rax),%ymm10
+  DB  197,52,89,201                                   ; vmulps        %ymm1,%ymm9,%ymm9
+  DB  196,65,52,88,202                                ; vaddps        %ymm10,%ymm9,%ymm9
+  DB  197,60,89,192                                   ; vmulps        %ymm0,%ymm8,%ymm8
+  DB  196,65,60,88,193                                ; vaddps        %ymm9,%ymm8,%ymm8
+  DB  196,98,125,24,72,4                              ; vbroadcastss  0x4(%rax),%ymm9
+  DB  196,98,125,24,80,12                             ; vbroadcastss  0xc(%rax),%ymm10
+  DB  196,98,125,24,88,20                             ; vbroadcastss  0x14(%rax),%ymm11
+  DB  197,172,89,201                                  ; vmulps        %ymm1,%ymm10,%ymm1
+  DB  196,193,116,88,203                              ; vaddps        %ymm11,%ymm1,%ymm1
+  DB  197,180,89,192                                  ; vmulps        %ymm0,%ymm9,%ymm0
+  DB  197,252,88,201                                  ; vaddps        %ymm1,%ymm0,%ymm1
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,124,41,192                                  ; vmovaps       %ymm8,%ymm0
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_matrix_3x4_avx
+_sk_matrix_3x4_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,98,125,24,0                                 ; vbroadcastss  (%rax),%ymm8
+  DB  196,98,125,24,72,12                             ; vbroadcastss  0xc(%rax),%ymm9
+  DB  196,98,125,24,80,24                             ; vbroadcastss  0x18(%rax),%ymm10
+  DB  196,98,125,24,88,36                             ; vbroadcastss  0x24(%rax),%ymm11
+  DB  197,44,89,210                                   ; vmulps        %ymm2,%ymm10,%ymm10
+  DB  196,65,44,88,211                                ; vaddps        %ymm11,%ymm10,%ymm10
+  DB  197,52,89,201                                   ; vmulps        %ymm1,%ymm9,%ymm9
+  DB  196,65,52,88,202                                ; vaddps        %ymm10,%ymm9,%ymm9
+  DB  197,60,89,192                                   ; vmulps        %ymm0,%ymm8,%ymm8
+  DB  196,65,60,88,193                                ; vaddps        %ymm9,%ymm8,%ymm8
+  DB  196,98,125,24,72,4                              ; vbroadcastss  0x4(%rax),%ymm9
+  DB  196,98,125,24,80,16                             ; vbroadcastss  0x10(%rax),%ymm10
+  DB  196,98,125,24,88,28                             ; vbroadcastss  0x1c(%rax),%ymm11
+  DB  196,98,125,24,96,40                             ; vbroadcastss  0x28(%rax),%ymm12
+  DB  197,36,89,218                                   ; vmulps        %ymm2,%ymm11,%ymm11
+  DB  196,65,36,88,220                                ; vaddps        %ymm12,%ymm11,%ymm11
+  DB  197,44,89,209                                   ; vmulps        %ymm1,%ymm10,%ymm10
+  DB  196,65,44,88,211                                ; vaddps        %ymm11,%ymm10,%ymm10
+  DB  197,52,89,200                                   ; vmulps        %ymm0,%ymm9,%ymm9
+  DB  196,65,52,88,202                                ; vaddps        %ymm10,%ymm9,%ymm9
+  DB  196,98,125,24,80,8                              ; vbroadcastss  0x8(%rax),%ymm10
+  DB  196,98,125,24,88,20                             ; vbroadcastss  0x14(%rax),%ymm11
+  DB  196,98,125,24,96,32                             ; vbroadcastss  0x20(%rax),%ymm12
+  DB  196,98,125,24,104,44                            ; vbroadcastss  0x2c(%rax),%ymm13
+  DB  197,156,89,210                                  ; vmulps        %ymm2,%ymm12,%ymm2
+  DB  196,193,108,88,213                              ; vaddps        %ymm13,%ymm2,%ymm2
+  DB  197,164,89,201                                  ; vmulps        %ymm1,%ymm11,%ymm1
+  DB  197,244,88,202                                  ; vaddps        %ymm2,%ymm1,%ymm1
+  DB  197,172,89,192                                  ; vmulps        %ymm0,%ymm10,%ymm0
+  DB  197,252,88,209                                  ; vaddps        %ymm1,%ymm0,%ymm2
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,124,41,192                                  ; vmovaps       %ymm8,%ymm0
+  DB  197,124,41,201                                  ; vmovaps       %ymm9,%ymm1
+  DB  255,224                                         ; jmpq          *%rax
+
+PUBLIC _sk_linear_gradient_2stops_avx
+_sk_linear_gradient_2stops_avx LABEL PROC
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  196,226,125,24,72,16                            ; vbroadcastss  0x10(%rax),%ymm1
+  DB  196,226,125,24,16                               ; vbroadcastss  (%rax),%ymm2
+  DB  197,244,89,200                                  ; vmulps        %ymm0,%ymm1,%ymm1
+  DB  197,108,88,193                                  ; vaddps        %ymm1,%ymm2,%ymm8
+  DB  196,226,125,24,72,20                            ; vbroadcastss  0x14(%rax),%ymm1
+  DB  196,226,125,24,80,4                             ; vbroadcastss  0x4(%rax),%ymm2
+  DB  197,244,89,200                                  ; vmulps        %ymm0,%ymm1,%ymm1
+  DB  197,236,88,201                                  ; vaddps        %ymm1,%ymm2,%ymm1
+  DB  196,226,125,24,80,24                            ; vbroadcastss  0x18(%rax),%ymm2
+  DB  196,226,125,24,88,8                             ; vbroadcastss  0x8(%rax),%ymm3
+  DB  197,236,89,208                                  ; vmulps        %ymm0,%ymm2,%ymm2
+  DB  197,228,88,210                                  ; vaddps        %ymm2,%ymm3,%ymm2
+  DB  196,226,125,24,88,28                            ; vbroadcastss  0x1c(%rax),%ymm3
+  DB  196,98,125,24,72,12                             ; vbroadcastss  0xc(%rax),%ymm9
+  DB  197,228,89,192                                  ; vmulps        %ymm0,%ymm3,%ymm0
+  DB  197,180,88,216                                  ; vaddps        %ymm0,%ymm9,%ymm3
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax
+  DB  197,124,41,192                                  ; vmovaps       %ymm8,%ymm0
+  DB  255,224                                         ; jmpq          *%rax
+
 PUBLIC _sk_start_pipeline_sse41
 _sk_start_pipeline_sse41 LABEL PROC
   DB  65,87                                           ; push          %r15
index a691f2b..21e3c35 100644 (file)
@@ -104,6 +104,30 @@ using K = const SkJumper_constants;
 
     #define WRAP(name) sk_##name##_hsw
 
+#elif defined(__AVX__)
+    #include <immintrin.h>
+
+    using F   = float    __attribute__((ext_vector_type(8)));
+    using I32 =  int32_t __attribute__((ext_vector_type(8)));
+    using U32 = uint32_t __attribute__((ext_vector_type(8)));
+    using U8  = uint8_t  __attribute__((ext_vector_type(8)));
+
+    static F   mad(F f, F m, F a)  { return f*m+a;              }
+    static F   min(F a, F b)       { return _mm256_min_ps(a,b); }
+    static F   max(F a, F b)       { return _mm256_max_ps(a,b); }
+    static F   rcp  (F v)          { return _mm256_rcp_ps  (v); }
+    static F   rsqrt(F v)          { return _mm256_rsqrt_ps(v); }
+    static U32 round(F v, F scale) { return _mm256_cvtps_epi32(v*scale); }
+
+    static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+
+    static F gather(const float* p, U32 ix) {
+        return { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
+                 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
+    }
+
+    #define WRAP(name) sk_##name##_avx
+
 #elif defined(__SSE2__)
     #include <immintrin.h>
 
@@ -499,6 +523,9 @@ STAGE(load_f16) {
     g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
     b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
     a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+#elif defined(__AVX__)
+    // TODO
+
 #elif defined(__SSE2__)
     auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
          _23 = _mm_loadu_si128(((__m128i*)ptr) + 1);
@@ -568,6 +595,8 @@ STAGE(store_f16) {
     _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
     _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
     _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+#elif defined(__AVX__)
+    // TODO
 #elif defined(__SSE2__)
     auto float_to_half = [&](F f) {
         return bit_cast<U32>(f * bit_cast<F>(U32(k->_0x07800000)))  // Fix up the exponent,
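These two AVX TODOs are why the backend ships disabled. F16C's vcvtph2ps/vcvtps2ph are a separate CPU feature not implied by AVX (the HSW build turns it on with -mf16c below), so the AVX versions will need to convert manually. One plausible shape for the load side, widening each 128-bit half with the same shift-and-rescale trick the SSE2 path uses (a sketch only, not this commit's eventual code; like the SSE2 path it ignores sign and denorms, and it shows only the widening, not the rg/ba deinterleave):

    #include <immintrin.h>
    #include <stdint.h>

    // Widen 8 half floats to 8 floats without F16C.
    static __m256 eight_halves_to_floats(const uint16_t* ptr) {
        __m128i h  = _mm_loadu_si128((const __m128i*)ptr);          // 8 x f16
        __m128i z  = _mm_setzero_si128();
        __m128i lo = _mm_slli_epi32(_mm_unpacklo_epi16(h, z), 13);  // line up the mantissa
        __m128i hi = _mm_slli_epi32(_mm_unpackhi_epi16(h, z), 13);
        __m128  k  = _mm_castsi128_ps(_mm_set1_epi32(0x77800000));  // 2^112, rebias the exponent
        __m128  flo = _mm_mul_ps(_mm_castsi128_ps(lo), k);
        __m128  fhi = _mm_mul_ps(_mm_castsi128_ps(hi), k);
        return _mm256_insertf128_ps(_mm256_castps128_ps256(flo), fhi, 1);
    }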
index b6ab3c0..945f776 100755 (executable)
@@ -33,6 +33,14 @@ subprocess.check_call(['clang++'] + cflags + sse41 + ['-DWIN'] +
                       ['-c', 'src/jumper/SkJumper_stages.cpp'] +
                       ['-o', 'win_sse41.o'])
 
+avx = '-mno-red-zone -mavx'.split()
+subprocess.check_call(['clang++'] + cflags + avx +
+                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+                      ['-o', 'avx.o'])
+subprocess.check_call(['clang++'] + cflags + avx + ['-DWIN'] +
+                      ['-c', 'src/jumper/SkJumper_stages.cpp'] +
+                      ['-o', 'win_avx.o'])
+
 hsw = '-mno-red-zone -mavx2 -mfma -mf16c'.split()
 subprocess.check_call(['clang++'] + cflags + hsw +
                       ['-c', 'src/jumper/SkJumper_stages.cpp'] +
@@ -125,6 +133,7 @@ parse_object_file('vfp4.o', '.long', target='elf32-littlearm')
 
 print '#elif defined(__x86_64__)'
 parse_object_file('hsw.o',   '.byte')
+parse_object_file('avx.o',   '.byte')
 parse_object_file('sse41.o', '.byte')
 parse_object_file('sse2.o',  '.byte')
 print '#endif'
@@ -141,6 +150,7 @@ print '''; Copyright 2017 Google Inc.
 '''
 print '_text SEGMENT'
 parse_object_file('win_hsw.o',   'DB')
+parse_object_file('win_avx.o',   'DB')
 parse_object_file('win_sse41.o', 'DB')
 parse_object_file('win_sse2.o',  'DB')
 print 'END'
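The build script's job is mechanical: compile SkJumper_stages.cpp once per instruction set (twice, with and without -DWIN), disassemble each object, and replay the raw instruction bytes as .byte or DB data, which is where the listings above come from. -mno-red-zone is there so the generated code never leans on the System V red zone, which doesn't exist under the Windows ABI that the win_*.o builds must honor. Roughly how the re-emit step works (a simplified sketch; gobjdump and the exact column layout are assumptions here, and the real parse_object_file also emits the .globl/PUBLIC labels and rewrites bytes into each dialect, 0x.. for .byte and decimal for DB):

    import subprocess

    def parse_object_file(dot_o, directive, target=None):
        cmd = ['gobjdump', '-d', dot_o]
        if target:
            cmd += ['--target', target]
        for line in subprocess.check_output(cmd).split('\n'):
            columns = line.split('\t')   # "  4f:", "c5 fc 58 c1 ", "vaddps ..."
            if len(columns) != 3:
                continue                 # skip labels, headers, blank lines
            _, code, instruction = columns
            print '  %s  %s  // %s' % (directive, ','.join(code.split()), instruction)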