SkSplicer: implement load_tables and matrix_3x4
authorMike Klein <mtklein@chromium.org>
Fri, 13 Jan 2017 18:18:44 +0000 (13:18 -0500)
committerSkia Commit-Bot <skia-commit-bot@chromium.org>
Fri, 13 Jan 2017 20:30:25 +0000 (20:30 +0000)
These are enough to splice interesting SkColorSpaceXform pipelines.

SkSplicer_stages.cpp is similar to but still intentionally distinct from
SkRasterPipeline_opts.  I hope to unify them next week.

unaligned_load() is nothing tricky... just a little refactor.

Change-Id: I05d0fc38dac985aa351d88776ecc14d2457f2124
Reviewed-on: https://skia-review.googlesource.com/7022
Reviewed-by: Matt Sarett <msarett@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

src/splicer/SkSplicer.cpp
src/splicer/SkSplicer_generated.h
src/splicer/SkSplicer_stages.cpp

index fcff0af..0295c4f 100644 (file)
@@ -293,10 +293,12 @@ namespace {
                     case SkRasterPipeline::from_srgb:    splice(&buf, kSplice_from_srgb   ); break;
                     case SkRasterPipeline::to_srgb:      splice(&buf, kSplice_to_srgb     ); break;
                     case SkRasterPipeline::scale_u8:     splice(&buf, kSplice_scale_u8    ); break;
+                    case SkRasterPipeline::load_tables:  splice(&buf, kSplice_load_tables ); break;
                     case SkRasterPipeline::load_8888:    splice(&buf, kSplice_load_8888   ); break;
                     case SkRasterPipeline::store_8888:   splice(&buf, kSplice_store_8888  ); break;
                     case SkRasterPipeline::load_f16:     splice(&buf, kSplice_load_f16    ); break;
                     case SkRasterPipeline::store_f16:    splice(&buf, kSplice_store_f16   ); break;
+                    case SkRasterPipeline::matrix_3x4:   splice(&buf, kSplice_matrix_3x4  ); break;
 
                     // No joy (probably just not yet implemented).
                     default:
index df4bd03..228f8ff 100644 (file)
@@ -222,6 +222,58 @@ static const unsigned int kSplice_scale_u8[] = {
     0x6e22de02,                                 //  fmul          v2.4s, v16.4s, v2.4s
     0x6e23de03,                                 //  fmul          v3.4s, v16.4s, v3.4s
 };
+static const unsigned int kSplice_load_tables[] = {
+    0xa9402848,                                 //  ldp           x8, x10, [x2]
+    0xd37ef409,                                 //  lsl           x9, x0, #2
+    0x4d40c860,                                 //  ld1r          {v0.4s}, [x3]
+    0x3ce96903,                                 //  ldr           q3, [x8,x9]
+    0xa9412448,                                 //  ldp           x8, x9, [x2,#16]
+    0x4e231c01,                                 //  and           v1.16b, v0.16b, v3.16b
+    0x1e26002e,                                 //  fmov          w14, s1
+    0x6f380462,                                 //  ushr          v2.4s, v3.4s, #8
+    0x6f300470,                                 //  ushr          v16.4s, v3.4s, #16
+    0x8b2e494e,                                 //  add           x14, x10, w14, uxtw #2
+    0x0e0c3c2b,                                 //  mov           w11, v1.s[1]
+    0x0e143c2c,                                 //  mov           w12, v1.s[2]
+    0x0e1c3c2d,                                 //  mov           w13, v1.s[3]
+    0x4e221c01,                                 //  and           v1.16b, v0.16b, v2.16b
+    0x4e301c02,                                 //  and           v2.16b, v0.16b, v16.16b
+    0x0d4081c0,                                 //  ld1           {v0.s}[0], [x14]
+    0x1e26002e,                                 //  fmov          w14, s1
+    0x8b2e490e,                                 //  add           x14, x8, w14, uxtw #2
+    0x8b2b494b,                                 //  add           x11, x10, w11, uxtw #2
+    0xbc6c5950,                                 //  ldr           s16, [x10,w12,uxtw #2]
+    0xbc6d5951,                                 //  ldr           s17, [x10,w13,uxtw #2]
+    0x0e0c3c2a,                                 //  mov           w10, v1.s[1]
+    0x0e143c2c,                                 //  mov           w12, v1.s[2]
+    0x0e1c3c2d,                                 //  mov           w13, v1.s[3]
+    0x0d4081c1,                                 //  ld1           {v1.s}[0], [x14]
+    0x0d409160,                                 //  ld1           {v0.s}[1], [x11]
+    0xbc6c5912,                                 //  ldr           s18, [x8,w12,uxtw #2]
+    0x0e143c4c,                                 //  mov           w12, v2.s[2]
+    0x1e26004e,                                 //  fmov          w14, s2
+    0xbc6c5933,                                 //  ldr           s19, [x9,w12,uxtw #2]
+    0x8b2e492c,                                 //  add           x12, x9, w14, uxtw #2
+    0x8b2a490a,                                 //  add           x10, x8, w10, uxtw #2
+    0x0e0c3c4f,                                 //  mov           w15, v2.s[1]
+    0x0e1c3c4b,                                 //  mov           w11, v2.s[3]
+    0x0d408182,                                 //  ld1           {v2.s}[0], [x12]
+    0x0d409141,                                 //  ld1           {v1.s}[1], [x10]
+    0x6e140600,                                 //  mov           v0.s[2], v16.s[0]
+    0xbc6d5910,                                 //  ldr           s16, [x8,w13,uxtw #2]
+    0x8b2f492a,                                 //  add           x10, x9, w15, uxtw #2
+    0x0d409142,                                 //  ld1           {v2.s}[1], [x10]
+    0x6e140641,                                 //  mov           v1.s[2], v18.s[0]
+    0x6e1c0620,                                 //  mov           v0.s[3], v17.s[0]
+    0xbc6b5931,                                 //  ldr           s17, [x9,w11,uxtw #2]
+    0x6e1c0601,                                 //  mov           v1.s[3], v16.s[0]
+    0xbd400c70,                                 //  ldr           s16, [x3,#12]
+    0x6f280463,                                 //  ushr          v3.4s, v3.4s, #24
+    0x6e140662,                                 //  mov           v2.s[2], v19.s[0]
+    0x4e21d863,                                 //  scvtf         v3.4s, v3.4s
+    0x6e1c0622,                                 //  mov           v2.s[3], v17.s[0]
+    0x4f909063,                                 //  fmul          v3.4s, v3.4s, v16.s[0]
+};
 static const unsigned int kSplice_load_8888[] = {
     0xf9400048,                                 //  ldr           x8, [x2]
     0xd37ef409,                                 //  lsl           x9, x0, #2
@@ -281,6 +333,33 @@ static const unsigned int kSplice_store_f16[] = {
     0x0e216873,                                 //  fcvtn         v19.4h, v3.4s
     0x0c000510,                                 //  st4           {v16.4h-v19.4h}, [x8]
 };
+static const unsigned int kSplice_matrix_3x4[] = {
+    0xaa0203e8,                                 //  mov           x8, x2
+    0x91009049,                                 //  add           x9, x2, #0x24
+    0x4ddfc913,                                 //  ld1r          {v19.4s}, [x8], #4
+    0x4d40c930,                                 //  ld1r          {v16.4s}, [x9]
+    0x9100a049,                                 //  add           x9, x2, #0x28
+    0x4d40c931,                                 //  ld1r          {v17.4s}, [x9]
+    0x2d435454,                                 //  ldp           s20, s21, [x2,#24]
+    0x9100b049,                                 //  add           x9, x2, #0x2c
+    0xbd402056,                                 //  ldr           s22, [x2,#32]
+    0x4d40c932,                                 //  ld1r          {v18.4s}, [x9]
+    0x4f941050,                                 //  fmla          v16.4s, v2.4s, v20.s[0]
+    0x4f951051,                                 //  fmla          v17.4s, v2.4s, v21.s[0]
+    0x2d415454,                                 //  ldp           s20, s21, [x2,#8]
+    0x4f961052,                                 //  fmla          v18.4s, v2.4s, v22.s[0]
+    0x2d425842,                                 //  ldp           s2, s22, [x2,#16]
+    0x4f951030,                                 //  fmla          v16.4s, v1.4s, v21.s[0]
+    0xbd400115,                                 //  ldr           s21, [x8]
+    0x4f821031,                                 //  fmla          v17.4s, v1.4s, v2.s[0]
+    0x4f961032,                                 //  fmla          v18.4s, v1.4s, v22.s[0]
+    0x4e20ce70,                                 //  fmla          v16.4s, v19.4s, v0.4s
+    0x4f951011,                                 //  fmla          v17.4s, v0.4s, v21.s[0]
+    0x4f941012,                                 //  fmla          v18.4s, v0.4s, v20.s[0]
+    0x4eb01e00,                                 //  mov           v0.16b, v16.16b
+    0x4eb11e21,                                 //  mov           v1.16b, v17.16b
+    0x4eb21e42,                                 //  mov           v2.16b, v18.16b
+};
 
 #elif defined(__ARM_NEON__)
 
@@ -505,6 +584,44 @@ static const unsigned int kSplice_scale_u8[] = {
     0xe28dd008,                                 //  add           sp, sp, #8
     0xecbd8b02,                                 //  vpop          {d8}
 };
+static const unsigned int kSplice_load_tables[] = {
+    0xe92d41f0,                                 //  push          {r4, r5, r6, r7, r8, lr}
+    0xe592c000,                                 //  ldr           ip, [r2]
+    0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
+    0xe08c5100,                                 //  add           r5, ip, r0, lsl #2
+    0xe592e004,                                 //  ldr           lr, [r2, #4]
+    0xedd51b00,                                 //  vldr          d17, [r5]
+    0xf24021b1,                                 //  vand          d18, d16, d17
+    0xe592800c,                                 //  ldr           r8, [r2, #12]
+    0xf3f83031,                                 //  vshr.u32      d19, d17, #8
+    0xe5924008,                                 //  ldr           r4, [r2, #8]
+    0xed931a03,                                 //  vldr          s2, [r3, #12]
+    0xee325b90,                                 //  vmov.32       r5, d18[1]
+    0xee126b90,                                 //  vmov.32       r6, d18[0]
+    0xf3f02031,                                 //  vshr.u32      d18, d17, #16
+    0xf24021b2,                                 //  vand          d18, d16, d18
+    0xf24001b3,                                 //  vand          d16, d16, d19
+    0xee127b90,                                 //  vmov.32       r7, d18[0]
+    0xe08e5105,                                 //  add           r5, lr, r5, lsl #2
+    0xe08e6106,                                 //  add           r6, lr, r6, lsl #2
+    0xedd50a00,                                 //  vldr          s1, [r5]
+    0xee325b90,                                 //  vmov.32       r5, d18[1]
+    0xed960a00,                                 //  vldr          s0, [r6]
+    0xee306b90,                                 //  vmov.32       r6, d16[1]
+    0xe0887107,                                 //  add           r7, r8, r7, lsl #2
+    0xe088c105,                                 //  add           ip, r8, r5, lsl #2
+    0xee105b90,                                 //  vmov.32       r5, d16[0]
+    0xf3e80031,                                 //  vshr.u32      d16, d17, #24
+    0xe0846106,                                 //  add           r6, r4, r6, lsl #2
+    0xeddc2a00,                                 //  vldr          s5, [ip]
+    0xf3fb0620,                                 //  vcvt.f32.s32  d16, d16
+    0xed972a00,                                 //  vldr          s4, [r7]
+    0xf2a039c1,                                 //  vmul.f32      d3, d16, d1[0]
+    0xedd61a00,                                 //  vldr          s3, [r6]
+    0xe0846105,                                 //  add           r6, r4, r5, lsl #2
+    0xed961a00,                                 //  vldr          s2, [r6]
+    0xe8bd41f0,                                 //  pop           {r4, r5, r6, r7, r8, lr}
+};
 static const unsigned int kSplice_load_8888[] = {
     0xe592c000,                                 //  ldr           ip, [r2]
     0xf4e30c9f,                                 //  vld1.32       {d16[]}, [r3 :32]
@@ -581,6 +698,43 @@ static const unsigned int kSplice_store_f16[] = {
     0xe08cc180,                                 //  add           ip, ip, r0, lsl #3
     0xf44c084f,                                 //  vst2.16       {d16-d17}, [ip]
 };
+static const unsigned int kSplice_matrix_3x4[] = {
+    0xe282c020,                                 //  add           ip, r2, #32
+    0xf4ec3c9f,                                 //  vld1.32       {d19[]}, [ip :32]
+    0xe282c02c,                                 //  add           ip, r2, #44
+    0xf4ec0c9f,                                 //  vld1.32       {d16[]}, [ip :32]
+    0xe282c01c,                                 //  add           ip, r2, #28
+    0xf2420c33,                                 //  vfma.f32      d16, d2, d19
+    0xf4ec4c9f,                                 //  vld1.32       {d20[]}, [ip :32]
+    0xe282c018,                                 //  add           ip, r2, #24
+    0xf4ec2c9f,                                 //  vld1.32       {d18[]}, [ip :32]
+    0xe282c024,                                 //  add           ip, r2, #36
+    0xf4ec1c9f,                                 //  vld1.32       {d17[]}, [ip :32]
+    0xe282c028,                                 //  add           ip, r2, #40
+    0xf2421c32,                                 //  vfma.f32      d17, d2, d18
+    0xf4ec2c9f,                                 //  vld1.32       {d18[]}, [ip :32]
+    0xe282c010,                                 //  add           ip, r2, #16
+    0xf2422c34,                                 //  vfma.f32      d18, d2, d20
+    0xf4ec3c9f,                                 //  vld1.32       {d19[]}, [ip :32]
+    0xe282c00c,                                 //  add           ip, r2, #12
+    0xf4ec4c9f,                                 //  vld1.32       {d20[]}, [ip :32]
+    0xe282c014,                                 //  add           ip, r2, #20
+    0xf2411c34,                                 //  vfma.f32      d17, d1, d20
+    0xf4ec4c9f,                                 //  vld1.32       {d20[]}, [ip :32]
+    0xf2410c34,                                 //  vfma.f32      d16, d1, d20
+    0xe282c004,                                 //  add           ip, r2, #4
+    0xf2412c33,                                 //  vfma.f32      d18, d1, d19
+    0xf4e23c9f,                                 //  vld1.32       {d19[]}, [r2 :32]
+    0xf4ec4c9f,                                 //  vld1.32       {d20[]}, [ip :32]
+    0xe282c008,                                 //  add           ip, r2, #8
+    0xf2401c33,                                 //  vfma.f32      d17, d0, d19
+    0xf4ec3c9f,                                 //  vld1.32       {d19[]}, [ip :32]
+    0xf2400c33,                                 //  vfma.f32      d16, d0, d19
+    0xf2402c34,                                 //  vfma.f32      d18, d0, d20
+    0xf22101b1,                                 //  vorr          d0, d17, d17
+    0xf22021b0,                                 //  vorr          d2, d16, d16
+    0xf22211b2,                                 //  vorr          d1, d18, d18
+};
 
 #else
 
@@ -747,6 +901,30 @@ static const unsigned char kSplice_scale_u8[] = {
     0xc5,0xbc,0x59,0xd2,                        //  vmulps        %ymm2,%ymm8,%ymm2
     0xc5,0xbc,0x59,0xdb,                        //  vmulps        %ymm3,%ymm8,%ymm3
 };
+static const unsigned char kSplice_load_tables[] = {
+    0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
+    0x4c,0x8b,0x42,0x08,                        //  mov           0x8(%rdx),%r8
+    0xc5,0xfc,0x10,0x1c,0xb8,                   //  vmovups       (%rax,%rdi,4),%ymm3
+    0xc4,0xe2,0x7d,0x18,0x11,                   //  vbroadcastss  (%rcx),%ymm2
+    0xc5,0xec,0x54,0xcb,                        //  vandps        %ymm3,%ymm2,%ymm1
+    0xc5,0xfc,0x57,0xc0,                        //  vxorps        %ymm0,%ymm0,%ymm0
+    0xc5,0x7c,0xc2,0xc0,0x00,                   //  vcmpeqps      %ymm0,%ymm0,%ymm8
+    0xc4,0x41,0x7c,0x28,0xc8,                   //  vmovaps       %ymm8,%ymm9
+    0xc4,0xc2,0x35,0x92,0x04,0x88,              //  vgatherdps    %ymm9,(%r8,%ymm1,4),%ymm0
+    0x48,0x8b,0x42,0x10,                        //  mov           0x10(%rdx),%rax
+    0xc5,0xf5,0x72,0xd3,0x08,                   //  vpsrld        $0x8,%ymm3,%ymm1
+    0xc5,0x6c,0x54,0xc9,                        //  vandps        %ymm1,%ymm2,%ymm9
+    0xc4,0x41,0x7c,0x28,0xd0,                   //  vmovaps       %ymm8,%ymm10
+    0xc4,0xa2,0x2d,0x92,0x0c,0x88,              //  vgatherdps    %ymm10,(%rax,%ymm9,4),%ymm1
+    0x48,0x8b,0x42,0x18,                        //  mov           0x18(%rdx),%rax
+    0xc5,0xb5,0x72,0xd3,0x10,                   //  vpsrld        $0x10,%ymm3,%ymm9
+    0xc4,0x41,0x6c,0x54,0xc9,                   //  vandps        %ymm9,%ymm2,%ymm9
+    0xc4,0xa2,0x3d,0x92,0x14,0x88,              //  vgatherdps    %ymm8,(%rax,%ymm9,4),%ymm2
+    0xc5,0xe5,0x72,0xd3,0x18,                   //  vpsrld        $0x18,%ymm3,%ymm3
+    0xc5,0xfc,0x5b,0xdb,                        //  vcvtdq2ps     %ymm3,%ymm3
+    0xc4,0x62,0x7d,0x18,0x41,0x0c,              //  vbroadcastss  0xc(%rcx),%ymm8
+    0xc4,0xc1,0x64,0x59,0xd8,                   //  vmulps        %ymm8,%ymm3,%ymm3
+};
 static const unsigned char kSplice_load_8888[] = {
     0x48,0x8b,0x02,                             //  mov           (%rdx),%rax
     0xc5,0xfc,0x10,0x1c,0xb8,                   //  vmovups       (%rax,%rdi,4),%ymm3
@@ -828,6 +1006,32 @@ static const unsigned char kSplice_store_f16[] = {
     0xc4,0x41,0x39,0x6a,0xc2,                   //  vpunpckhdq    %xmm10,%xmm8,%xmm8
     0xc5,0x7a,0x7f,0x44,0xf8,0x30,              //  vmovdqu       %xmm8,0x30(%rax,%rdi,8)
 };
+static const unsigned char kSplice_matrix_3x4[] = {
+    0xc4,0x62,0x7d,0x18,0x0a,                   //  vbroadcastss  (%rdx),%ymm9
+    0xc4,0x62,0x7d,0x18,0x52,0x0c,              //  vbroadcastss  0xc(%rdx),%ymm10
+    0xc4,0x62,0x7d,0x18,0x5a,0x18,              //  vbroadcastss  0x18(%rdx),%ymm11
+    0xc4,0x62,0x7d,0x18,0x42,0x24,              //  vbroadcastss  0x24(%rdx),%ymm8
+    0xc4,0x42,0x6d,0xb8,0xc3,                   //  vfmadd231ps   %ymm11,%ymm2,%ymm8
+    0xc4,0x42,0x75,0xb8,0xc2,                   //  vfmadd231ps   %ymm10,%ymm1,%ymm8
+    0xc4,0x42,0x7d,0xb8,0xc1,                   //  vfmadd231ps   %ymm9,%ymm0,%ymm8
+    0xc4,0x62,0x7d,0x18,0x52,0x04,              //  vbroadcastss  0x4(%rdx),%ymm10
+    0xc4,0x62,0x7d,0x18,0x5a,0x10,              //  vbroadcastss  0x10(%rdx),%ymm11
+    0xc4,0x62,0x7d,0x18,0x62,0x1c,              //  vbroadcastss  0x1c(%rdx),%ymm12
+    0xc4,0x62,0x7d,0x18,0x4a,0x28,              //  vbroadcastss  0x28(%rdx),%ymm9
+    0xc4,0x42,0x6d,0xb8,0xcc,                   //  vfmadd231ps   %ymm12,%ymm2,%ymm9
+    0xc4,0x42,0x75,0xb8,0xcb,                   //  vfmadd231ps   %ymm11,%ymm1,%ymm9
+    0xc4,0x42,0x7d,0xb8,0xca,                   //  vfmadd231ps   %ymm10,%ymm0,%ymm9
+    0xc4,0x62,0x7d,0x18,0x5a,0x08,              //  vbroadcastss  0x8(%rdx),%ymm11
+    0xc4,0x62,0x7d,0x18,0x62,0x14,              //  vbroadcastss  0x14(%rdx),%ymm12
+    0xc4,0x62,0x7d,0x18,0x6a,0x20,              //  vbroadcastss  0x20(%rdx),%ymm13
+    0xc4,0x62,0x7d,0x18,0x52,0x2c,              //  vbroadcastss  0x2c(%rdx),%ymm10
+    0xc4,0x42,0x6d,0xb8,0xd5,                   //  vfmadd231ps   %ymm13,%ymm2,%ymm10
+    0xc4,0x42,0x75,0xb8,0xd4,                   //  vfmadd231ps   %ymm12,%ymm1,%ymm10
+    0xc4,0x42,0x7d,0xb8,0xd3,                   //  vfmadd231ps   %ymm11,%ymm0,%ymm10
+    0xc5,0x7c,0x29,0xc0,                        //  vmovaps       %ymm8,%ymm0
+    0xc5,0x7c,0x29,0xc9,                        //  vmovaps       %ymm9,%ymm1
+    0xc5,0x7c,0x29,0xd2,                        //  vmovaps       %ymm10,%ymm2
+};
 
 #endif
 
index c45f204..2d83996 100644 (file)
@@ -33,6 +33,7 @@
     AI static F   if_then_else(I32 c, F t, F e)        { return vbslq_f32((U32)c,t,e);   }
     AI static U32 round(F v, F scale)                  { return vcvtnq_u32_f32(v*scale); }
 
+    AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]}; }
 #elif defined(__ARM_NEON__)
     #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
         #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
@@ -53,6 +54,7 @@
     AI static F   if_then_else(I32 c, F t, F e)        { return vbsl_f32((U32)c,t,e);   }
     AI static U32 round(F v, F scale)                  { return vcvt_u32_f32(fma(v,scale,0.5f)); }
 
+    AI static F gather(const float* p, U32 ix) { return {p[ix[0]], p[ix[1]]}; }
 #else
     #if !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
         #error On x86, compile with -mavx2 -mfma -mf16c.
     AI static F   rsqrt(F v)                    { return _mm256_rsqrt_ps   (v); }
     AI static F   if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
     AI static U32 round(F v, F scale)           { return _mm256_cvtps_epi32(v*scale); }
+
+    AI static F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, ix, 4); }
 #endif
 
 AI static F   cast  (U32 v) { return __builtin_convertvector((I32)v, F);   }
 AI static U32 expand(U8  v) { return __builtin_convertvector(     v, U32); }
 
+template <typename T, typename P>
+AI static T unaligned_load(const P* p) {
+    T v;
+    memcpy(&v, p, sizeof(v));
+    return v;
+}
+
 // We'll be compiling this file to an object file, then extracting parts of it into
 // SkSplicer_generated.h.  It's easier to do if the function names are not C++ mangled.
 // On ARMv7, use aapcs-vfp calling convention to pass as much data in registers as possible.
@@ -241,8 +252,7 @@ STAGE(to_srgb) {
 STAGE(scale_u8) {
     auto ptr = *(const uint8_t**)ctx + x;
 
-    U8 scales;
-    memcpy(&scales, ptr, sizeof(scales));
+    auto scales = unaligned_load<U8>(ptr);
     auto c = cast(expand(scales)) * k->_1_255;
 
     r = r * c;
@@ -251,12 +261,24 @@ STAGE(scale_u8) {
     a = a * c;
 }
 
+STAGE(load_tables) {
+    struct Ctx {
+        const uint32_t* src;
+        const float *r, *g, *b;
+    };
+    auto c = (const Ctx*)ctx;
+
+    auto px = unaligned_load<U32>(c->src + x);
+    r = gather(c->r, (px      ) & k->_0x000000ff);
+    g = gather(c->g, (px >>  8) & k->_0x000000ff);
+    b = gather(c->b, (px >> 16) & k->_0x000000ff);
+    a = cast(        (px >> 24)) * k->_1_255;
+}
+
 STAGE(load_8888) {
     auto ptr = *(const uint32_t**)ctx + x;
 
-    U32 px;
-    memcpy(&px, ptr, sizeof(px));
-
+    auto px = unaligned_load<U32>(ptr);
     r = cast((px      ) & k->_0x000000ff) * k->_1_255;
     g = cast((px >>  8) & k->_0x000000ff) * k->_1_255;
     b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
@@ -347,3 +369,14 @@ STAGE(store_f16) {
     _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
 #endif
 }
+
+STAGE(matrix_3x4) {
+    auto m = (const float*)ctx;
+
+    auto R = fma(r,m[0], fma(g,m[3], fma(b,m[6], m[ 9]))),
+         G = fma(r,m[1], fma(g,m[4], fma(b,m[7], m[10]))),
+         B = fma(r,m[2], fma(g,m[5], fma(b,m[8], m[11])));
+    r = R;
+    g = G;
+    b = B;
+}