--- /dev/null
+++ b/src/splicer/SkSplicer_generated.h
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef SkSplicer_generated_DEFINED
+#define SkSplicer_generated_DEFINED
+
+// This file is generated semi-automatically with this command:
+// $ src/splicer/build_stages.py > src/splicer/SkSplicer_generated.h
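+//
+// Each kSplice_* array is the machine-code body of the matching Stage in
+// SkSplicer_stages.cpp, disassembled in the comments alongside. The bytes
+// stop just before the Stage's tail jmp to done(), so SkSplicer can lay
+// these bodies down back to back when splicing a pipeline.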
+
+static const unsigned char kSplice_clear[] = {
+ 0xc5,0xfc,0x57,0xc0, // vxorps %ymm0, %ymm0, %ymm0
+ 0xc5,0xf4,0x57,0xc9, // vxorps %ymm1, %ymm1, %ymm1
+ 0xc5,0xec,0x57,0xd2, // vxorps %ymm2, %ymm2, %ymm2
+ 0xc5,0xe4,0x57,0xdb, // vxorps %ymm3, %ymm3, %ymm3
+};
+static const unsigned char kSplice_plus[] = {
+ 0xc5,0xfc,0x58,0xc4, // vaddps %ymm4, %ymm0, %ymm0
+ 0xc5,0xf4,0x58,0xcd, // vaddps %ymm5, %ymm1, %ymm1
+ 0xc5,0xec,0x58,0xd6, // vaddps %ymm6, %ymm2, %ymm2
+ 0xc5,0xe4,0x58,0xdf, // vaddps %ymm7, %ymm3, %ymm3
+};
+static const unsigned char kSplice_srcover[] = {
+ 0xc4,0x62,0x7d,0x18,0x41,0x04, // vbroadcastss 0x4(%rcx), %ymm8
+ 0xc5,0x3c,0x5c,0xc3, // vsubps %ymm3, %ymm8, %ymm8
+ 0xc4,0xc2,0x5d,0xb8,0xc0, // vfmadd231ps %ymm8, %ymm4, %ymm0
+ 0xc4,0xc2,0x55,0xb8,0xc8, // vfmadd231ps %ymm8, %ymm5, %ymm1
+ 0xc4,0xc2,0x4d,0xb8,0xd0, // vfmadd231ps %ymm8, %ymm6, %ymm2
+ 0xc4,0xc2,0x45,0xb8,0xd8, // vfmadd231ps %ymm8, %ymm7, %ymm3
+};
+static const unsigned char kSplice_dstover[] = {
+ 0xc4,0x62,0x7d,0x18,0x41,0x04, // vbroadcastss 0x4(%rcx), %ymm8
+ 0xc5,0x3c,0x5c,0xc7, // vsubps %ymm7, %ymm8, %ymm8
+ 0xc4,0xc2,0x7d,0xb8,0xe0, // vfmadd231ps %ymm8, %ymm0, %ymm4
+ 0xc4,0xc2,0x75,0xb8,0xe8, // vfmadd231ps %ymm8, %ymm1, %ymm5
+ 0xc4,0xc2,0x6d,0xb8,0xf0, // vfmadd231ps %ymm8, %ymm2, %ymm6
+ 0xc4,0xc2,0x65,0xb8,0xf8, // vfmadd231ps %ymm8, %ymm3, %ymm7
+};
+static const unsigned char kSplice_clamp_0[] = {
+ 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8, %ymm8, %ymm8
+ 0xc4,0xc1,0x7c,0x5f,0xc0, // vmaxps %ymm8, %ymm0, %ymm0
+ 0xc4,0xc1,0x74,0x5f,0xc8, // vmaxps %ymm8, %ymm1, %ymm1
+ 0xc4,0xc1,0x6c,0x5f,0xd0, // vmaxps %ymm8, %ymm2, %ymm2
+ 0xc4,0xc1,0x64,0x5f,0xd8, // vmaxps %ymm8, %ymm3, %ymm3
+};
+static const unsigned char kSplice_clamp_1[] = {
+ 0xc4,0x62,0x7d,0x18,0x41,0x04, // vbroadcastss 0x4(%rcx), %ymm8
+ 0xc4,0xc1,0x7c,0x5d,0xc0, // vminps %ymm8, %ymm0, %ymm0
+ 0xc4,0xc1,0x74,0x5d,0xc8, // vminps %ymm8, %ymm1, %ymm1
+ 0xc4,0xc1,0x6c,0x5d,0xd0, // vminps %ymm8, %ymm2, %ymm2
+ 0xc4,0xc1,0x64,0x5d,0xd8, // vminps %ymm8, %ymm3, %ymm3
+};
+static const unsigned char kSplice_clamp_a[] = {
+ 0xc4,0x62,0x7d,0x18,0x41,0x04, // vbroadcastss 0x4(%rcx), %ymm8
+ 0xc4,0xc1,0x64,0x5d,0xd8, // vminps %ymm8, %ymm3, %ymm3
+ 0xc5,0xfc,0x5d,0xc3, // vminps %ymm3, %ymm0, %ymm0
+ 0xc5,0xf4,0x5d,0xcb, // vminps %ymm3, %ymm1, %ymm1
+ 0xc5,0xec,0x5d,0xd3, // vminps %ymm3, %ymm2, %ymm2
+};
+static const unsigned char kSplice_swap[] = {
+ 0xc5,0x7c,0x28,0xc3, // vmovaps %ymm3, %ymm8
+ 0xc5,0x7c,0x28,0xca, // vmovaps %ymm2, %ymm9
+ 0xc5,0x7c,0x28,0xd1, // vmovaps %ymm1, %ymm10
+ 0xc5,0x7c,0x28,0xd8, // vmovaps %ymm0, %ymm11
+ 0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4, %ymm0
+ 0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5, %ymm1
+ 0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6, %ymm2
+ 0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7, %ymm3
+ 0xc5,0x7c,0x29,0xdc, // vmovaps %ymm11, %ymm4
+ 0xc5,0x7c,0x29,0xd5, // vmovaps %ymm10, %ymm5
+ 0xc5,0x7c,0x29,0xce, // vmovaps %ymm9, %ymm6
+ 0xc5,0x7c,0x29,0xc7, // vmovaps %ymm8, %ymm7
+};
+static const unsigned char kSplice_move_src_dst[] = {
+ 0xc5,0xfc,0x28,0xe0, // vmovaps %ymm0, %ymm4
+ 0xc5,0xfc,0x28,0xe9, // vmovaps %ymm1, %ymm5
+ 0xc5,0xfc,0x28,0xf2, // vmovaps %ymm2, %ymm6
+ 0xc5,0xfc,0x28,0xfb, // vmovaps %ymm3, %ymm7
+};
+static const unsigned char kSplice_move_dst_src[] = {
+ 0xc5,0xfc,0x28,0xc4, // vmovaps %ymm4, %ymm0
+ 0xc5,0xfc,0x28,0xcd, // vmovaps %ymm5, %ymm1
+ 0xc5,0xfc,0x28,0xd6, // vmovaps %ymm6, %ymm2
+ 0xc5,0xfc,0x28,0xdf, // vmovaps %ymm7, %ymm3
+};
+static const unsigned char kSplice_premul[] = {
+ 0xc5,0xfc,0x59,0xc3, // vmulps %ymm3, %ymm0, %ymm0
+ 0xc5,0xf4,0x59,0xcb, // vmulps %ymm3, %ymm1, %ymm1
+ 0xc5,0xec,0x59,0xd3, // vmulps %ymm3, %ymm2, %ymm2
+};
+static const unsigned char kSplice_unpremul[] = {
+ 0xc4,0x41,0x3c,0x57,0xc0, // vxorps %ymm8, %ymm8, %ymm8
+ 0xc4,0x41,0x64,0xc2,0xc8,0x00, // vcmpeqps %ymm8, %ymm3, %ymm9
+ 0xc4,0x62,0x7d,0x18,0x51,0x04, // vbroadcastss 0x4(%rcx), %ymm10
+ 0xc5,0x2c,0x5e,0xd3, // vdivps %ymm3, %ymm10, %ymm10
+ 0xc4,0x43,0x2d,0x4a,0xc0,0x90, // vblendvps %ymm9, %ymm8, %ymm10, %ymm8
+ 0xc5,0xbc,0x59,0xc0, // vmulps %ymm0, %ymm8, %ymm0
+ 0xc5,0xbc,0x59,0xc9, // vmulps %ymm1, %ymm8, %ymm1
+ 0xc5,0xbc,0x59,0xd2, // vmulps %ymm2, %ymm8, %ymm2
+};
+static const unsigned char kSplice_from_srgb[] = {
+ 0xc4,0x62,0x7d,0x18,0x41,0x1c, // vbroadcastss 0x1c(%rcx), %ymm8
+ 0xc5,0x3c,0x59,0xc8, // vmulps %ymm0, %ymm8, %ymm9
+ 0xc5,0x7c,0x59,0xd0, // vmulps %ymm0, %ymm0, %ymm10
+ 0xc4,0x62,0x7d,0x18,0x59,0x18, // vbroadcastss 0x18(%rcx), %ymm11
+ 0xc4,0x62,0x7d,0x18,0x61,0x14, // vbroadcastss 0x14(%rcx), %ymm12
+ 0xc4,0x41,0x7c,0x28,0xeb, // vmovaps %ymm11, %ymm13
+ 0xc4,0x42,0x7d,0xa8,0xec, // vfmadd213ps %ymm12, %ymm0, %ymm13
+ 0xc4,0x62,0x7d,0x18,0x71,0x10, // vbroadcastss 0x10(%rcx), %ymm14
+ 0xc4,0x42,0x2d,0xa8,0xee, // vfmadd213ps %ymm14, %ymm10, %ymm13
+ 0xc4,0x62,0x7d,0x18,0x51,0x20, // vbroadcastss 0x20(%rcx), %ymm10
+ 0xc4,0xc1,0x7c,0xc2,0xc2,0x01, // vcmpltps %ymm10, %ymm0, %ymm0
+ 0xc4,0xc3,0x15,0x4a,0xc1,0x00, // vblendvps %ymm0, %ymm9, %ymm13, %ymm0
+ 0xc5,0x3c,0x59,0xc9, // vmulps %ymm1, %ymm8, %ymm9
+ 0xc5,0x74,0x59,0xe9, // vmulps %ymm1, %ymm1, %ymm13
+ 0xc4,0x41,0x7c,0x28,0xfb, // vmovaps %ymm11, %ymm15
+ 0xc4,0x42,0x75,0xa8,0xfc, // vfmadd213ps %ymm12, %ymm1, %ymm15
+ 0xc4,0x42,0x15,0xa8,0xfe, // vfmadd213ps %ymm14, %ymm13, %ymm15
+ 0xc4,0xc1,0x74,0xc2,0xca,0x01, // vcmpltps %ymm10, %ymm1, %ymm1
+ 0xc4,0xc3,0x05,0x4a,0xc9,0x10, // vblendvps %ymm1, %ymm9, %ymm15, %ymm1
+ 0xc5,0x3c,0x59,0xc2, // vmulps %ymm2, %ymm8, %ymm8
+ 0xc5,0x6c,0x59,0xca, // vmulps %ymm2, %ymm2, %ymm9
+ 0xc4,0x42,0x6d,0xa8,0xdc, // vfmadd213ps %ymm12, %ymm2, %ymm11
+ 0xc4,0x42,0x35,0xa8,0xde, // vfmadd213ps %ymm14, %ymm9, %ymm11
+ 0xc4,0xc1,0x6c,0xc2,0xd2,0x01, // vcmpltps %ymm10, %ymm2, %ymm2
+ 0xc4,0xc3,0x25,0x4a,0xd0,0x20, // vblendvps %ymm2, %ymm8, %ymm11, %ymm2
+};
+static const unsigned char kSplice_to_srgb[] = {
+ 0xc5,0x7c,0x52,0xc0, // vrsqrtps %ymm0, %ymm8
+ 0xc4,0x41,0x7c,0x53,0xc8, // vrcpps %ymm8, %ymm9
+ 0xc4,0x41,0x7c,0x52,0xd0, // vrsqrtps %ymm8, %ymm10
+ 0xc4,0x62,0x7d,0x18,0x41,0x24, // vbroadcastss 0x24(%rcx), %ymm8
+ 0xc5,0x3c,0x59,0xd8, // vmulps %ymm0, %ymm8, %ymm11
+ 0xc4,0x62,0x7d,0x18,0x61,0x04, // vbroadcastss 0x4(%rcx), %ymm12
+ 0xc4,0x62,0x7d,0x18,0x69,0x28, // vbroadcastss 0x28(%rcx), %ymm13
+ 0xc4,0x62,0x7d,0x18,0x71,0x2c, // vbroadcastss 0x2c(%rcx), %ymm14
+ 0xc4,0x62,0x7d,0x18,0x79,0x30, // vbroadcastss 0x30(%rcx), %ymm15
+ 0xc4,0x42,0x0d,0xa8,0xcf, // vfmadd213ps %ymm15, %ymm14, %ymm9
+ 0xc4,0x42,0x15,0xb8,0xca, // vfmadd231ps %ymm10, %ymm13, %ymm9
+ 0xc4,0x41,0x1c,0x5d,0xc9, // vminps %ymm9, %ymm12, %ymm9
+ 0xc4,0x62,0x7d,0x18,0x51,0x34, // vbroadcastss 0x34(%rcx), %ymm10
+ 0xc4,0xc1,0x7c,0xc2,0xc2,0x01, // vcmpltps %ymm10, %ymm0, %ymm0
+ 0xc4,0xc3,0x35,0x4a,0xc3,0x00, // vblendvps %ymm0, %ymm11, %ymm9, %ymm0
+ 0xc5,0x7c,0x52,0xc9, // vrsqrtps %ymm1, %ymm9
+ 0xc4,0x41,0x7c,0x53,0xd9, // vrcpps %ymm9, %ymm11
+ 0xc4,0x41,0x7c,0x52,0xc9, // vrsqrtps %ymm9, %ymm9
+ 0xc4,0x42,0x0d,0xa8,0xdf, // vfmadd213ps %ymm15, %ymm14, %ymm11
+ 0xc4,0x42,0x15,0xb8,0xd9, // vfmadd231ps %ymm9, %ymm13, %ymm11
+ 0xc5,0x3c,0x59,0xc9, // vmulps %ymm1, %ymm8, %ymm9
+ 0xc4,0x41,0x1c,0x5d,0xdb, // vminps %ymm11, %ymm12, %ymm11
+ 0xc4,0xc1,0x74,0xc2,0xca,0x01, // vcmpltps %ymm10, %ymm1, %ymm1
+ 0xc4,0xc3,0x25,0x4a,0xc9,0x10, // vblendvps %ymm1, %ymm9, %ymm11, %ymm1
+ 0xc5,0x7c,0x52,0xca, // vrsqrtps %ymm2, %ymm9
+ 0xc4,0x41,0x7c,0x53,0xd9, // vrcpps %ymm9, %ymm11
+ 0xc4,0x42,0x0d,0xa8,0xdf, // vfmadd213ps %ymm15, %ymm14, %ymm11
+ 0xc4,0x41,0x7c,0x52,0xc9, // vrsqrtps %ymm9, %ymm9
+ 0xc4,0x42,0x15,0xb8,0xd9, // vfmadd231ps %ymm9, %ymm13, %ymm11
+ 0xc4,0x41,0x1c,0x5d,0xcb, // vminps %ymm11, %ymm12, %ymm9
+ 0xc5,0x3c,0x59,0xc2, // vmulps %ymm2, %ymm8, %ymm8
+ 0xc4,0xc1,0x6c,0xc2,0xd2,0x01, // vcmpltps %ymm10, %ymm2, %ymm2
+ 0xc4,0xc3,0x35,0x4a,0xd0,0x20, // vblendvps %ymm2, %ymm8, %ymm9, %ymm2
+};
+static const unsigned char kSplice_scale_u8[] = {
+ 0x48,0x8b,0x02, // movq (%rdx), %rax
+ 0xc4,0x62,0x7d,0x31,0x04,0x38, // vpmovzxbd (%rax,%rdi), %ymm8
+ 0xc4,0x41,0x7c,0x5b,0xc0, // vcvtdq2ps %ymm8, %ymm8
+ 0xc4,0x62,0x7d,0x18,0x49,0x0c, // vbroadcastss 0xc(%rcx), %ymm9
+ 0xc4,0x41,0x3c,0x59,0xc1, // vmulps %ymm9, %ymm8, %ymm8
+ 0xc5,0xbc,0x59,0xc0, // vmulps %ymm0, %ymm8, %ymm0
+ 0xc5,0xbc,0x59,0xc9, // vmulps %ymm1, %ymm8, %ymm1
+ 0xc5,0xbc,0x59,0xd2, // vmulps %ymm2, %ymm8, %ymm2
+ 0xc5,0xbc,0x59,0xdb, // vmulps %ymm3, %ymm8, %ymm3
+};
+static const unsigned char kSplice_load_8888[] = {
+ 0x48,0x8b,0x02, // movq (%rdx), %rax
+ 0xc5,0xfc,0x10,0x1c,0xb8, // vmovups (%rax,%rdi,4), %ymm3
+ 0xc4,0xe2,0x7d,0x18,0x11, // vbroadcastss (%rcx), %ymm2
+ 0xc5,0xec,0x54,0xc3, // vandps %ymm3, %ymm2, %ymm0
+ 0xc5,0xfc,0x5b,0xc0, // vcvtdq2ps %ymm0, %ymm0
+ 0xc4,0x62,0x7d,0x18,0x41,0x0c, // vbroadcastss 0xc(%rcx), %ymm8
+ 0xc5,0xbc,0x59,0xc0, // vmulps %ymm0, %ymm8, %ymm0
+ 0xc5,0xf5,0x72,0xd3,0x08, // vpsrld $0x8, %ymm3, %ymm1
+ 0xc5,0xec,0x54,0xc9, // vandps %ymm1, %ymm2, %ymm1
+ 0xc5,0xfc,0x5b,0xc9, // vcvtdq2ps %ymm1, %ymm1
+ 0xc5,0xbc,0x59,0xc9, // vmulps %ymm1, %ymm8, %ymm1
+ 0xc5,0xb5,0x72,0xd3,0x10, // vpsrld $0x10, %ymm3, %ymm9
+ 0xc4,0xc1,0x6c,0x54,0xd1, // vandps %ymm9, %ymm2, %ymm2
+ 0xc5,0xfc,0x5b,0xd2, // vcvtdq2ps %ymm2, %ymm2
+ 0xc5,0xbc,0x59,0xd2, // vmulps %ymm2, %ymm8, %ymm2
+ 0xc5,0xe5,0x72,0xd3,0x18, // vpsrld $0x18, %ymm3, %ymm3
+ 0xc5,0xfc,0x5b,0xdb, // vcvtdq2ps %ymm3, %ymm3
+ 0xc4,0xc1,0x64,0x59,0xd8, // vmulps %ymm8, %ymm3, %ymm3
+};
+static const unsigned char kSplice_store_8888[] = {
+ 0x48,0x8b,0x02, // movq (%rdx), %rax
+ 0xc4,0x62,0x7d,0x18,0x41,0x08, // vbroadcastss 0x8(%rcx), %ymm8
+ 0xc5,0x3c,0x59,0xc8, // vmulps %ymm0, %ymm8, %ymm9
+ 0xc4,0x41,0x7d,0x5b,0xc9, // vcvtps2dq %ymm9, %ymm9
+ 0xc5,0x3c,0x59,0xd1, // vmulps %ymm1, %ymm8, %ymm10
+ 0xc4,0x41,0x7d,0x5b,0xd2, // vcvtps2dq %ymm10, %ymm10
+ 0xc4,0xc1,0x2d,0x72,0xf2,0x08, // vpslld $0x8, %ymm10, %ymm10
+ 0xc4,0x41,0x2d,0xeb,0xc9, // vpor %ymm9, %ymm10, %ymm9
+ 0xc5,0x3c,0x59,0xd2, // vmulps %ymm2, %ymm8, %ymm10
+ 0xc4,0x41,0x7d,0x5b,0xd2, // vcvtps2dq %ymm10, %ymm10
+ 0xc4,0xc1,0x2d,0x72,0xf2,0x10, // vpslld $0x10, %ymm10, %ymm10
+ 0xc5,0x3c,0x59,0xc3, // vmulps %ymm3, %ymm8, %ymm8
+ 0xc4,0x41,0x7d,0x5b,0xc0, // vcvtps2dq %ymm8, %ymm8
+ 0xc4,0xc1,0x3d,0x72,0xf0,0x18, // vpslld $0x18, %ymm8, %ymm8
+ 0xc4,0x41,0x2d,0xeb,0xc0, // vpor %ymm8, %ymm10, %ymm8
+ 0xc4,0x41,0x35,0xeb,0xc0, // vpor %ymm8, %ymm9, %ymm8
+ 0xc5,0x7e,0x7f,0x04,0xb8, // vmovdqu %ymm8, (%rax,%rdi,4)
+};
+static const unsigned char kSplice_load_f16[] = {
+ 0x48,0x8b,0x02, // movq (%rdx), %rax
+ 0xc5,0xfa,0x6f,0x04,0xf8, // vmovdqu (%rax,%rdi,8), %xmm0
+ 0xc5,0xfa,0x6f,0x4c,0xf8,0x10, // vmovdqu 0x10(%rax,%rdi,8), %xmm1
+ 0xc5,0xfa,0x6f,0x54,0xf8,0x20, // vmovdqu 0x20(%rax,%rdi,8), %xmm2
+ 0xc5,0xfa,0x6f,0x5c,0xf8,0x30, // vmovdqu 0x30(%rax,%rdi,8), %xmm3
+ 0xc5,0x79,0x61,0xc1, // vpunpcklwd %xmm1, %xmm0, %xmm8
+ 0xc5,0xf9,0x69,0xc1, // vpunpckhwd %xmm1, %xmm0, %xmm0
+ 0xc5,0xe9,0x61,0xcb, // vpunpcklwd %xmm3, %xmm2, %xmm1
+ 0xc5,0xe9,0x69,0xd3, // vpunpckhwd %xmm3, %xmm2, %xmm2
+ 0xc5,0x39,0x61,0xc8, // vpunpcklwd %xmm0, %xmm8, %xmm9
+ 0xc5,0x39,0x69,0xc0, // vpunpckhwd %xmm0, %xmm8, %xmm8
+ 0xc5,0xf1,0x61,0xda, // vpunpcklwd %xmm2, %xmm1, %xmm3
+ 0xc5,0x71,0x69,0xd2, // vpunpckhwd %xmm2, %xmm1, %xmm10
+ 0xc5,0xb1,0x6c,0xc3, // vpunpcklqdq %xmm3, %xmm9, %xmm0
+ 0xc4,0xe2,0x7d,0x13,0xc0, // vcvtph2ps %xmm0, %ymm0
+ 0xc5,0xb1,0x6d,0xcb, // vpunpckhqdq %xmm3, %xmm9, %xmm1
+ 0xc4,0xe2,0x7d,0x13,0xc9, // vcvtph2ps %xmm1, %ymm1
+ 0xc4,0xc1,0x39,0x6c,0xd2, // vpunpcklqdq %xmm10, %xmm8, %xmm2
+ 0xc4,0xe2,0x7d,0x13,0xd2, // vcvtph2ps %xmm2, %ymm2
+ 0xc4,0xc1,0x39,0x6d,0xda, // vpunpckhqdq %xmm10, %xmm8, %xmm3
+ 0xc4,0xe2,0x7d,0x13,0xdb, // vcvtph2ps %xmm3, %ymm3
+};
+static const unsigned char kSplice_store_f16[] = {
+ 0x48,0x8b,0x02, // movq (%rdx), %rax
+ 0xc4,0xc3,0x7d,0x1d,0xc0,0x04, // vcvtps2ph $0x4, %ymm0, %xmm8
+ 0xc4,0xc3,0x7d,0x1d,0xc9,0x04, // vcvtps2ph $0x4, %ymm1, %xmm9
+ 0xc4,0xc3,0x7d,0x1d,0xd2,0x04, // vcvtps2ph $0x4, %ymm2, %xmm10
+ 0xc4,0xc3,0x7d,0x1d,0xdb,0x04, // vcvtps2ph $0x4, %ymm3, %xmm11
+ 0xc4,0x41,0x39,0x61,0xe1, // vpunpcklwd %xmm9, %xmm8, %xmm12
+ 0xc4,0x41,0x39,0x69,0xc1, // vpunpckhwd %xmm9, %xmm8, %xmm8
+ 0xc4,0x41,0x29,0x61,0xcb, // vpunpcklwd %xmm11, %xmm10, %xmm9
+ 0xc4,0x41,0x29,0x69,0xd3, // vpunpckhwd %xmm11, %xmm10, %xmm10
+ 0xc4,0x41,0x19,0x62,0xd9, // vpunpckldq %xmm9, %xmm12, %xmm11
+ 0xc5,0x7a,0x7f,0x1c,0xf8, // vmovdqu %xmm11, (%rax,%rdi,8)
+ 0xc4,0x41,0x19,0x6a,0xc9, // vpunpckhdq %xmm9, %xmm12, %xmm9
+ 0xc5,0x7a,0x7f,0x4c,0xf8,0x10, // vmovdqu %xmm9, 0x10(%rax,%rdi,8)
+ 0xc4,0x41,0x39,0x62,0xca, // vpunpckldq %xmm10, %xmm8, %xmm9
+ 0xc5,0x7a,0x7f,0x4c,0xf8,0x20, // vmovdqu %xmm9, 0x20(%rax,%rdi,8)
+ 0xc4,0x41,0x39,0x6a,0xc2, // vpunpckhdq %xmm10, %xmm8, %xmm8
+ 0xc5,0x7a,0x7f,0x44,0xf8,0x30, // vmovdqu %xmm8, 0x30(%rax,%rdi,8)
+};
+#endif//SkSplicer_generated_DEFINED
--- /dev/null
+++ b/src/splicer/SkSplicer_stages.cpp
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkSplicer_shared.h"
+#include <immintrin.h>
+#include <string.h>
+
+#if !defined(__clang__) || !defined(__AVX2__) || !defined(__FMA__) || !defined(__F16C__)
+ #error This file is not like the rest of Skia.
+ #error It must be compiled with clang and with -mavx2 -mfma -mf16c -fomit-frame-pointer.
+#endif
+
+// We have very specific inlining requirements. It helps to just take total control.
+#define AI __attribute__((always_inline)) inline
+
+// We'll be compiling this file to an object file, then extracting parts of it into
+// SkSplicer_generated.h. It's easier to do if the function names are not C++ mangled.
+#define C extern "C"
+
+// Since we know we're using Clang, we can use its vector extensions.
+// These are __m256 and __m256i, but friendlier and strongly-typed.
+using F = float __attribute__((ext_vector_type(8)));
+using I32 = int32_t __attribute__((ext_vector_type(8)));
+using U32 = uint32_t __attribute__((ext_vector_type(8)));
+using U8 = uint8_t __attribute__((ext_vector_type(8)));
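+
+// These types support ordinary C operators lane-wise, and scalar operands
+// splat to all 8 lanes; that's why expressions below like r = g = b = a = 0
+// and max(r, 0) work without any explicit broadcasts.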
+
+// We polyfill a few routines that Clang doesn't build into ext_vector_types.
+AI static F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
+AI static U32 round (F v) { return _mm256_cvtps_epi32(v); }
+AI static U32 expand(U8 v) { return __builtin_convertvector(v, U32); }
+
+AI static F rcp (F v) { return _mm256_rcp_ps (v); }
+AI static F rsqrt(F v) { return _mm256_rsqrt_ps(v); }
+AI static F min (F a, F b) { return _mm256_min_ps (a,b); }
+AI static F max (F a, F b) { return _mm256_max_ps (a,b); }
+AI static F fma (F f, F m, F a) { return _mm256_fmadd_ps(f,m,a); }
+
+AI static F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e,t,c); }
+
+// Stages all fit a common interface that allows SkSplicer to splice them together.
+using K = const SkSplicer_constants;
+using Stage = void(size_t x, size_t n, void* ctx, K* constants, F,F,F,F, F,F,F,F);
+
+// Stage's arguments act as the working set of registers within the final spliced function.
+// Here's a little primer on the ABI:
+// x: rdi x and n work to drive the loop, like for (; x < n; x += 8)
+// n: rsi
+// ctx: rdx Look for movabsq_rdx in SkSplicer.cpp to see how this works.
+// constants: rcx Look for movabsq_rcx in SkSplicer.cpp to see how this works.
+// vectors: ymm0-ymm7
+
+// done() is the key to this entire splicing strategy.
+//
+// It matches the signature of Stage, so all the registers are kept live.
+// Every Stage calls done() and so will end in a single jmp (i.e. tail-call) into done(),
+// which marks the point where we can splice one Stage onto the next.
+//
+// The lovely bit is that we don't have to define done(), just declare it.
+C void done(size_t, size_t, void*, K*, F,F,F,F, F,F,F,F);
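+//
+// So each compiled Stage is one straight run of instructions ending in a
+// single jmp to done(); everything before that jmp is what build_stages.py
+// copies into SkSplicer_generated.h.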
+
+// This should feel familiar to anyone who's read SkRasterPipeline_opts.h.
+// It's just a convenience to make a valid, spliceable Stage, nothing magic.
+#define STAGE(name) \
+ AI static void name##_k(size_t x, size_t n, void* ctx, K* k, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da); \
+ C void name(size_t x, size_t n, void* ctx, K* k, \
+ F r, F g, F b, F a, F dr, F dg, F db, F da) { \
+ name##_k(x,n,ctx,k, r,g,b,a, dr,dg,db,da); \
+ done (x,n,ctx,k, r,g,b,a, dr,dg,db,da); \
+ } \
+ AI static void name##_k(size_t x, size_t n, void* ctx, K* k, \
+ F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
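+
+// So STAGE(clear) { ... } expands to roughly:
+//    AI static void clear_k(size_t x, ..., F& r, ..., F& da);   // declaration
+//    C void clear(size_t x, ..., F r, ..., F da) {              // the spliceable symbol
+//        clear_k(x,n,ctx,k, r,g,b,a, dr,dg,db,da);
+//        done   (x,n,ctx,k, r,g,b,a, dr,dg,db,da);              // tail-call, becomes jmp
+//    }
+//    AI static void clear_k(size_t x, ..., F& r, ..., F& da) { ... }  // the body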
+
+// We can now define Stages!
+
+// Some things to keep in mind while writing Stages:
+// - do not branch; (i.e. avoid jmp)
+// - do not call functions that don't inline; (i.e. avoid call, ret, stack use)
+// - do not use constant literals other than 0 and 0.0f. (i.e. avoid rip relative addressing)
+//
+// Some things that should work fine:
+// - 0 and 0.0f;
+// - arithmetic;
+// - functions of F and U32 that we've defined above;
+// - temporary values;
+// - lambdas;
+// - memcpy() with a compile-time constant size argument.
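+//
+// E.g. a stray 0.5f literal would compile to a rip-relative load from a
+// constant pool, which would break once the code is copied somewhere else;
+// constants like that live in SkSplicer_constants and arrive via rcx.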
+
+STAGE(clear) {
+ r = g = b = a = 0;
+}
+
+STAGE(plus) {
+ r = r + dr;
+ g = g + dg;
+ b = b + db;
+ a = a + da;
+}
+
+STAGE(srcover) {
+ auto A = k->_1 - a;
+ r = fma(dr, A, r);
+ g = fma(dg, A, g);
+ b = fma(db, A, b);
+ a = fma(da, A, a);
+}
+STAGE(dstover) { srcover_k(x,n,ctx,k, dr,dg,db,da, r,g,b,a); }
+
+STAGE(clamp_0) {
+ r = max(r, 0);
+ g = max(g, 0);
+ b = max(b, 0);
+ a = max(a, 0);
+}
+
+STAGE(clamp_1) {
+ r = min(r, k->_1);
+ g = min(g, k->_1);
+ b = min(b, k->_1);
+ a = min(a, k->_1);
+}
+
+STAGE(clamp_a) {
+ a = min(a, k->_1);
+ r = min(r, a);
+ g = min(g, a);
+ b = min(b, a);
+}
+
+STAGE(swap) {
+ auto swap = [](F& v, F& dv) {
+ auto tmp = v;
+ v = dv;
+ dv = tmp;
+ };
+ swap(r, dr);
+ swap(g, dg);
+ swap(b, db);
+ swap(a, da);
+}
+STAGE(move_src_dst) {
+ dr = r;
+ dg = g;
+ db = b;
+ da = a;
+}
+STAGE(move_dst_src) {
+ r = dr;
+ g = dg;
+ b = db;
+ a = da;
+}
+
+STAGE(premul) {
+ r = r * a;
+ g = g * a;
+ b = b * a;
+}
+STAGE(unpremul) {
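+ // When a == 0, 1/a would be inf; select 0 instead so transparent stays transparent.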
+ auto scale = if_then_else(a == 0, 0, k->_1 / a);
+ r = r * scale;
+ g = g * scale;
+ b = b * scale;
+}
+
+STAGE(from_srgb) {
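+ // sRGB -> linear: a linear piece below the cutoff, and a polynomial
+ // approximation of the 2.4-power piece above it.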
+ auto fn = [&](F s) {
+ auto lo = s * k->_1_1292;
+ auto hi = fma(s*s, fma(s, k->_03000, k->_06975), k->_00025);
+ return if_then_else(s < k->_0055, lo, hi);
+ };
+ r = fn(r);
+ g = fn(g);
+ b = fn(b);
+}
+STAGE(to_srgb) {
+ auto fn = [&](F l) {
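+ // rcp(rsqrt(l)) approximates sqrt(l), and rsqrt(rsqrt(l)) the fourth
+ // root; together they're cheap building blocks for the 1/2.4 power fit.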
+ F sqrt = rcp (rsqrt(l)),
+ ftrt = rsqrt(rsqrt(l));
+ auto lo = l * k->_1246;
+ auto hi = min(k->_1, fma(k->_0411192, ftrt,
+ fma(k->_0689206, sqrt,
+ k->n_00988)));
+ return if_then_else(l < k->_00043, lo, hi);
+ };
+ r = fn(r);
+ g = fn(g);
+ b = fn(b);
+}
+
+STAGE(scale_u8) {
+ auto ptr = *(const uint8_t**)ctx + x;
+
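+ // memcpy with a compile-time constant size inlines to a plain 8-byte load;
+ // with expand() it becomes the single vpmovzxbd seen in kSplice_scale_u8.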
+ U8 scales;
+ memcpy(&scales, ptr, sizeof(scales));
+ auto c = cast(expand(scales)) * k->_1_255;
+
+ r = r * c;
+ g = g * c;
+ b = b * c;
+ a = a * c;
+}
+
+STAGE(load_8888) {
+ auto ptr = *(const uint32_t**)ctx + x;
+
+ U32 px;
+ memcpy(&px, ptr, sizeof(px));
+
+ r = cast((px ) & k->_0x000000ff) * k->_1_255;
+ g = cast((px >> 8) & k->_0x000000ff) * k->_1_255;
+ b = cast((px >> 16) & k->_0x000000ff) * k->_1_255;
+ a = cast((px >> 24) ) * k->_1_255;
+}
+
+STAGE(store_8888) {
+ auto ptr = *(uint32_t**)ctx + x;
+
+ U32 px = round(r * k->_255)
+ | round(g * k->_255) << 8
+ | round(b * k->_255) << 16
+ | round(a * k->_255) << 24;
+ memcpy(ptr, &px, sizeof(px));
+}
+
+STAGE(load_f16) {
+ auto ptr = *(const uint64_t**)ctx + x;
+
+ auto _01 = _mm_loadu_si128(((__m128i*)ptr) + 0),
+ _23 = _mm_loadu_si128(((__m128i*)ptr) + 1),
+ _45 = _mm_loadu_si128(((__m128i*)ptr) + 2),
+ _67 = _mm_loadu_si128(((__m128i*)ptr) + 3);
+
+ auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
+ _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
+ _46 = _mm_unpacklo_epi16(_45, _67), // r4 r6 g4 g6 b4 b6 a4 a6
+ _57 = _mm_unpackhi_epi16(_45, _67); // r5 r7 g5 g7 b5 b7 a5 a7
+
+ auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
+ ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
+ rg4567 = _mm_unpacklo_epi16(_46, _57), // r4 r5 r6 r7 g4 g5 g6 g7
+ ba4567 = _mm_unpackhi_epi16(_46, _57); // b4 b5 b6 b7 a4 a5 a6 a7
+
+ r = _mm256_cvtph_ps(_mm_unpacklo_epi64(rg0123, rg4567));
+ g = _mm256_cvtph_ps(_mm_unpackhi_epi64(rg0123, rg4567));
+ b = _mm256_cvtph_ps(_mm_unpacklo_epi64(ba0123, ba4567));
+ a = _mm256_cvtph_ps(_mm_unpackhi_epi64(ba0123, ba4567));
+}
+
+STAGE(store_f16) {
+ auto ptr = *(uint64_t**)ctx + x;
+
+ auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
+ G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
+ B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
+ A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
+
+ auto rg0123 = _mm_unpacklo_epi16(R, G), // r0 g0 r1 g1 r2 g2 r3 g3
+ rg4567 = _mm_unpackhi_epi16(R, G), // r4 g4 r5 g5 r6 g6 r7 g7
+ ba0123 = _mm_unpacklo_epi16(B, A), // b0 a0 b1 a1 b2 a2 b3 a3
+ ba4567 = _mm_unpackhi_epi16(B, A); // b4 a4 b5 a5 b6 a6 b7 a7
+
+ _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg0123, ba0123));
+ _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg0123, ba0123));
+ _mm_storeu_si128((__m128i*)ptr + 2, _mm_unpacklo_epi32(rg4567, ba4567));
+ _mm_storeu_si128((__m128i*)ptr + 3, _mm_unpackhi_epi32(rg4567, ba4567));
+}