From: Mike Klein Date: Fri, 24 Feb 2017 16:51:36 +0000 (-0500) Subject: SkJumper: perspective matrix X-Git-Tag: accepted/tizen/5.0/unified/20181102.025319~55^2~84 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=11d2df0bdd58d08ab57bc10eea56bc333664c892;p=platform%2Fupstream%2FlibSkiaSharp.git SkJumper: perspective matrix Change-Id: I2c63e0996e4689950f8f3b82da0fb07941c26044 Reviewed-on: https://skia-review.googlesource.com/8952 Reviewed-by: Mike Klein Commit-Queue: Mike Klein --- diff --git a/src/jumper/SkJumper.cpp b/src/jumper/SkJumper.cpp index 0821666..488caf6 100644 --- a/src/jumper/SkJumper.cpp +++ b/src/jumper/SkJumper.cpp @@ -46,47 +46,48 @@ static K kConstants = { 31.0f, 63.0f, }; -#define STAGES(M) \ - M(seed_shader) \ - M(constant_color) \ - M(clear) \ - M(plus_) \ - M(srcover) \ - M(dstover) \ - M(clamp_0) \ - M(clamp_1) \ - M(clamp_a) \ - M(set_rgb) \ - M(swap_rb) \ - M(swap) \ - M(move_src_dst) \ - M(move_dst_src) \ - M(premul) \ - M(unpremul) \ - M(from_srgb) \ - M(to_srgb) \ - M(scale_1_float) \ - M(scale_u8) \ - M(lerp_1_float) \ - M(lerp_u8) \ - M(lerp_565) \ - M(load_tables) \ - M(load_a8) \ - M(store_a8) \ - M(load_565) \ - M(store_565) \ - M(load_8888) \ - M(store_8888) \ - M(load_f16) \ - M(store_f16) \ - M(matrix_2x3) \ - M(matrix_3x4) \ - M(clamp_x) \ - M(clamp_y) \ - M(repeat_x) \ - M(repeat_y) \ - M(mirror_x) \ - M(mirror_y) \ +#define STAGES(M) \ + M(seed_shader) \ + M(constant_color) \ + M(clear) \ + M(plus_) \ + M(srcover) \ + M(dstover) \ + M(clamp_0) \ + M(clamp_1) \ + M(clamp_a) \ + M(set_rgb) \ + M(swap_rb) \ + M(swap) \ + M(move_src_dst) \ + M(move_dst_src) \ + M(premul) \ + M(unpremul) \ + M(from_srgb) \ + M(to_srgb) \ + M(scale_1_float) \ + M(scale_u8) \ + M(lerp_1_float) \ + M(lerp_u8) \ + M(lerp_565) \ + M(load_tables) \ + M(load_a8) \ + M(store_a8) \ + M(load_565) \ + M(store_565) \ + M(load_8888) \ + M(store_8888) \ + M(load_f16) \ + M(store_f16) \ + M(matrix_2x3) \ + M(matrix_3x4) \ + M(matrix_perspective) \ + 
M(clamp_x) \ + M(clamp_y) \ + M(repeat_x) \ + M(repeat_y) \ + M(mirror_x) \ + M(mirror_y) \ M(linear_gradient_2stops) // We can't express the real types of most stage functions portably, so we use a stand-in. diff --git a/src/jumper/SkJumper_generated.S b/src/jumper/SkJumper_generated.S index 071aeea..7da1489 100644 --- a/src/jumper/SkJumper_generated.S +++ b/src/jumper/SkJumper_generated.S @@ -832,6 +832,33 @@ _sk_matrix_3x4_aarch64: .long 0x4eb21e42 // mov v2.16b, v18.16b .long 0xd61f0060 // br x3 +.globl _sk_matrix_perspective_aarch64 +_sk_matrix_perspective_aarch64: + .long 0xa8c10c28 // ldp x8, x3, [x1],#16 + .long 0xaa0803e9 // mov x9, x8 + .long 0x9100510a // add x10, x8, #0x14 + .long 0x4ddfc930 // ld1r {v16.4s}, [x9], #4 + .long 0x4d40c951 // ld1r {v17.4s}, [x10] + .long 0x9100810a // add x10, x8, #0x20 + .long 0x4d40c952 // ld1r {v18.4s}, [x10] + .long 0x2d41d113 // ldp s19, s20, [x8,#12] + .long 0x2d435915 // ldp s21, s22, [x8,#24] + .long 0x91002108 // add x8, x8, #0x8 + .long 0x4f941031 // fmla v17.4s, v1.4s, v20.s[0] + .long 0x4d40c914 // ld1r {v20.4s}, [x8] + .long 0x4f961032 // fmla v18.4s, v1.4s, v22.s[0] + .long 0xbd400136 // ldr s22, [x9] + .long 0x4f951012 // fmla v18.4s, v0.4s, v21.s[0] + .long 0x4f931011 // fmla v17.4s, v0.4s, v19.s[0] + .long 0x4f961034 // fmla v20.4s, v1.4s, v22.s[0] + .long 0x4ea1da41 // frecpe v1.4s, v18.4s + .long 0x4e21fe52 // frecps v18.4s, v18.4s, v1.4s + .long 0x6e32dc32 // fmul v18.4s, v1.4s, v18.4s + .long 0x4e20ce14 // fmla v20.4s, v16.4s, v0.4s + .long 0x6e32de21 // fmul v1.4s, v17.4s, v18.4s + .long 0x6e32de80 // fmul v0.4s, v20.4s, v18.4s + .long 0xd61f0060 // br x3 + .globl _sk_linear_gradient_2stops_aarch64 _sk_linear_gradient_2stops_aarch64: .long 0xa8c10c28 // ldp x8, x3, [x1],#16 @@ -1791,6 +1818,43 @@ _sk_matrix_3x4_vfp4: .long 0xe8bd4800 // pop {fp, lr} .long 0xe12fff1c // bx ip +.globl _sk_matrix_perspective_vfp4 +_sk_matrix_perspective_vfp4: + .long 0xe92d4800 // push {fp, lr} + .long 0xe591e000 // ldr 
lr, [r1] + .long 0xe591c004 // ldr ip, [r1, #4] + .long 0xe2811008 // add r1, r1, #8 + .long 0xe28e301c // add r3, lr, #28 + .long 0xf4e30c9f // vld1.32 {d16[]}, [r3 :32] + .long 0xe28e3020 // add r3, lr, #32 + .long 0xf4e31c9f // vld1.32 {d17[]}, [r3 :32] + .long 0xe28e3018 // add r3, lr, #24 + .long 0xf2411c30 // vfma.f32 d17, d1, d16 + .long 0xf4e30c9f // vld1.32 {d16[]}, [r3 :32] + .long 0xe28e3010 // add r3, lr, #16 + .long 0xf2401c30 // vfma.f32 d17, d0, d16 + .long 0xf4e30c9f // vld1.32 {d16[]}, [r3 :32] + .long 0xe28e3004 // add r3, lr, #4 + .long 0xf4e32c9f // vld1.32 {d18[]}, [r3 :32] + .long 0xe28e3008 // add r3, lr, #8 + .long 0xf4e34c9f // vld1.32 {d20[]}, [r3 :32] + .long 0xe28e3014 // add r3, lr, #20 + .long 0xf2414c32 // vfma.f32 d20, d1, d18 + .long 0xf4e32c9f // vld1.32 {d18[]}, [r3 :32] + .long 0xe28e300c // add r3, lr, #12 + .long 0xf3fb3521 // vrecpe.f32 d19, d17 + .long 0xf2412c30 // vfma.f32 d18, d1, d16 + .long 0xf4e35c9f // vld1.32 {d21[]}, [r3 :32] + .long 0xf2410fb3 // vrecps.f32 d16, d17, d19 + .long 0xf4ee1c9f // vld1.32 {d17[]}, [lr :32] + .long 0xf2404c31 // vfma.f32 d20, d0, d17 + .long 0xf2402c35 // vfma.f32 d18, d0, d21 + .long 0xf3430db0 // vmul.f32 d16, d19, d16 + .long 0xf3040db0 // vmul.f32 d0, d20, d16 + .long 0xf3021db0 // vmul.f32 d1, d18, d16 + .long 0xe8bd4800 // pop {fp, lr} + .long 0xe12fff1c // bx ip + .globl _sk_linear_gradient_2stops_vfp4 _sk_linear_gradient_2stops_vfp4: .long 0xe5913000 // ldr r3, [r1] @@ -2551,6 +2615,30 @@ _sk_matrix_3x4_hsw: .byte 197,124,41,210 // vmovaps %ymm10,%ymm2 .byte 255,224 // jmpq *%rax +.globl _sk_matrix_perspective_hsw +_sk_matrix_perspective_hsw: + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8 + .byte 196,98,125,24,72,4 // vbroadcastss 0x4(%rax),%ymm9 + .byte 196,98,125,24,80,8 // vbroadcastss 0x8(%rax),%ymm10 + .byte 196,66,117,184,209 // vfmadd231ps %ymm9,%ymm1,%ymm10 + .byte 196,66,125,184,208 // vfmadd231ps %ymm8,%ymm0,%ymm10 + .byte 
196,98,125,24,64,12 // vbroadcastss 0xc(%rax),%ymm8 + .byte 196,98,125,24,72,16 // vbroadcastss 0x10(%rax),%ymm9 + .byte 196,98,125,24,88,20 // vbroadcastss 0x14(%rax),%ymm11 + .byte 196,66,117,184,217 // vfmadd231ps %ymm9,%ymm1,%ymm11 + .byte 196,66,125,184,216 // vfmadd231ps %ymm8,%ymm0,%ymm11 + .byte 196,98,125,24,64,24 // vbroadcastss 0x18(%rax),%ymm8 + .byte 196,98,125,24,72,28 // vbroadcastss 0x1c(%rax),%ymm9 + .byte 196,98,125,24,96,32 // vbroadcastss 0x20(%rax),%ymm12 + .byte 196,66,117,184,225 // vfmadd231ps %ymm9,%ymm1,%ymm12 + .byte 196,66,125,184,224 // vfmadd231ps %ymm8,%ymm0,%ymm12 + .byte 196,193,124,83,204 // vrcpps %ymm12,%ymm1 + .byte 197,172,89,193 // vmulps %ymm1,%ymm10,%ymm0 + .byte 197,164,89,201 // vmulps %ymm1,%ymm11,%ymm1 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 255,224 // jmpq *%rax + .globl _sk_linear_gradient_2stops_hsw _sk_linear_gradient_2stops_hsw: .byte 72,173 // lods %ds:(%rsi),%rax @@ -3551,6 +3639,36 @@ _sk_matrix_3x4_avx: .byte 197,124,41,201 // vmovaps %ymm9,%ymm1 .byte 255,224 // jmpq *%rax +.globl _sk_matrix_perspective_avx +_sk_matrix_perspective_avx: + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 196,98,125,24,0 // vbroadcastss (%rax),%ymm8 + .byte 196,98,125,24,72,4 // vbroadcastss 0x4(%rax),%ymm9 + .byte 196,98,125,24,80,8 // vbroadcastss 0x8(%rax),%ymm10 + .byte 197,52,89,201 // vmulps %ymm1,%ymm9,%ymm9 + .byte 196,65,52,88,202 // vaddps %ymm10,%ymm9,%ymm9 + .byte 197,60,89,192 // vmulps %ymm0,%ymm8,%ymm8 + .byte 196,65,60,88,193 // vaddps %ymm9,%ymm8,%ymm8 + .byte 196,98,125,24,72,12 // vbroadcastss 0xc(%rax),%ymm9 + .byte 196,98,125,24,80,16 // vbroadcastss 0x10(%rax),%ymm10 + .byte 196,98,125,24,88,20 // vbroadcastss 0x14(%rax),%ymm11 + .byte 197,44,89,209 // vmulps %ymm1,%ymm10,%ymm10 + .byte 196,65,44,88,211 // vaddps %ymm11,%ymm10,%ymm10 + .byte 197,52,89,200 // vmulps %ymm0,%ymm9,%ymm9 + .byte 196,65,52,88,202 // vaddps %ymm10,%ymm9,%ymm9 + .byte 196,98,125,24,80,24 // vbroadcastss 0x18(%rax),%ymm10 + .byte 
196,98,125,24,88,28 // vbroadcastss 0x1c(%rax),%ymm11 + .byte 196,98,125,24,96,32 // vbroadcastss 0x20(%rax),%ymm12 + .byte 197,164,89,201 // vmulps %ymm1,%ymm11,%ymm1 + .byte 196,193,116,88,204 // vaddps %ymm12,%ymm1,%ymm1 + .byte 197,172,89,192 // vmulps %ymm0,%ymm10,%ymm0 + .byte 197,252,88,193 // vaddps %ymm1,%ymm0,%ymm0 + .byte 197,252,83,200 // vrcpps %ymm0,%ymm1 + .byte 197,188,89,193 // vmulps %ymm1,%ymm8,%ymm0 + .byte 197,180,89,201 // vmulps %ymm1,%ymm9,%ymm1 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 255,224 // jmpq *%rax + .globl _sk_linear_gradient_2stops_avx _sk_linear_gradient_2stops_avx: .byte 72,173 // lods %ds:(%rsi),%rax @@ -4525,6 +4643,47 @@ _sk_matrix_3x4_sse41: .byte 65,15,40,210 // movaps %xmm10,%xmm2 .byte 255,224 // jmpq *%rax +.globl _sk_matrix_perspective_sse41 +_sk_matrix_perspective_sse41: + .byte 68,15,40,192 // movaps %xmm0,%xmm8 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 243,15,16,0 // movss (%rax),%xmm0 + .byte 243,68,15,16,72,4 // movss 0x4(%rax),%xmm9 + .byte 15,198,192,0 // shufps $0x0,%xmm0,%xmm0 + .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 + .byte 243,68,15,16,80,8 // movss 0x8(%rax),%xmm10 + .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 + .byte 68,15,89,201 // mulps %xmm1,%xmm9 + .byte 69,15,88,202 // addps %xmm10,%xmm9 + .byte 65,15,89,192 // mulps %xmm8,%xmm0 + .byte 65,15,88,193 // addps %xmm9,%xmm0 + .byte 243,68,15,16,72,12 // movss 0xc(%rax),%xmm9 + .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 + .byte 243,68,15,16,80,16 // movss 0x10(%rax),%xmm10 + .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 + .byte 243,68,15,16,88,20 // movss 0x14(%rax),%xmm11 + .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11 + .byte 68,15,89,209 // mulps %xmm1,%xmm10 + .byte 69,15,88,211 // addps %xmm11,%xmm10 + .byte 69,15,89,200 // mulps %xmm8,%xmm9 + .byte 69,15,88,202 // addps %xmm10,%xmm9 + .byte 243,68,15,16,80,24 // movss 0x18(%rax),%xmm10 + .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 + .byte 
243,68,15,16,88,28 // movss 0x1c(%rax),%xmm11 + .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11 + .byte 243,68,15,16,96,32 // movss 0x20(%rax),%xmm12 + .byte 69,15,198,228,0 // shufps $0x0,%xmm12,%xmm12 + .byte 68,15,89,217 // mulps %xmm1,%xmm11 + .byte 69,15,88,220 // addps %xmm12,%xmm11 + .byte 69,15,89,208 // mulps %xmm8,%xmm10 + .byte 69,15,88,211 // addps %xmm11,%xmm10 + .byte 65,15,83,202 // rcpps %xmm10,%xmm1 + .byte 15,89,193 // mulps %xmm1,%xmm0 + .byte 68,15,89,201 // mulps %xmm1,%xmm9 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 65,15,40,201 // movaps %xmm9,%xmm1 + .byte 255,224 // jmpq *%rax + .globl _sk_linear_gradient_2stops_sse41 _sk_linear_gradient_2stops_sse41: .byte 72,173 // lods %ds:(%rsi),%rax @@ -5556,6 +5715,47 @@ _sk_matrix_3x4_sse2: .byte 65,15,40,210 // movaps %xmm10,%xmm2 .byte 255,224 // jmpq *%rax +.globl _sk_matrix_perspective_sse2 +_sk_matrix_perspective_sse2: + .byte 68,15,40,192 // movaps %xmm0,%xmm8 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 243,15,16,0 // movss (%rax),%xmm0 + .byte 243,68,15,16,72,4 // movss 0x4(%rax),%xmm9 + .byte 15,198,192,0 // shufps $0x0,%xmm0,%xmm0 + .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 + .byte 243,68,15,16,80,8 // movss 0x8(%rax),%xmm10 + .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 + .byte 68,15,89,201 // mulps %xmm1,%xmm9 + .byte 69,15,88,202 // addps %xmm10,%xmm9 + .byte 65,15,89,192 // mulps %xmm8,%xmm0 + .byte 65,15,88,193 // addps %xmm9,%xmm0 + .byte 243,68,15,16,72,12 // movss 0xc(%rax),%xmm9 + .byte 69,15,198,201,0 // shufps $0x0,%xmm9,%xmm9 + .byte 243,68,15,16,80,16 // movss 0x10(%rax),%xmm10 + .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 + .byte 243,68,15,16,88,20 // movss 0x14(%rax),%xmm11 + .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11 + .byte 68,15,89,209 // mulps %xmm1,%xmm10 + .byte 69,15,88,211 // addps %xmm11,%xmm10 + .byte 69,15,89,200 // mulps %xmm8,%xmm9 + .byte 69,15,88,202 // addps %xmm10,%xmm9 + .byte 243,68,15,16,80,24 // movss 0x18(%rax),%xmm10 
+ .byte 69,15,198,210,0 // shufps $0x0,%xmm10,%xmm10 + .byte 243,68,15,16,88,28 // movss 0x1c(%rax),%xmm11 + .byte 69,15,198,219,0 // shufps $0x0,%xmm11,%xmm11 + .byte 243,68,15,16,96,32 // movss 0x20(%rax),%xmm12 + .byte 69,15,198,228,0 // shufps $0x0,%xmm12,%xmm12 + .byte 68,15,89,217 // mulps %xmm1,%xmm11 + .byte 69,15,88,220 // addps %xmm12,%xmm11 + .byte 69,15,89,208 // mulps %xmm8,%xmm10 + .byte 69,15,88,211 // addps %xmm11,%xmm10 + .byte 65,15,83,202 // rcpps %xmm10,%xmm1 + .byte 15,89,193 // mulps %xmm1,%xmm0 + .byte 68,15,89,201 // mulps %xmm1,%xmm9 + .byte 72,173 // lods %ds:(%rsi),%rax + .byte 65,15,40,201 // movaps %xmm9,%xmm1 + .byte 255,224 // jmpq *%rax + .globl _sk_linear_gradient_2stops_sse2 _sk_linear_gradient_2stops_sse2: .byte 72,173 // lods %ds:(%rsi),%rax diff --git a/src/jumper/SkJumper_generated_win.S b/src/jumper/SkJumper_generated_win.S index 7c38fc0..eb7359d 100644 --- a/src/jumper/SkJumper_generated_win.S +++ b/src/jumper/SkJumper_generated_win.S @@ -772,6 +772,30 @@ _sk_matrix_3x4_hsw LABEL PROC DB 197,124,41,210 ; vmovaps %ymm10,%ymm2 DB 255,224 ; jmpq *%rax +PUBLIC _sk_matrix_perspective_hsw +_sk_matrix_perspective_hsw LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9 + DB 196,98,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm10 + DB 196,66,117,184,209 ; vfmadd231ps %ymm9,%ymm1,%ymm10 + DB 196,66,125,184,208 ; vfmadd231ps %ymm8,%ymm0,%ymm10 + DB 196,98,125,24,64,12 ; vbroadcastss 0xc(%rax),%ymm8 + DB 196,98,125,24,72,16 ; vbroadcastss 0x10(%rax),%ymm9 + DB 196,98,125,24,88,20 ; vbroadcastss 0x14(%rax),%ymm11 + DB 196,66,117,184,217 ; vfmadd231ps %ymm9,%ymm1,%ymm11 + DB 196,66,125,184,216 ; vfmadd231ps %ymm8,%ymm0,%ymm11 + DB 196,98,125,24,64,24 ; vbroadcastss 0x18(%rax),%ymm8 + DB 196,98,125,24,72,28 ; vbroadcastss 0x1c(%rax),%ymm9 + DB 196,98,125,24,96,32 ; vbroadcastss 0x20(%rax),%ymm12 + DB 196,66,117,184,225 ; vfmadd231ps 
%ymm9,%ymm1,%ymm12 + DB 196,66,125,184,224 ; vfmadd231ps %ymm8,%ymm0,%ymm12 + DB 196,193,124,83,204 ; vrcpps %ymm12,%ymm1 + DB 197,172,89,193 ; vmulps %ymm1,%ymm10,%ymm0 + DB 197,164,89,201 ; vmulps %ymm1,%ymm11,%ymm1 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + PUBLIC _sk_linear_gradient_2stops_hsw _sk_linear_gradient_2stops_hsw LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax @@ -1799,6 +1823,36 @@ _sk_matrix_3x4_avx LABEL PROC DB 197,124,41,201 ; vmovaps %ymm9,%ymm1 DB 255,224 ; jmpq *%rax +PUBLIC _sk_matrix_perspective_avx +_sk_matrix_perspective_avx LABEL PROC + DB 72,173 ; lods %ds:(%rsi),%rax + DB 196,98,125,24,0 ; vbroadcastss (%rax),%ymm8 + DB 196,98,125,24,72,4 ; vbroadcastss 0x4(%rax),%ymm9 + DB 196,98,125,24,80,8 ; vbroadcastss 0x8(%rax),%ymm10 + DB 197,52,89,201 ; vmulps %ymm1,%ymm9,%ymm9 + DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9 + DB 197,60,89,192 ; vmulps %ymm0,%ymm8,%ymm8 + DB 196,65,60,88,193 ; vaddps %ymm9,%ymm8,%ymm8 + DB 196,98,125,24,72,12 ; vbroadcastss 0xc(%rax),%ymm9 + DB 196,98,125,24,80,16 ; vbroadcastss 0x10(%rax),%ymm10 + DB 196,98,125,24,88,20 ; vbroadcastss 0x14(%rax),%ymm11 + DB 197,44,89,209 ; vmulps %ymm1,%ymm10,%ymm10 + DB 196,65,44,88,211 ; vaddps %ymm11,%ymm10,%ymm10 + DB 197,52,89,200 ; vmulps %ymm0,%ymm9,%ymm9 + DB 196,65,52,88,202 ; vaddps %ymm10,%ymm9,%ymm9 + DB 196,98,125,24,80,24 ; vbroadcastss 0x18(%rax),%ymm10 + DB 196,98,125,24,88,28 ; vbroadcastss 0x1c(%rax),%ymm11 + DB 196,98,125,24,96,32 ; vbroadcastss 0x20(%rax),%ymm12 + DB 197,164,89,201 ; vmulps %ymm1,%ymm11,%ymm1 + DB 196,193,116,88,204 ; vaddps %ymm12,%ymm1,%ymm1 + DB 197,172,89,192 ; vmulps %ymm0,%ymm10,%ymm0 + DB 197,252,88,193 ; vaddps %ymm1,%ymm0,%ymm0 + DB 197,252,83,200 ; vrcpps %ymm0,%ymm1 + DB 197,188,89,193 ; vmulps %ymm1,%ymm8,%ymm0 + DB 197,180,89,201 ; vmulps %ymm1,%ymm9,%ymm1 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 255,224 ; jmpq *%rax + PUBLIC _sk_linear_gradient_2stops_avx _sk_linear_gradient_2stops_avx LABEL PROC DB 72,173 ; 
lods %ds:(%rsi),%rax @@ -2800,6 +2854,47 @@ _sk_matrix_3x4_sse41 LABEL PROC DB 65,15,40,210 ; movaps %xmm10,%xmm2 DB 255,224 ; jmpq *%rax +PUBLIC _sk_matrix_perspective_sse41 +_sk_matrix_perspective_sse41 LABEL PROC + DB 68,15,40,192 ; movaps %xmm0,%xmm8 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 243,15,16,0 ; movss (%rax),%xmm0 + DB 243,68,15,16,72,4 ; movss 0x4(%rax),%xmm9 + DB 15,198,192,0 ; shufps $0x0,%xmm0,%xmm0 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 243,68,15,16,80,8 ; movss 0x8(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 + DB 68,15,89,201 ; mulps %xmm1,%xmm9 + DB 69,15,88,202 ; addps %xmm10,%xmm9 + DB 65,15,89,192 ; mulps %xmm8,%xmm0 + DB 65,15,88,193 ; addps %xmm9,%xmm0 + DB 243,68,15,16,72,12 ; movss 0xc(%rax),%xmm9 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 243,68,15,16,80,16 ; movss 0x10(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 + DB 243,68,15,16,88,20 ; movss 0x14(%rax),%xmm11 + DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11 + DB 68,15,89,209 ; mulps %xmm1,%xmm10 + DB 69,15,88,211 ; addps %xmm11,%xmm10 + DB 69,15,89,200 ; mulps %xmm8,%xmm9 + DB 69,15,88,202 ; addps %xmm10,%xmm9 + DB 243,68,15,16,80,24 ; movss 0x18(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 + DB 243,68,15,16,88,28 ; movss 0x1c(%rax),%xmm11 + DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11 + DB 243,68,15,16,96,32 ; movss 0x20(%rax),%xmm12 + DB 69,15,198,228,0 ; shufps $0x0,%xmm12,%xmm12 + DB 68,15,89,217 ; mulps %xmm1,%xmm11 + DB 69,15,88,220 ; addps %xmm12,%xmm11 + DB 69,15,89,208 ; mulps %xmm8,%xmm10 + DB 69,15,88,211 ; addps %xmm11,%xmm10 + DB 65,15,83,202 ; rcpps %xmm10,%xmm1 + DB 15,89,193 ; mulps %xmm1,%xmm0 + DB 68,15,89,201 ; mulps %xmm1,%xmm9 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 65,15,40,201 ; movaps %xmm9,%xmm1 + DB 255,224 ; jmpq *%rax + PUBLIC _sk_linear_gradient_2stops_sse41 _sk_linear_gradient_2stops_sse41 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax @@ -3858,6 +3953,47 @@ _sk_matrix_3x4_sse2 LABEL 
PROC DB 65,15,40,210 ; movaps %xmm10,%xmm2 DB 255,224 ; jmpq *%rax +PUBLIC _sk_matrix_perspective_sse2 +_sk_matrix_perspective_sse2 LABEL PROC + DB 68,15,40,192 ; movaps %xmm0,%xmm8 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 243,15,16,0 ; movss (%rax),%xmm0 + DB 243,68,15,16,72,4 ; movss 0x4(%rax),%xmm9 + DB 15,198,192,0 ; shufps $0x0,%xmm0,%xmm0 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 243,68,15,16,80,8 ; movss 0x8(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 + DB 68,15,89,201 ; mulps %xmm1,%xmm9 + DB 69,15,88,202 ; addps %xmm10,%xmm9 + DB 65,15,89,192 ; mulps %xmm8,%xmm0 + DB 65,15,88,193 ; addps %xmm9,%xmm0 + DB 243,68,15,16,72,12 ; movss 0xc(%rax),%xmm9 + DB 69,15,198,201,0 ; shufps $0x0,%xmm9,%xmm9 + DB 243,68,15,16,80,16 ; movss 0x10(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 + DB 243,68,15,16,88,20 ; movss 0x14(%rax),%xmm11 + DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11 + DB 68,15,89,209 ; mulps %xmm1,%xmm10 + DB 69,15,88,211 ; addps %xmm11,%xmm10 + DB 69,15,89,200 ; mulps %xmm8,%xmm9 + DB 69,15,88,202 ; addps %xmm10,%xmm9 + DB 243,68,15,16,80,24 ; movss 0x18(%rax),%xmm10 + DB 69,15,198,210,0 ; shufps $0x0,%xmm10,%xmm10 + DB 243,68,15,16,88,28 ; movss 0x1c(%rax),%xmm11 + DB 69,15,198,219,0 ; shufps $0x0,%xmm11,%xmm11 + DB 243,68,15,16,96,32 ; movss 0x20(%rax),%xmm12 + DB 69,15,198,228,0 ; shufps $0x0,%xmm12,%xmm12 + DB 68,15,89,217 ; mulps %xmm1,%xmm11 + DB 69,15,88,220 ; addps %xmm12,%xmm11 + DB 69,15,89,208 ; mulps %xmm8,%xmm10 + DB 69,15,88,211 ; addps %xmm11,%xmm10 + DB 65,15,83,202 ; rcpps %xmm10,%xmm1 + DB 15,89,193 ; mulps %xmm1,%xmm0 + DB 68,15,89,201 ; mulps %xmm1,%xmm9 + DB 72,173 ; lods %ds:(%rsi),%rax + DB 65,15,40,201 ; movaps %xmm9,%xmm1 + DB 255,224 ; jmpq *%rax + PUBLIC _sk_linear_gradient_2stops_sse2 _sk_linear_gradient_2stops_sse2 LABEL PROC DB 72,173 ; lods %ds:(%rsi),%rax diff --git a/src/jumper/SkJumper_stages.cpp b/src/jumper/SkJumper_stages.cpp index dfcd786..2b90fe2 100644 --- 
a/src/jumper/SkJumper_stages.cpp +++ b/src/jumper/SkJumper_stages.cpp @@ -859,6 +859,16 @@ STAGE(matrix_3x4) { g = G; b = B; } +STAGE(matrix_perspective) { + // N.B. Unlike the other matrix_ stages, this matrix is row-major: m[0..2], m[3..5] and m[6..8] are its three rows. + auto m = (const float*)ctx; // ctx is read as nine consecutive floats, m[0] through m[8] + // R, G and Z are the products of rows 0, 1 and 2 of m with (r, g, 1). NOTE(review): r and g presumably carry x and y here; confirm. + auto R = mad(r,m[0], mad(g,m[1], m[2])), + G = mad(r,m[3], mad(g,m[4], m[5])), + Z = mad(r,m[6], mad(g,m[7], m[8])); + r = R * rcp(Z); // perspective divide; the generated code for rcp() is a reciprocal estimate (vrcpps/rcpps on x86, frecpe/frecps or vrecpe/vrecps on ARM), so the quotient is approximate + g = G * rcp(Z); // only r and g are written by this stage +} STAGE(linear_gradient_2stops) { struct Ctx { F4 c0, dc; };