SkJumper: set_rgb and swap_rb
author Mike Klein <mtklein@chromium.org>
Wed, 22 Feb 2017 19:17:32 +0000 (14:17 -0500)
committer Skia Commit-Bot <skia-commit-bot@chromium.org>
Wed, 22 Feb 2017 20:03:36 +0000 (20:03 +0000)
swap_rb is a big limiting factor on Windows and Linux.
set_rgb just happened to be nearby and easy.

Change-Id: Ic529c7578eeb278476821090127fa8fb1f70c04f
Reviewed-on: https://skia-review.googlesource.com/8859
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

src/jumper/SkJumper.cpp
src/jumper/SkJumper_generated.S
src/jumper/SkJumper_generated_win.S
src/jumper/SkJumper_stages.cpp

index 3c7aca6..cdcbb89 100644 (file)
@@ -37,6 +37,8 @@ static K kConstants = {
     M(clamp_0)        \
     M(clamp_1)        \
     M(clamp_a)        \
+    M(set_rgb)        \
+    M(swap_rb)        \
     M(swap)           \
     M(move_src_dst)   \
     M(move_dst_src)   \
index af76618..5d7ec00 100644 (file)
@@ -171,6 +171,24 @@ _sk_clamp_a_aarch64:
   .long  0x4ea3f442                                      // fmin          v2.4s, v2.4s, v3.4s
   .long  0xd61f0060                                      // br            x3
 
+.globl _sk_set_rgb_aarch64
+_sk_set_rgb_aarch64:                                    // Broadcasts three consecutive floats, found via the first pointer at [x1], into v0, v1 and v2.
+  .long  0xa8c10c28                                      // ldp           x8, x3, [x1],#16  (x8 = data pointer, x3 = branch target; x1 advances 16 bytes)
+  .long  0x91002109                                      // add           x9, x8, #0x8  (x9 = address of the float at offset 8, taken before x8 is post-incremented)
+  .long  0x4ddfc900                                      // ld1r          {v0.4s}, [x8], #4  (v0 = float at offset 0 in all four lanes; x8 += 4)
+  .long  0x4d40c922                                      // ld1r          {v2.4s}, [x9]  (v2 = float at offset 8 in all four lanes)
+  .long  0x4d40c901                                      // ld1r          {v1.4s}, [x8]  (v1 = float at offset 4 in all four lanes)
+  .long  0xd61f0060                                      // br            x3  (continue at the address loaded by the ldp; presumably the next stage)
+
+.globl _sk_swap_rb_aarch64
+_sk_swap_rb_aarch64:                                    // Exchanges v0 and v2 using v16 as a temporary, then branches onward.
+  .long  0xf9400423                                      // ldr           x3, [x1,#8]  (x3 = second 8-byte slot at x1; the first slot is not read here)
+  .long  0x4ea01c10                                      // mov           v16.16b, v0.16b  (save the old v0)
+  .long  0x91004021                                      // add           x1, x1, #0x10  (step x1 past both 8-byte slots)
+  .long  0x4ea21c40                                      // mov           v0.16b, v2.16b  (v0 = old v2)
+  .long  0x4eb01e02                                      // mov           v2.16b, v16.16b  (v2 = old v0)
+  .long  0xd61f0060                                      // br            x3  (continue at the loaded address; presumably the next stage)
+
 .globl _sk_swap_aarch64
 _sk_swap_aarch64:
   .long  0xf9400423                                      // ldr           x3, [x1,#8]
@@ -762,6 +780,29 @@ _sk_clamp_a_vfp4:
   .long  0xf2222f03                                      // vmin.f32      d2, d2, d3
   .long  0xe12fff13                                      // bx            r3
 
+.globl _sk_set_rgb_vfp4
+_sk_set_rgb_vfp4:                                       // Broadcasts three consecutive floats, found via the first pointer at [r1], into d0, d1 and d2.
+  .long  0xe92d4800                                      // push          {fp, lr}  (lr is used as a scratch pointer below, so it is saved first)
+  .long  0xe591e000                                      // ldr           lr, [r1]  (lr = data pointer)
+  .long  0xe591c004                                      // ldr           ip, [r1, #4]  (ip = branch target)
+  .long  0xe2811008                                      // add           r1, r1, #8  (step r1 past both 4-byte slots)
+  .long  0xe28e3008                                      // add           r3, lr, #8  (r3 = address of the float at offset 8)
+  .long  0xf4ae0c9f                                      // vld1.32       {d0[]}, [lr :32]  (d0 = float at offset 0 in both lanes)
+  .long  0xf4a32c9f                                      // vld1.32       {d2[]}, [r3 :32]  (d2 = float at offset 8 in both lanes)
+  .long  0xe28e3004                                      // add           r3, lr, #4  (r3 = address of the float at offset 4)
+  .long  0xf4a31c9f                                      // vld1.32       {d1[]}, [r3 :32]  (d1 = float at offset 4 in both lanes)
+  .long  0xe8bd4800                                      // pop           {fp, lr}  (restore the registers saved on entry)
+  .long  0xe12fff1c                                      // bx            ip  (continue at the loaded address; presumably the next stage)
+
+.globl _sk_swap_rb_vfp4
+_sk_swap_rb_vfp4:                                       // Exchanges d0 and d2 using d16 as a temporary, then branches onward.
+  .long  0xeef00b40                                      // vmov.f64      d16, d0  (save the old d0)
+  .long  0xe5913004                                      // ldr           r3, [r1, #4]  (r3 = second 4-byte slot at r1; the first slot is not read here)
+  .long  0xe2811008                                      // add           r1, r1, #8  (step r1 past both 4-byte slots)
+  .long  0xeeb00b42                                      // vmov.f64      d0, d2  (d0 = old d2)
+  .long  0xeeb02b60                                      // vmov.f64      d2, d16  (d2 = old d0)
+  .long  0xe12fff13                                      // bx            r3  (continue at the loaded address; presumably the next stage)
+
 .globl _sk_swap_vfp4
 _sk_swap_vfp4:
   .long  0xeef00b43                                      // vmov.f64      d16, d3
@@ -1414,6 +1455,24 @@ _sk_clamp_a_hsw:
   .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
   .byte  0xff,0xe0                                       // jmpq          *%rax
 
+.globl _sk_set_rgb_hsw
+_sk_set_rgb_hsw:                                        // Broadcasts three consecutive floats, found via the pointer at (%rsi), into ymm0, ymm1 and ymm2.
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = data pointer; rsi advances 8 bytes)
+  .byte  0xc4,0xe2,0x7d,0x18,0x00                        // vbroadcastss  (%rax),%ymm0  (ymm0 = float at offset 0 in every lane)
+  .byte  0xc4,0xe2,0x7d,0x18,0x48,0x04                   // vbroadcastss  0x4(%rax),%ymm1  (ymm1 = float at offset 4 in every lane)
+  .byte  0xc4,0xe2,0x7d,0x18,0x50,0x08                   // vbroadcastss  0x8(%rax),%ymm2  (ymm2 = float at offset 8 in every lane)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  .byte  0xff,0xe0                                       // jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
+.globl _sk_swap_rb_hsw
+_sk_swap_rb_hsw:                                        // Exchanges ymm0 and ymm2 using ymm8 as a temporary, then jumps onward.
+  .byte  0xc5,0x7c,0x28,0xc0                             // vmovaps       %ymm0,%ymm8  (save the old ymm0)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (value is overwritten by the next lods; only the rsi advance matters)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  .byte  0xc5,0xfc,0x28,0xc2                             // vmovaps       %ymm2,%ymm0  (ymm0 = old ymm2)
+  .byte  0xc5,0x7c,0x29,0xc2                             // vmovaps       %ymm8,%ymm2  (ymm2 = old ymm0)
+  .byte  0xff,0xe0                                       // jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
 .globl _sk_swap_hsw
 _sk_swap_hsw:
   .byte  0xc5,0x7c,0x28,0xc3                             // vmovaps       %ymm3,%ymm8
@@ -1972,6 +2031,27 @@ _sk_clamp_a_sse41:
   .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
   .byte  0xff,0xe0                                       // jmpq          *%rax
 
+.globl _sk_set_rgb_sse41
+_sk_set_rgb_sse41:                                      // Broadcasts three consecutive floats, found via the pointer at (%rsi), into xmm0, xmm1 and xmm2.
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = data pointer; rsi advances 8 bytes)
+  .byte  0xf3,0x0f,0x10,0x00                             // movss         (%rax),%xmm0  (low lane of xmm0 = float at offset 0)
+  .byte  0xf3,0x0f,0x10,0x48,0x04                        // movss         0x4(%rax),%xmm1  (low lane of xmm1 = float at offset 4)
+  .byte  0x0f,0xc6,0xc0,0x00                             // shufps        $0x0,%xmm0,%xmm0  (copy the low lane of xmm0 to all four lanes)
+  .byte  0x0f,0xc6,0xc9,0x00                             // shufps        $0x0,%xmm1,%xmm1  (copy the low lane of xmm1 to all four lanes)
+  .byte  0xf3,0x0f,0x10,0x50,0x08                        // movss         0x8(%rax),%xmm2  (low lane of xmm2 = float at offset 8)
+  .byte  0x0f,0xc6,0xd2,0x00                             // shufps        $0x0,%xmm2,%xmm2  (copy the low lane of xmm2 to all four lanes)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  .byte  0xff,0xe0                                       // jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
+.globl _sk_swap_rb_sse41
+_sk_swap_rb_sse41:                                      // Exchanges xmm0 and xmm2 using xmm8 as a temporary, then jumps onward.
+  .byte  0x44,0x0f,0x28,0xc0                             // movaps        %xmm0,%xmm8  (save the old xmm0)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (value is overwritten by the next lods; only the rsi advance matters)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  .byte  0x0f,0x28,0xc2                                  // movaps        %xmm2,%xmm0  (xmm0 = old xmm2)
+  .byte  0x41,0x0f,0x28,0xd0                             // movaps        %xmm8,%xmm2  (xmm2 = old xmm0)
+  .byte  0xff,0xe0                                       // jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
 .globl _sk_swap_sse41
 _sk_swap_sse41:
   .byte  0x44,0x0f,0x28,0xc3                             // movaps        %xmm3,%xmm8
@@ -2697,6 +2777,27 @@ _sk_clamp_a_sse2:
   .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax
   .byte  0xff,0xe0                                       // jmpq          *%rax
 
+.globl _sk_set_rgb_sse2
+_sk_set_rgb_sse2:                                       // Broadcasts three consecutive floats, found via the pointer at (%rsi), into xmm0, xmm1 and xmm2.
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = data pointer; rsi advances 8 bytes)
+  .byte  0xf3,0x0f,0x10,0x00                             // movss         (%rax),%xmm0  (low lane of xmm0 = float at offset 0)
+  .byte  0xf3,0x0f,0x10,0x48,0x04                        // movss         0x4(%rax),%xmm1  (low lane of xmm1 = float at offset 4)
+  .byte  0x0f,0xc6,0xc0,0x00                             // shufps        $0x0,%xmm0,%xmm0  (copy the low lane of xmm0 to all four lanes)
+  .byte  0x0f,0xc6,0xc9,0x00                             // shufps        $0x0,%xmm1,%xmm1  (copy the low lane of xmm1 to all four lanes)
+  .byte  0xf3,0x0f,0x10,0x50,0x08                        // movss         0x8(%rax),%xmm2  (low lane of xmm2 = float at offset 8)
+  .byte  0x0f,0xc6,0xd2,0x00                             // shufps        $0x0,%xmm2,%xmm2  (copy the low lane of xmm2 to all four lanes)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  .byte  0xff,0xe0                                       // jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
+.globl _sk_swap_rb_sse2
+_sk_swap_rb_sse2:                                       // Exchanges xmm0 and xmm2 using xmm8 as a temporary, then jumps onward.
+  .byte  0x44,0x0f,0x28,0xc0                             // movaps        %xmm0,%xmm8  (save the old xmm0)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (value is overwritten by the next lods; only the rsi advance matters)
+  .byte  0x48,0xad                                       // lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  .byte  0x0f,0x28,0xc2                                  // movaps        %xmm2,%xmm0  (xmm0 = old xmm2)
+  .byte  0x41,0x0f,0x28,0xd0                             // movaps        %xmm8,%xmm2  (xmm2 = old xmm0)
+  .byte  0xff,0xe0                                       // jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
 .globl _sk_swap_sse2
 _sk_swap_sse2:
   .byte  0x44,0x0f,0x28,0xc3                             // movaps        %xmm3,%xmm8
index d681d24..d2078b6 100644 (file)
@@ -190,6 +190,24 @@ _sk_clamp_a_hsw LABEL PROC
   DB  72,173                                          ; lods          %ds:(%rsi),%rax
   DB  255,224                                         ; jmpq          *%rax
 
+PUBLIC _sk_set_rgb_hsw
+_sk_set_rgb_hsw LABEL PROC                            ; Broadcasts three consecutive floats, found via the pointer at (%rsi), into ymm0, ymm1 and ymm2.
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = data pointer; rsi advances 8 bytes)
+  DB  196,226,125,24,0                                ; vbroadcastss  (%rax),%ymm0  (ymm0 = float at offset 0 in every lane)
+  DB  196,226,125,24,72,4                             ; vbroadcastss  0x4(%rax),%ymm1  (ymm1 = float at offset 4 in every lane)
+  DB  196,226,125,24,80,8                             ; vbroadcastss  0x8(%rax),%ymm2  (ymm2 = float at offset 8 in every lane)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  DB  255,224                                         ; jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
+PUBLIC _sk_swap_rb_hsw
+_sk_swap_rb_hsw LABEL PROC                            ; Exchanges ymm0 and ymm2 using ymm8 as a temporary, then jumps onward.
+  DB  197,124,40,192                                  ; vmovaps       %ymm0,%ymm8  (save the old ymm0)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (value is overwritten by the next lods; only the rsi advance matters)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  DB  197,252,40,194                                  ; vmovaps       %ymm2,%ymm0  (ymm0 = old ymm2)
+  DB  197,124,41,194                                  ; vmovaps       %ymm8,%ymm2  (ymm2 = old ymm0)
+  DB  255,224                                         ; jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
 PUBLIC _sk_swap_hsw
 _sk_swap_hsw LABEL PROC
   DB  197,124,40,195                                  ; vmovaps       %ymm3,%ymm8
@@ -775,6 +793,27 @@ _sk_clamp_a_sse41 LABEL PROC
   DB  72,173                                          ; lods          %ds:(%rsi),%rax
   DB  255,224                                         ; jmpq          *%rax
 
+PUBLIC _sk_set_rgb_sse41
+_sk_set_rgb_sse41 LABEL PROC                          ; Broadcasts three consecutive floats, found via the pointer at (%rsi), into xmm0, xmm1 and xmm2.
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = data pointer; rsi advances 8 bytes)
+  DB  243,15,16,0                                     ; movss         (%rax),%xmm0  (low lane of xmm0 = float at offset 0)
+  DB  243,15,16,72,4                                  ; movss         0x4(%rax),%xmm1  (low lane of xmm1 = float at offset 4)
+  DB  15,198,192,0                                    ; shufps        $0x0,%xmm0,%xmm0  (copy the low lane of xmm0 to all four lanes)
+  DB  15,198,201,0                                    ; shufps        $0x0,%xmm1,%xmm1  (copy the low lane of xmm1 to all four lanes)
+  DB  243,15,16,80,8                                  ; movss         0x8(%rax),%xmm2  (low lane of xmm2 = float at offset 8)
+  DB  15,198,210,0                                    ; shufps        $0x0,%xmm2,%xmm2  (copy the low lane of xmm2 to all four lanes)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  DB  255,224                                         ; jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
+PUBLIC _sk_swap_rb_sse41
+_sk_swap_rb_sse41 LABEL PROC                          ; Exchanges xmm0 and xmm2 using xmm8 as a temporary, then jumps onward.
+  DB  68,15,40,192                                    ; movaps        %xmm0,%xmm8  (save the old xmm0)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (value is overwritten by the next lods; only the rsi advance matters)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  DB  15,40,194                                       ; movaps        %xmm2,%xmm0  (xmm0 = old xmm2)
+  DB  65,15,40,208                                    ; movaps        %xmm8,%xmm2  (xmm2 = old xmm0)
+  DB  255,224                                         ; jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
 PUBLIC _sk_swap_sse41
 _sk_swap_sse41 LABEL PROC
   DB  68,15,40,195                                    ; movaps        %xmm3,%xmm8
@@ -1527,6 +1566,27 @@ _sk_clamp_a_sse2 LABEL PROC
   DB  72,173                                          ; lods          %ds:(%rsi),%rax
   DB  255,224                                         ; jmpq          *%rax
 
+PUBLIC _sk_set_rgb_sse2
+_sk_set_rgb_sse2 LABEL PROC                           ; Broadcasts three consecutive floats, found via the pointer at (%rsi), into xmm0, xmm1 and xmm2.
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = data pointer; rsi advances 8 bytes)
+  DB  243,15,16,0                                     ; movss         (%rax),%xmm0  (low lane of xmm0 = float at offset 0)
+  DB  243,15,16,72,4                                  ; movss         0x4(%rax),%xmm1  (low lane of xmm1 = float at offset 4)
+  DB  15,198,192,0                                    ; shufps        $0x0,%xmm0,%xmm0  (copy the low lane of xmm0 to all four lanes)
+  DB  15,198,201,0                                    ; shufps        $0x0,%xmm1,%xmm1  (copy the low lane of xmm1 to all four lanes)
+  DB  243,15,16,80,8                                  ; movss         0x8(%rax),%xmm2  (low lane of xmm2 = float at offset 8)
+  DB  15,198,210,0                                    ; shufps        $0x0,%xmm2,%xmm2  (copy the low lane of xmm2 to all four lanes)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  DB  255,224                                         ; jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
+PUBLIC _sk_swap_rb_sse2
+_sk_swap_rb_sse2 LABEL PROC                           ; Exchanges xmm0 and xmm2 using xmm8 as a temporary, then jumps onward.
+  DB  68,15,40,192                                    ; movaps        %xmm0,%xmm8  (save the old xmm0)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (value is overwritten by the next lods; only the rsi advance matters)
+  DB  72,173                                          ; lods          %ds:(%rsi),%rax  (rax = branch target; rsi advances 8 bytes)
+  DB  15,40,194                                       ; movaps        %xmm2,%xmm0  (xmm0 = old xmm2)
+  DB  65,15,40,208                                    ; movaps        %xmm8,%xmm2  (xmm2 = old xmm0)
+  DB  255,224                                         ; jmpq          *%rax  (continue at the loaded address; presumably the next stage)
+
 PUBLIC _sk_swap_sse2
 _sk_swap_sse2 LABEL PROC
   DB  68,15,40,195                                    ; movaps        %xmm3,%xmm8
index 0a5d702..a691f2b 100644 (file)
@@ -321,6 +321,18 @@ STAGE(clamp_a) {
     b = min(b, a);
 }
 
+STAGE(set_rgb) {                     // Overwrites r, g and b with the first three floats at ctx; a is not assigned here.
+    auto rgb = (const float*)ctx;    // ctx is reinterpreted as a pointer to floats
+    r = rgb[0];
+    g = rgb[1];
+    b = rgb[2];
+}
+STAGE(swap_rb) {     // Exchanges the r and b values; g and a are not assigned here.
+    auto tmp = r;    // keep the old r so it can be stored into b
+    r = b;
+    b = tmp;
+}
+
 STAGE(swap) {
     auto swap = [](F& v, F& dv) {
         auto tmp = v;