jumper, split store_f16 into to_half, store4
author     Mike Klein <mtklein@chromium.org>
           Tue, 4 Apr 2017 14:24:56 +0000 (10:24 -0400)
committer  Skia Commit-Bot <skia-commit-bot@chromium.org>
           Tue, 4 Apr 2017 17:29:38 +0000 (17:29 +0000)
Pretty much the same deal as the last CL, just going the other direction:
split store_f16 into to_half() and store4().  Platforms that had fused
strategies here generate slightly less optimal code, but the code's easier
to follow, maintain, and reuse.

Also adds widen_cast() to encapsulate the fairly common pattern of
expanding one of our logical vector types (e.g. 8-byte U16) up to the
width of the physical vector type (e.g. 16-byte __m128i).  This
operation is deeply understood by Clang, and is often a no-op.

I could make bit_cast() do this, but it seems clearer to have two names.
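
For instance, here's a before/after sketch of the pattern widen_cast()
replaces, mirroring the SSE2 from_half()/pack() changes below:

    // Before: widen an 8-byte U16 into the low half of a 16-byte __m128i by hand.
    __m128i v;
    memcpy(&v, &h, sizeof(h));

    // After: the same expansion, spelled once; Clang often compiles it to a no-op.
    auto v = widen_cast<__m128i>(h);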

Change-Id: I7ba5bb4746acfcaa6d486379f67e07baee3820b2
Reviewed-on: https://skia-review.googlesource.com/11204
Reviewed-by: Herb Derby <herb@google.com>
Commit-Queue: Mike Klein <mtklein@chromium.org>

src/jumper/SkJumper_generated.S
src/jumper/SkJumper_generated_win.S
src/jumper/SkJumper_misc.h
src/jumper/SkJumper_stages.cpp
src/jumper/SkJumper_vectors.h

index bf724d2..db82770 100644 (file)
@@ -3139,18 +3139,19 @@ _sk_load_f16_vfp4:
 HIDDEN _sk_store_f16_vfp4
 .globl _sk_store_f16_vfp4
 _sk_store_f16_vfp4:
-  .long  0xeef00b41                          // vmov.f64      d16, d1
-  .long  0xeef03b42                          // vmov.f64      d19, d2
-  .long  0xf2631113                          // vorr          d17, d3, d3
-  .long  0xf2602110                          // vorr          d18, d0, d0
-  .long  0xf3fa00a1                          // vtrn.32       d16, d17
-  .long  0xf3f61620                          // vcvt.f16.f32  d17, q8
-  .long  0xf3fa20a3                          // vtrn.32       d18, d19
+  .long  0xf2630113                          // vorr          d16, d3, d3
   .long  0xe5913000                          // ldr           r3, [r1]
-  .long  0xf3f60622                          // vcvt.f16.f32  d16, q9
+  .long  0xf2612111                          // vorr          d18, d1, d1
+  .long  0xf3f67620                          // vcvt.f16.f32  d23, q8
   .long  0xe5933000                          // ldr           r3, [r3]
+  .long  0xf3f66602                          // vcvt.f16.f32  d22, q1
   .long  0xe0833180                          // add           r3, r3, r0, lsl #3
-  .long  0xf443084f                          // vst2.16       {d16-d17}, [r3]
+  .long  0xf3f65622                          // vcvt.f16.f32  d21, q9
+  .long  0xf3f64600                          // vcvt.f16.f32  d20, q0
+  .long  0xf22211b2                          // vorr          d1, d18, d18
+  .long  0xf22031b0                          // vorr          d3, d16, d16
+  .long  0xf4c3470d                          // vst4.16       {d20[0],d21[0],d22[0],d23[0]}, [r3]!
+  .long  0xf4c3474f                          // vst4.16       {d20[1],d21[1],d22[1],d23[1]}, [r3]
   .long  0xe2813008                          // add           r3, r1, #8
   .long  0xe591c004                          // ldr           ip, [r1, #4]
   .long  0xe1a01003                          // mov           r1, r3
@@ -3193,7 +3194,6 @@ _sk_clamp_y_vfp4:
   .long  0xf26218a1                          // vadd.i32      d17, d18, d17
   .long  0xf2201fa1                          // vmin.f32      d1, d16, d17
   .long  0xe12fff1c                          // bx            ip
-  .long  0xe320f000                          // nop           {0}
 
 HIDDEN _sk_repeat_x_vfp4
 .globl _sk_repeat_x_vfp4
@@ -6907,7 +6907,7 @@ _sk_lerp_565_avx:
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
-  .byte  233,255,255,255,225                 // jmpq          ffffffffe2001208 <_sk_linear_gradient_2stops_avx+0xffffffffe1ffeb0f>
+  .byte  233,255,255,255,225                 // jmpq          ffffffffe2001208 <_sk_linear_gradient_2stops_avx+0xffffffffe1ffeb13>
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
   .byte  255                                 // (bad)
@@ -7876,32 +7876,32 @@ _sk_store_f16_avx:
   .byte  196,67,125,25,202,1                 // vextractf128  $0x1,%ymm9,%xmm10
   .byte  196,193,41,114,210,13               // vpsrld        $0xd,%xmm10,%xmm10
   .byte  196,193,49,114,209,13               // vpsrld        $0xd,%xmm9,%xmm9
-  .byte  197,60,89,217                       // vmulps        %ymm1,%ymm8,%ymm11
+  .byte  196,66,49,43,202                    // vpackusdw     %xmm10,%xmm9,%xmm9
+  .byte  197,60,89,209                       // vmulps        %ymm1,%ymm8,%ymm10
+  .byte  196,67,125,25,211,1                 // vextractf128  $0x1,%ymm10,%xmm11
+  .byte  196,193,33,114,211,13               // vpsrld        $0xd,%xmm11,%xmm11
+  .byte  196,193,41,114,210,13               // vpsrld        $0xd,%xmm10,%xmm10
+  .byte  196,66,41,43,211                    // vpackusdw     %xmm11,%xmm10,%xmm10
+  .byte  197,60,89,218                       // vmulps        %ymm2,%ymm8,%ymm11
   .byte  196,67,125,25,220,1                 // vextractf128  $0x1,%ymm11,%xmm12
   .byte  196,193,25,114,212,13               // vpsrld        $0xd,%xmm12,%xmm12
   .byte  196,193,33,114,211,13               // vpsrld        $0xd,%xmm11,%xmm11
-  .byte  197,60,89,234                       // vmulps        %ymm2,%ymm8,%ymm13
-  .byte  196,67,125,25,238,1                 // vextractf128  $0x1,%ymm13,%xmm14
-  .byte  196,193,9,114,214,13                // vpsrld        $0xd,%xmm14,%xmm14
-  .byte  196,193,17,114,213,13               // vpsrld        $0xd,%xmm13,%xmm13
+  .byte  196,66,33,43,220                    // vpackusdw     %xmm12,%xmm11,%xmm11
   .byte  197,60,89,195                       // vmulps        %ymm3,%ymm8,%ymm8
-  .byte  196,67,125,25,199,1                 // vextractf128  $0x1,%ymm8,%xmm15
-  .byte  196,193,1,114,215,13                // vpsrld        $0xd,%xmm15,%xmm15
+  .byte  196,67,125,25,196,1                 // vextractf128  $0x1,%ymm8,%xmm12
+  .byte  196,193,25,114,212,13               // vpsrld        $0xd,%xmm12,%xmm12
   .byte  196,193,57,114,208,13               // vpsrld        $0xd,%xmm8,%xmm8
-  .byte  196,193,33,115,251,2                // vpslldq       $0x2,%xmm11,%xmm11
-  .byte  196,65,33,235,201                   // vpor          %xmm9,%xmm11,%xmm9
-  .byte  196,193,33,115,252,2                // vpslldq       $0x2,%xmm12,%xmm11
-  .byte  196,65,33,235,226                   // vpor          %xmm10,%xmm11,%xmm12
-  .byte  196,193,57,115,248,2                // vpslldq       $0x2,%xmm8,%xmm8
-  .byte  196,65,57,235,197                   // vpor          %xmm13,%xmm8,%xmm8
-  .byte  196,193,41,115,255,2                // vpslldq       $0x2,%xmm15,%xmm10
-  .byte  196,65,41,235,238                   // vpor          %xmm14,%xmm10,%xmm13
-  .byte  196,65,49,98,216                    // vpunpckldq    %xmm8,%xmm9,%xmm11
-  .byte  196,65,49,106,208                   // vpunpckhdq    %xmm8,%xmm9,%xmm10
-  .byte  196,65,25,98,205                    // vpunpckldq    %xmm13,%xmm12,%xmm9
-  .byte  196,65,25,106,197                   // vpunpckhdq    %xmm13,%xmm12,%xmm8
+  .byte  196,66,57,43,196                    // vpackusdw     %xmm12,%xmm8,%xmm8
+  .byte  196,65,49,97,226                    // vpunpcklwd    %xmm10,%xmm9,%xmm12
+  .byte  196,65,49,105,234                   // vpunpckhwd    %xmm10,%xmm9,%xmm13
+  .byte  196,65,33,97,200                    // vpunpcklwd    %xmm8,%xmm11,%xmm9
+  .byte  196,65,33,105,192                   // vpunpckhwd    %xmm8,%xmm11,%xmm8
+  .byte  196,65,25,98,217                    // vpunpckldq    %xmm9,%xmm12,%xmm11
+  .byte  196,65,25,106,209                   // vpunpckhdq    %xmm9,%xmm12,%xmm10
+  .byte  196,65,17,98,200                    // vpunpckldq    %xmm8,%xmm13,%xmm9
+  .byte  196,65,17,106,192                   // vpunpckhdq    %xmm8,%xmm13,%xmm8
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,31                              // jne           2142 <_sk_store_f16_avx+0xd6>
+  .byte  117,31                              // jne           213e <_sk_store_f16_avx+0xd2>
   .byte  196,65,120,17,28,248                // vmovups       %xmm11,(%r8,%rdi,8)
   .byte  196,65,120,17,84,248,16             // vmovups       %xmm10,0x10(%r8,%rdi,8)
   .byte  196,65,120,17,76,248,32             // vmovups       %xmm9,0x20(%r8,%rdi,8)
@@ -7910,22 +7910,22 @@ _sk_store_f16_avx:
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,214,28,248               // vmovq         %xmm11,(%r8,%rdi,8)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            213e <_sk_store_f16_avx+0xd2>
+  .byte  116,240                             // je            213a <_sk_store_f16_avx+0xce>
   .byte  196,65,121,23,92,248,8              // vmovhpd       %xmm11,0x8(%r8,%rdi,8)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            213e <_sk_store_f16_avx+0xd2>
+  .byte  114,227                             // jb            213a <_sk_store_f16_avx+0xce>
   .byte  196,65,121,214,84,248,16            // vmovq         %xmm10,0x10(%r8,%rdi,8)
-  .byte  116,218                             // je            213e <_sk_store_f16_avx+0xd2>
+  .byte  116,218                             // je            213a <_sk_store_f16_avx+0xce>
   .byte  196,65,121,23,84,248,24             // vmovhpd       %xmm10,0x18(%r8,%rdi,8)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            213e <_sk_store_f16_avx+0xd2>
+  .byte  114,205                             // jb            213a <_sk_store_f16_avx+0xce>
   .byte  196,65,121,214,76,248,32            // vmovq         %xmm9,0x20(%r8,%rdi,8)
-  .byte  116,196                             // je            213e <_sk_store_f16_avx+0xd2>
+  .byte  116,196                             // je            213a <_sk_store_f16_avx+0xce>
   .byte  196,65,121,23,76,248,40             // vmovhpd       %xmm9,0x28(%r8,%rdi,8)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,183                             // jb            213e <_sk_store_f16_avx+0xd2>
+  .byte  114,183                             // jb            213a <_sk_store_f16_avx+0xce>
   .byte  196,65,121,214,68,248,48            // vmovq         %xmm8,0x30(%r8,%rdi,8)
-  .byte  235,174                             // jmp           213e <_sk_store_f16_avx+0xd2>
+  .byte  235,174                             // jmp           213a <_sk_store_f16_avx+0xce>
 
 HIDDEN _sk_store_f32_avx
 .globl _sk_store_f32_avx
@@ -7942,7 +7942,7 @@ _sk_store_f32_avx:
   .byte  196,65,37,20,196                    // vunpcklpd     %ymm12,%ymm11,%ymm8
   .byte  196,65,37,21,220                    // vunpckhpd     %ymm12,%ymm11,%ymm11
   .byte  72,133,201                          // test          %rcx,%rcx
-  .byte  117,55                              // jne           21fd <_sk_store_f32_avx+0x6d>
+  .byte  117,55                              // jne           21f9 <_sk_store_f32_avx+0x6d>
   .byte  196,67,45,24,225,1                  // vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   .byte  196,67,61,24,235,1                  // vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   .byte  196,67,45,6,201,49                  // vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -7955,22 +7955,22 @@ _sk_store_f32_avx:
   .byte  255,224                             // jmpq          *%rax
   .byte  196,65,121,17,20,128                // vmovupd       %xmm10,(%r8,%rax,4)
   .byte  72,131,249,1                        // cmp           $0x1,%rcx
-  .byte  116,240                             // je            21f9 <_sk_store_f32_avx+0x69>
+  .byte  116,240                             // je            21f5 <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,76,128,16             // vmovupd       %xmm9,0x10(%r8,%rax,4)
   .byte  72,131,249,3                        // cmp           $0x3,%rcx
-  .byte  114,227                             // jb            21f9 <_sk_store_f32_avx+0x69>
+  .byte  114,227                             // jb            21f5 <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,68,128,32             // vmovupd       %xmm8,0x20(%r8,%rax,4)
-  .byte  116,218                             // je            21f9 <_sk_store_f32_avx+0x69>
+  .byte  116,218                             // je            21f5 <_sk_store_f32_avx+0x69>
   .byte  196,65,121,17,92,128,48             // vmovupd       %xmm11,0x30(%r8,%rax,4)
   .byte  72,131,249,5                        // cmp           $0x5,%rcx
-  .byte  114,205                             // jb            21f9 <_sk_store_f32_avx+0x69>
+  .byte  114,205                             // jb            21f5 <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,84,128,64,1           // vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  .byte  116,195                             // je            21f9 <_sk_store_f32_avx+0x69>
+  .byte  116,195                             // je            21f5 <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,76,128,80,1           // vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   .byte  72,131,249,7                        // cmp           $0x7,%rcx
-  .byte  114,181                             // jb            21f9 <_sk_store_f32_avx+0x69>
+  .byte  114,181                             // jb            21f5 <_sk_store_f32_avx+0x69>
   .byte  196,67,125,25,68,128,96,1           // vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  .byte  235,171                             // jmp           21f9 <_sk_store_f32_avx+0x69>
+  .byte  235,171                             // jmp           21f5 <_sk_store_f32_avx+0x69>
 
 HIDDEN _sk_clamp_x_avx
 .globl _sk_clamp_x_avx
@@ -10038,27 +10038,29 @@ _sk_store_f16_sse41:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  185,0,0,128,7                       // mov           $0x7800000,%ecx
   .byte  102,68,15,110,193                   // movd          %ecx,%xmm8
-  .byte  102,69,15,112,192,0                 // pshufd        $0x0,%xmm8,%xmm8
-  .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
-  .byte  68,15,89,200                        // mulps         %xmm0,%xmm9
-  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
-  .byte  102,69,15,111,208                   // movdqa        %xmm8,%xmm10
+  .byte  102,69,15,112,200,0                 // pshufd        $0x0,%xmm8,%xmm9
+  .byte  102,69,15,111,193                   // movdqa        %xmm9,%xmm8
+  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
+  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
+  .byte  102,69,15,56,43,192                 // packusdw      %xmm8,%xmm8
+  .byte  102,69,15,111,209                   // movdqa        %xmm9,%xmm10
   .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
   .byte  102,65,15,114,210,13                // psrld         $0xd,%xmm10
-  .byte  102,69,15,111,216                   // movdqa        %xmm8,%xmm11
+  .byte  102,69,15,56,43,210                 // packusdw      %xmm10,%xmm10
+  .byte  102,69,15,111,217                   // movdqa        %xmm9,%xmm11
   .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
   .byte  102,65,15,114,211,13                // psrld         $0xd,%xmm11
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
-  .byte  102,65,15,115,250,2                 // pslldq        $0x2,%xmm10
-  .byte  102,69,15,235,209                   // por           %xmm9,%xmm10
-  .byte  102,65,15,115,248,2                 // pslldq        $0x2,%xmm8
-  .byte  102,69,15,235,195                   // por           %xmm11,%xmm8
-  .byte  102,69,15,111,202                   // movdqa        %xmm10,%xmm9
-  .byte  102,69,15,98,200                    // punpckldq     %xmm8,%xmm9
+  .byte  102,69,15,56,43,219                 // packusdw      %xmm11,%xmm11
+  .byte  68,15,89,203                        // mulps         %xmm3,%xmm9
+  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
+  .byte  102,69,15,56,43,201                 // packusdw      %xmm9,%xmm9
+  .byte  102,69,15,97,194                    // punpcklwd     %xmm10,%xmm8
+  .byte  102,69,15,97,217                    // punpcklwd     %xmm9,%xmm11
+  .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
+  .byte  102,69,15,98,203                    // punpckldq     %xmm11,%xmm9
   .byte  243,68,15,127,12,248                // movdqu        %xmm9,(%rax,%rdi,8)
-  .byte  102,69,15,106,208                   // punpckhdq     %xmm8,%xmm10
-  .byte  243,68,15,127,84,248,16             // movdqu        %xmm10,0x10(%rax,%rdi,8)
+  .byte  102,69,15,106,195                   // punpckhdq     %xmm11,%xmm8
+  .byte  243,68,15,127,68,248,16             // movdqu        %xmm8,0x10(%rax,%rdi,8)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
@@ -12263,27 +12265,37 @@ _sk_store_f16_sse2:
   .byte  72,139,0                            // mov           (%rax),%rax
   .byte  185,0,0,128,7                       // mov           $0x7800000,%ecx
   .byte  102,68,15,110,193                   // movd          %ecx,%xmm8
-  .byte  102,69,15,112,192,0                 // pshufd        $0x0,%xmm8,%xmm8
-  .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
-  .byte  68,15,89,200                        // mulps         %xmm0,%xmm9
-  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
-  .byte  102,69,15,111,208                   // movdqa        %xmm8,%xmm10
+  .byte  102,69,15,112,200,0                 // pshufd        $0x0,%xmm8,%xmm9
+  .byte  102,69,15,111,193                   // movdqa        %xmm9,%xmm8
+  .byte  68,15,89,192                        // mulps         %xmm0,%xmm8
+  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
+  .byte  102,65,15,114,240,16                // pslld         $0x10,%xmm8
+  .byte  102,65,15,114,224,16                // psrad         $0x10,%xmm8
+  .byte  102,69,15,107,192                   // packssdw      %xmm8,%xmm8
+  .byte  102,69,15,111,209                   // movdqa        %xmm9,%xmm10
   .byte  68,15,89,209                        // mulps         %xmm1,%xmm10
   .byte  102,65,15,114,210,13                // psrld         $0xd,%xmm10
-  .byte  102,69,15,111,216                   // movdqa        %xmm8,%xmm11
+  .byte  102,65,15,114,242,16                // pslld         $0x10,%xmm10
+  .byte  102,65,15,114,226,16                // psrad         $0x10,%xmm10
+  .byte  102,69,15,107,210                   // packssdw      %xmm10,%xmm10
+  .byte  102,69,15,111,217                   // movdqa        %xmm9,%xmm11
   .byte  68,15,89,218                        // mulps         %xmm2,%xmm11
   .byte  102,65,15,114,211,13                // psrld         $0xd,%xmm11
-  .byte  68,15,89,195                        // mulps         %xmm3,%xmm8
-  .byte  102,65,15,114,208,13                // psrld         $0xd,%xmm8
-  .byte  102,65,15,115,250,2                 // pslldq        $0x2,%xmm10
-  .byte  102,69,15,235,209                   // por           %xmm9,%xmm10
-  .byte  102,65,15,115,248,2                 // pslldq        $0x2,%xmm8
-  .byte  102,69,15,235,195                   // por           %xmm11,%xmm8
-  .byte  102,69,15,111,202                   // movdqa        %xmm10,%xmm9
-  .byte  102,69,15,98,200                    // punpckldq     %xmm8,%xmm9
+  .byte  102,65,15,114,243,16                // pslld         $0x10,%xmm11
+  .byte  102,65,15,114,227,16                // psrad         $0x10,%xmm11
+  .byte  102,69,15,107,219                   // packssdw      %xmm11,%xmm11
+  .byte  68,15,89,203                        // mulps         %xmm3,%xmm9
+  .byte  102,65,15,114,209,13                // psrld         $0xd,%xmm9
+  .byte  102,65,15,114,241,16                // pslld         $0x10,%xmm9
+  .byte  102,65,15,114,225,16                // psrad         $0x10,%xmm9
+  .byte  102,69,15,107,201                   // packssdw      %xmm9,%xmm9
+  .byte  102,69,15,97,194                    // punpcklwd     %xmm10,%xmm8
+  .byte  102,69,15,97,217                    // punpcklwd     %xmm9,%xmm11
+  .byte  102,69,15,111,200                   // movdqa        %xmm8,%xmm9
+  .byte  102,69,15,98,203                    // punpckldq     %xmm11,%xmm9
   .byte  243,68,15,127,12,248                // movdqu        %xmm9,(%rax,%rdi,8)
-  .byte  102,69,15,106,208                   // punpckhdq     %xmm8,%xmm10
-  .byte  243,68,15,127,84,248,16             // movdqu        %xmm10,0x10(%rax,%rdi,8)
+  .byte  102,69,15,106,195                   // punpckhdq     %xmm11,%xmm8
+  .byte  243,68,15,127,68,248,16             // movdqu        %xmm8,0x10(%rax,%rdi,8)
   .byte  72,173                              // lods          %ds:(%rsi),%rax
   .byte  255,224                             // jmpq          *%rax
 
index a25db7c..a662394 100644 (file)
@@ -4286,32 +4286,32 @@ _sk_store_f16_avx LABEL PROC
   DB  196,67,125,25,202,1                 ; vextractf128  $0x1,%ymm9,%xmm10
   DB  196,193,41,114,210,13               ; vpsrld        $0xd,%xmm10,%xmm10
   DB  196,193,49,114,209,13               ; vpsrld        $0xd,%xmm9,%xmm9
-  DB  197,60,89,217                       ; vmulps        %ymm1,%ymm8,%ymm11
+  DB  196,66,49,43,202                    ; vpackusdw     %xmm10,%xmm9,%xmm9
+  DB  197,60,89,209                       ; vmulps        %ymm1,%ymm8,%ymm10
+  DB  196,67,125,25,211,1                 ; vextractf128  $0x1,%ymm10,%xmm11
+  DB  196,193,33,114,211,13               ; vpsrld        $0xd,%xmm11,%xmm11
+  DB  196,193,41,114,210,13               ; vpsrld        $0xd,%xmm10,%xmm10
+  DB  196,66,41,43,211                    ; vpackusdw     %xmm11,%xmm10,%xmm10
+  DB  197,60,89,218                       ; vmulps        %ymm2,%ymm8,%ymm11
   DB  196,67,125,25,220,1                 ; vextractf128  $0x1,%ymm11,%xmm12
   DB  196,193,25,114,212,13               ; vpsrld        $0xd,%xmm12,%xmm12
   DB  196,193,33,114,211,13               ; vpsrld        $0xd,%xmm11,%xmm11
-  DB  197,60,89,234                       ; vmulps        %ymm2,%ymm8,%ymm13
-  DB  196,67,125,25,238,1                 ; vextractf128  $0x1,%ymm13,%xmm14
-  DB  196,193,9,114,214,13                ; vpsrld        $0xd,%xmm14,%xmm14
-  DB  196,193,17,114,213,13               ; vpsrld        $0xd,%xmm13,%xmm13
+  DB  196,66,33,43,220                    ; vpackusdw     %xmm12,%xmm11,%xmm11
   DB  197,60,89,195                       ; vmulps        %ymm3,%ymm8,%ymm8
-  DB  196,67,125,25,199,1                 ; vextractf128  $0x1,%ymm8,%xmm15
-  DB  196,193,1,114,215,13                ; vpsrld        $0xd,%xmm15,%xmm15
+  DB  196,67,125,25,196,1                 ; vextractf128  $0x1,%ymm8,%xmm12
+  DB  196,193,25,114,212,13               ; vpsrld        $0xd,%xmm12,%xmm12
   DB  196,193,57,114,208,13               ; vpsrld        $0xd,%xmm8,%xmm8
-  DB  196,193,33,115,251,2                ; vpslldq       $0x2,%xmm11,%xmm11
-  DB  196,65,33,235,201                   ; vpor          %xmm9,%xmm11,%xmm9
-  DB  196,193,33,115,252,2                ; vpslldq       $0x2,%xmm12,%xmm11
-  DB  196,65,33,235,226                   ; vpor          %xmm10,%xmm11,%xmm12
-  DB  196,193,57,115,248,2                ; vpslldq       $0x2,%xmm8,%xmm8
-  DB  196,65,57,235,197                   ; vpor          %xmm13,%xmm8,%xmm8
-  DB  196,193,41,115,255,2                ; vpslldq       $0x2,%xmm15,%xmm10
-  DB  196,65,41,235,238                   ; vpor          %xmm14,%xmm10,%xmm13
-  DB  196,65,49,98,216                    ; vpunpckldq    %xmm8,%xmm9,%xmm11
-  DB  196,65,49,106,208                   ; vpunpckhdq    %xmm8,%xmm9,%xmm10
-  DB  196,65,25,98,205                    ; vpunpckldq    %xmm13,%xmm12,%xmm9
-  DB  196,65,25,106,197                   ; vpunpckhdq    %xmm13,%xmm12,%xmm8
+  DB  196,66,57,43,196                    ; vpackusdw     %xmm12,%xmm8,%xmm8
+  DB  196,65,49,97,226                    ; vpunpcklwd    %xmm10,%xmm9,%xmm12
+  DB  196,65,49,105,234                   ; vpunpckhwd    %xmm10,%xmm9,%xmm13
+  DB  196,65,33,97,200                    ; vpunpcklwd    %xmm8,%xmm11,%xmm9
+  DB  196,65,33,105,192                   ; vpunpckhwd    %xmm8,%xmm11,%xmm8
+  DB  196,65,25,98,217                    ; vpunpckldq    %xmm9,%xmm12,%xmm11
+  DB  196,65,25,106,209                   ; vpunpckhdq    %xmm9,%xmm12,%xmm10
+  DB  196,65,17,98,200                    ; vpunpckldq    %xmm8,%xmm13,%xmm9
+  DB  196,65,17,106,192                   ; vpunpckhdq    %xmm8,%xmm13,%xmm8
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,31                              ; jne           21de <_sk_store_f16_avx+0xd6>
+  DB  117,31                              ; jne           21da <_sk_store_f16_avx+0xd2>
   DB  196,65,120,17,28,248                ; vmovups       %xmm11,(%r8,%rdi,8)
   DB  196,65,120,17,84,248,16             ; vmovups       %xmm10,0x10(%r8,%rdi,8)
   DB  196,65,120,17,76,248,32             ; vmovups       %xmm9,0x20(%r8,%rdi,8)
@@ -4320,22 +4320,22 @@ _sk_store_f16_avx LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,214,28,248               ; vmovq         %xmm11,(%r8,%rdi,8)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            21da <_sk_store_f16_avx+0xd2>
+  DB  116,240                             ; je            21d6 <_sk_store_f16_avx+0xce>
   DB  196,65,121,23,92,248,8              ; vmovhpd       %xmm11,0x8(%r8,%rdi,8)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            21da <_sk_store_f16_avx+0xd2>
+  DB  114,227                             ; jb            21d6 <_sk_store_f16_avx+0xce>
   DB  196,65,121,214,84,248,16            ; vmovq         %xmm10,0x10(%r8,%rdi,8)
-  DB  116,218                             ; je            21da <_sk_store_f16_avx+0xd2>
+  DB  116,218                             ; je            21d6 <_sk_store_f16_avx+0xce>
   DB  196,65,121,23,84,248,24             ; vmovhpd       %xmm10,0x18(%r8,%rdi,8)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            21da <_sk_store_f16_avx+0xd2>
+  DB  114,205                             ; jb            21d6 <_sk_store_f16_avx+0xce>
   DB  196,65,121,214,76,248,32            ; vmovq         %xmm9,0x20(%r8,%rdi,8)
-  DB  116,196                             ; je            21da <_sk_store_f16_avx+0xd2>
+  DB  116,196                             ; je            21d6 <_sk_store_f16_avx+0xce>
   DB  196,65,121,23,76,248,40             ; vmovhpd       %xmm9,0x28(%r8,%rdi,8)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,183                             ; jb            21da <_sk_store_f16_avx+0xd2>
+  DB  114,183                             ; jb            21d6 <_sk_store_f16_avx+0xce>
   DB  196,65,121,214,68,248,48            ; vmovq         %xmm8,0x30(%r8,%rdi,8)
-  DB  235,174                             ; jmp           21da <_sk_store_f16_avx+0xd2>
+  DB  235,174                             ; jmp           21d6 <_sk_store_f16_avx+0xce>
 
 PUBLIC _sk_store_f32_avx
 _sk_store_f32_avx LABEL PROC
@@ -4351,7 +4351,7 @@ _sk_store_f32_avx LABEL PROC
   DB  196,65,37,20,196                    ; vunpcklpd     %ymm12,%ymm11,%ymm8
   DB  196,65,37,21,220                    ; vunpckhpd     %ymm12,%ymm11,%ymm11
   DB  72,133,201                          ; test          %rcx,%rcx
-  DB  117,55                              ; jne           2299 <_sk_store_f32_avx+0x6d>
+  DB  117,55                              ; jne           2295 <_sk_store_f32_avx+0x6d>
   DB  196,67,45,24,225,1                  ; vinsertf128   $0x1,%xmm9,%ymm10,%ymm12
   DB  196,67,61,24,235,1                  ; vinsertf128   $0x1,%xmm11,%ymm8,%ymm13
   DB  196,67,45,6,201,49                  ; vperm2f128    $0x31,%ymm9,%ymm10,%ymm9
@@ -4364,22 +4364,22 @@ _sk_store_f32_avx LABEL PROC
   DB  255,224                             ; jmpq          *%rax
   DB  196,65,121,17,20,128                ; vmovupd       %xmm10,(%r8,%rax,4)
   DB  72,131,249,1                        ; cmp           $0x1,%rcx
-  DB  116,240                             ; je            2295 <_sk_store_f32_avx+0x69>
+  DB  116,240                             ; je            2291 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,76,128,16             ; vmovupd       %xmm9,0x10(%r8,%rax,4)
   DB  72,131,249,3                        ; cmp           $0x3,%rcx
-  DB  114,227                             ; jb            2295 <_sk_store_f32_avx+0x69>
+  DB  114,227                             ; jb            2291 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,68,128,32             ; vmovupd       %xmm8,0x20(%r8,%rax,4)
-  DB  116,218                             ; je            2295 <_sk_store_f32_avx+0x69>
+  DB  116,218                             ; je            2291 <_sk_store_f32_avx+0x69>
   DB  196,65,121,17,92,128,48             ; vmovupd       %xmm11,0x30(%r8,%rax,4)
   DB  72,131,249,5                        ; cmp           $0x5,%rcx
-  DB  114,205                             ; jb            2295 <_sk_store_f32_avx+0x69>
+  DB  114,205                             ; jb            2291 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,84,128,64,1           ; vextractf128  $0x1,%ymm10,0x40(%r8,%rax,4)
-  DB  116,195                             ; je            2295 <_sk_store_f32_avx+0x69>
+  DB  116,195                             ; je            2291 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,76,128,80,1           ; vextractf128  $0x1,%ymm9,0x50(%r8,%rax,4)
   DB  72,131,249,7                        ; cmp           $0x7,%rcx
-  DB  114,181                             ; jb            2295 <_sk_store_f32_avx+0x69>
+  DB  114,181                             ; jb            2291 <_sk_store_f32_avx+0x69>
   DB  196,67,125,25,68,128,96,1           ; vextractf128  $0x1,%ymm8,0x60(%r8,%rax,4)
-  DB  235,171                             ; jmp           2295 <_sk_store_f32_avx+0x69>
+  DB  235,171                             ; jmp           2291 <_sk_store_f32_avx+0x69>
 
 PUBLIC _sk_clamp_x_avx
 _sk_clamp_x_avx LABEL PROC
@@ -6412,27 +6412,29 @@ _sk_store_f16_sse41 LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  185,0,0,128,7                       ; mov           $0x7800000,%ecx
   DB  102,68,15,110,193                   ; movd          %ecx,%xmm8
-  DB  102,69,15,112,192,0                 ; pshufd        $0x0,%xmm8,%xmm8
-  DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
-  DB  68,15,89,200                        ; mulps         %xmm0,%xmm9
-  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
-  DB  102,69,15,111,208                   ; movdqa        %xmm8,%xmm10
+  DB  102,69,15,112,200,0                 ; pshufd        $0x0,%xmm8,%xmm9
+  DB  102,69,15,111,193                   ; movdqa        %xmm9,%xmm8
+  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
+  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
+  DB  102,69,15,56,43,192                 ; packusdw      %xmm8,%xmm8
+  DB  102,69,15,111,209                   ; movdqa        %xmm9,%xmm10
   DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
   DB  102,65,15,114,210,13                ; psrld         $0xd,%xmm10
-  DB  102,69,15,111,216                   ; movdqa        %xmm8,%xmm11
+  DB  102,69,15,56,43,210                 ; packusdw      %xmm10,%xmm10
+  DB  102,69,15,111,217                   ; movdqa        %xmm9,%xmm11
   DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
   DB  102,65,15,114,211,13                ; psrld         $0xd,%xmm11
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
-  DB  102,65,15,115,250,2                 ; pslldq        $0x2,%xmm10
-  DB  102,69,15,235,209                   ; por           %xmm9,%xmm10
-  DB  102,65,15,115,248,2                 ; pslldq        $0x2,%xmm8
-  DB  102,69,15,235,195                   ; por           %xmm11,%xmm8
-  DB  102,69,15,111,202                   ; movdqa        %xmm10,%xmm9
-  DB  102,69,15,98,200                    ; punpckldq     %xmm8,%xmm9
+  DB  102,69,15,56,43,219                 ; packusdw      %xmm11,%xmm11
+  DB  68,15,89,203                        ; mulps         %xmm3,%xmm9
+  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
+  DB  102,69,15,56,43,201                 ; packusdw      %xmm9,%xmm9
+  DB  102,69,15,97,194                    ; punpcklwd     %xmm10,%xmm8
+  DB  102,69,15,97,217                    ; punpcklwd     %xmm9,%xmm11
+  DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
+  DB  102,69,15,98,203                    ; punpckldq     %xmm11,%xmm9
   DB  243,68,15,127,12,248                ; movdqu        %xmm9,(%rax,%rdi,8)
-  DB  102,69,15,106,208                   ; punpckhdq     %xmm8,%xmm10
-  DB  243,68,15,127,84,248,16             ; movdqu        %xmm10,0x10(%rax,%rdi,8)
+  DB  102,69,15,106,195                   ; punpckhdq     %xmm11,%xmm8
+  DB  243,68,15,127,68,248,16             ; movdqu        %xmm8,0x10(%rax,%rdi,8)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
@@ -8599,27 +8601,37 @@ _sk_store_f16_sse2 LABEL PROC
   DB  72,139,0                            ; mov           (%rax),%rax
   DB  185,0,0,128,7                       ; mov           $0x7800000,%ecx
   DB  102,68,15,110,193                   ; movd          %ecx,%xmm8
-  DB  102,69,15,112,192,0                 ; pshufd        $0x0,%xmm8,%xmm8
-  DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
-  DB  68,15,89,200                        ; mulps         %xmm0,%xmm9
-  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
-  DB  102,69,15,111,208                   ; movdqa        %xmm8,%xmm10
+  DB  102,69,15,112,200,0                 ; pshufd        $0x0,%xmm8,%xmm9
+  DB  102,69,15,111,193                   ; movdqa        %xmm9,%xmm8
+  DB  68,15,89,192                        ; mulps         %xmm0,%xmm8
+  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
+  DB  102,65,15,114,240,16                ; pslld         $0x10,%xmm8
+  DB  102,65,15,114,224,16                ; psrad         $0x10,%xmm8
+  DB  102,69,15,107,192                   ; packssdw      %xmm8,%xmm8
+  DB  102,69,15,111,209                   ; movdqa        %xmm9,%xmm10
   DB  68,15,89,209                        ; mulps         %xmm1,%xmm10
   DB  102,65,15,114,210,13                ; psrld         $0xd,%xmm10
-  DB  102,69,15,111,216                   ; movdqa        %xmm8,%xmm11
+  DB  102,65,15,114,242,16                ; pslld         $0x10,%xmm10
+  DB  102,65,15,114,226,16                ; psrad         $0x10,%xmm10
+  DB  102,69,15,107,210                   ; packssdw      %xmm10,%xmm10
+  DB  102,69,15,111,217                   ; movdqa        %xmm9,%xmm11
   DB  68,15,89,218                        ; mulps         %xmm2,%xmm11
   DB  102,65,15,114,211,13                ; psrld         $0xd,%xmm11
-  DB  68,15,89,195                        ; mulps         %xmm3,%xmm8
-  DB  102,65,15,114,208,13                ; psrld         $0xd,%xmm8
-  DB  102,65,15,115,250,2                 ; pslldq        $0x2,%xmm10
-  DB  102,69,15,235,209                   ; por           %xmm9,%xmm10
-  DB  102,65,15,115,248,2                 ; pslldq        $0x2,%xmm8
-  DB  102,69,15,235,195                   ; por           %xmm11,%xmm8
-  DB  102,69,15,111,202                   ; movdqa        %xmm10,%xmm9
-  DB  102,69,15,98,200                    ; punpckldq     %xmm8,%xmm9
+  DB  102,65,15,114,243,16                ; pslld         $0x10,%xmm11
+  DB  102,65,15,114,227,16                ; psrad         $0x10,%xmm11
+  DB  102,69,15,107,219                   ; packssdw      %xmm11,%xmm11
+  DB  68,15,89,203                        ; mulps         %xmm3,%xmm9
+  DB  102,65,15,114,209,13                ; psrld         $0xd,%xmm9
+  DB  102,65,15,114,241,16                ; pslld         $0x10,%xmm9
+  DB  102,65,15,114,225,16                ; psrad         $0x10,%xmm9
+  DB  102,69,15,107,201                   ; packssdw      %xmm9,%xmm9
+  DB  102,69,15,97,194                    ; punpcklwd     %xmm10,%xmm8
+  DB  102,69,15,97,217                    ; punpcklwd     %xmm9,%xmm11
+  DB  102,69,15,111,200                   ; movdqa        %xmm8,%xmm9
+  DB  102,69,15,98,203                    ; punpckldq     %xmm11,%xmm9
   DB  243,68,15,127,12,248                ; movdqu        %xmm9,(%rax,%rdi,8)
-  DB  102,69,15,106,208                   ; punpckhdq     %xmm8,%xmm10
-  DB  243,68,15,127,84,248,16             ; movdqu        %xmm10,0x10(%rax,%rdi,8)
+  DB  102,69,15,106,195                   ; punpckhdq     %xmm11,%xmm8
+  DB  243,68,15,127,68,248,16             ; movdqu        %xmm8,0x10(%rax,%rdi,8)
   DB  72,173                              ; lods          %ds:(%rsi),%rax
   DB  255,224                             ; jmpq          *%rax
 
index 96035bd..54e957a 100644 (file)
@@ -28,6 +28,14 @@ SI Dst bit_cast(const Src& src) {
     return unaligned_load<Dst>(&src);
 }
 
+template <typename Dst, typename Src>
+SI Dst widen_cast(const Src& src) {
+    static_assert(sizeof(Dst) > sizeof(Src), "");
+    Dst dst;
+    memcpy(&dst, &src, sizeof(Src));
+    return dst;
+}
+
 // A couple functions for embedding constants directly into code,
 // so that no .const or .literal4 section is created.
 SI int C(int x) {
index dd2bb13..fa64e80 100644 (file)
@@ -634,117 +634,10 @@ STAGE(load_f16) {
 STAGE(store_f16) {
     auto ptr = *(uint64_t**)ctx + x;
 
-#if !defined(JUMPER)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    auto rgba = (int16_t*)ptr;
-    rgba[0] = float_to_half(r);
-    rgba[1] = float_to_half(g);
-    rgba[2] = float_to_half(b);
-    rgba[3] = float_to_half(a);
-#elif defined(__aarch64__)
-    float16x4x4_t halfs = {{
-        vcvt_f16_f32(r),
-        vcvt_f16_f32(g),
-        vcvt_f16_f32(b),
-        vcvt_f16_f32(a),
-    }};
-    vst4_f16((float16_t*)ptr, halfs);
-#elif defined(__arm__)
-    float16x4x2_t rb_ga = {{
-        vcvt_f16_f32(float32x4_t{r[0], b[0], r[1], b[1]}),
-        vcvt_f16_f32(float32x4_t{g[0], a[0], g[1], a[1]}),
-    }};
-    vst2_f16((float16_t*)ptr, rb_ga);
-#elif defined(__AVX2__) && defined(__FMA__) && defined(__F16C__)
-    auto R = _mm256_cvtps_ph(r, _MM_FROUND_CUR_DIRECTION),
-         G = _mm256_cvtps_ph(g, _MM_FROUND_CUR_DIRECTION),
-         B = _mm256_cvtps_ph(b, _MM_FROUND_CUR_DIRECTION),
-         A = _mm256_cvtps_ph(a, _MM_FROUND_CUR_DIRECTION);
-
-    auto rg0123 = _mm_unpacklo_epi16(R, G),  // r0 g0 r1 g1 r2 g2 r3 g3
-         rg4567 = _mm_unpackhi_epi16(R, G),  // r4 g4 r5 g5 r6 g6 r7 g7
-         ba0123 = _mm_unpacklo_epi16(B, A),
-         ba4567 = _mm_unpackhi_epi16(B, A);
-
-    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
-         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
-         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
-         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
-    if (__builtin_expect(tail,0)) {
-        auto dst = (double*)ptr;
-        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
-        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
-        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
-        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
-        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
-        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
-        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
-    } else {
-        _mm_storeu_si128((__m128i*)ptr + 0, _01);
-        _mm_storeu_si128((__m128i*)ptr + 1, _23);
-        _mm_storeu_si128((__m128i*)ptr + 2, _45);
-        _mm_storeu_si128((__m128i*)ptr + 3, _67);
-    }
-#elif defined(__AVX__)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    U32 R = float_to_half(r),
-        G = float_to_half(g),
-        B = float_to_half(b),
-        A = float_to_half(a);
-    auto r0123 = _mm256_extractf128_si256(R, 0),
-         r4567 = _mm256_extractf128_si256(R, 1),
-         g0123 = _mm256_extractf128_si256(G, 0),
-         g4567 = _mm256_extractf128_si256(G, 1),
-         b0123 = _mm256_extractf128_si256(B, 0),
-         b4567 = _mm256_extractf128_si256(B, 1),
-         a0123 = _mm256_extractf128_si256(A, 0),
-         a4567 = _mm256_extractf128_si256(A, 1);
-    auto rg0123 = r0123 | _mm_slli_si128(g0123,2),
-         rg4567 = r4567 | _mm_slli_si128(g4567,2),
-         ba0123 = b0123 | _mm_slli_si128(a0123,2),
-         ba4567 = b4567 | _mm_slli_si128(a4567,2);
-
-    auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
-         _23 = _mm_unpackhi_epi32(rg0123, ba0123),
-         _45 = _mm_unpacklo_epi32(rg4567, ba4567),
-         _67 = _mm_unpackhi_epi32(rg4567, ba4567);
-
-    if (__builtin_expect(tail,0)) {
-        auto dst = (double*)ptr;
-        if (tail > 0) { _mm_storel_pd(dst+0, _01); }
-        if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
-        if (tail > 2) { _mm_storel_pd(dst+2, _23); }
-        if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
-        if (tail > 4) { _mm_storel_pd(dst+4, _45); }
-        if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
-        if (tail > 6) { _mm_storel_pd(dst+6, _67); }
-    } else {
-        _mm_storeu_si128((__m128i*)ptr + 0, _01);
-        _mm_storeu_si128((__m128i*)ptr + 1, _23);
-        _mm_storeu_si128((__m128i*)ptr + 2, _45);
-        _mm_storeu_si128((__m128i*)ptr + 3, _67);
-    }
-#elif defined(__SSE2__)
-    auto float_to_half = [&](F f) {
-        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
-            >> 13;                                                // then line up the mantissa.
-    };
-    U32 R = float_to_half(r),
-        G = float_to_half(g),
-        B = float_to_half(b),
-        A = float_to_half(a);
-    U32 rg = R | _mm_slli_si128(G,2),
-        ba = B | _mm_slli_si128(A,2);
-    _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
-    _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
-#endif
+    store4(ptr,tail, to_half(r)
+                   , to_half(g)
+                   , to_half(b)
+                   , to_half(a));
 }
 
 STAGE(store_f32) {
index 3e9edd8..1685da9 100644 (file)
         *b = ptr[2];
         *a = ptr[3];
     }
+    SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        auto ptr = (uint16_t*)vptr;
+        ptr[0] = r;
+        ptr[1] = g;
+        ptr[2] = b;
+        ptr[3] = a;
+    }
 
     SI F from_half(U16 h) {
         if ((int16_t)h < 0x0400) { h = 0; }   // Flush denorm and negative to zero.
         return bit_cast<F>(h << 13)           // Line up the mantissa,
              * bit_cast<F>(U32(0x77800000));  // then fix up the exponent.
     }
+    SI U16 to_half(F f) {
+        return bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
+            >> 13;                                                // then line up the mantissa.
+    }
 
 #elif defined(__aarch64__)
     #include <arm_neon.h>
         *b = rgba.val[2];
         *a = rgba.val[3];
     }
-
-    SI F from_half(U16 h) {
-        return vcvt_f32_f16(h);
+    SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        uint16x4x4_t rgba = {{r,g,b,a}};
+        vst4_u16((uint16_t*)ptr, rgba);
     }
 
+    SI F from_half(U16 h) { return vcvt_f32_f16(h); }
+    SI U16 to_half(F   f) { return vcvt_f16_f32(f); }
+
 #elif defined(__arm__)
     #if defined(__thumb2__) || !defined(__ARM_ARCH_7A__) || !defined(__ARM_VFPV4__)
         #error On ARMv7, compile with -march=armv7-a -mfpu=neon-vfp4, without -mthumb.
         *b = unaligned_load<U16>(rgba.val+2);
         *a = unaligned_load<U16>(rgba.val+3);
     }
+    SI void store4(void* vptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        auto ptr = (uint16_t*)vptr;
+        uint16x4x4_t rgba = {{
+            widen_cast<uint16x4_t>(r),
+            widen_cast<uint16x4_t>(g),
+            widen_cast<uint16x4_t>(b),
+            widen_cast<uint16x4_t>(a),
+        }};
+        vst4_lane_u16(ptr + 0, rgba, 0);
+        vst4_lane_u16(ptr + 4, rgba, 1);
+    }
 
     SI F from_half(U16 h) {
-        uint16x4_t v;
-        memcpy(&v, &h, sizeof(h));
+        auto v = widen_cast<uint16x4_t>(h);
         return vget_low_f32(vcvt_f32_f16(v));
     }
+    SI U16 to_half(F f) {
+        auto v = widen_cast<float32x4_t>(f);
+        uint16x4_t h = vcvt_f16_f32(v);
+        return unaligned_load<U16>(&h);
+    }
 
 #elif defined(__AVX__)
     #include <immintrin.h>
         *b = _mm_unpacklo_epi64(ba0123, ba4567);
         *a = _mm_unpackhi_epi64(ba0123, ba4567);
     }
+    SI void store4(void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        auto rg0123 = _mm_unpacklo_epi16(r, g),  // r0 g0 r1 g1 r2 g2 r3 g3
+             rg4567 = _mm_unpackhi_epi16(r, g),  // r4 g4 r5 g5 r6 g6 r7 g7
+             ba0123 = _mm_unpacklo_epi16(b, a),
+             ba4567 = _mm_unpackhi_epi16(b, a);
+
+        auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
+             _23 = _mm_unpackhi_epi32(rg0123, ba0123),
+             _45 = _mm_unpacklo_epi32(rg4567, ba4567),
+             _67 = _mm_unpackhi_epi32(rg4567, ba4567);
+
+        if (__builtin_expect(tail,0)) {
+            auto dst = (double*)ptr;
+            if (tail > 0) { _mm_storel_pd(dst+0, _01); }
+            if (tail > 1) { _mm_storeh_pd(dst+1, _01); }
+            if (tail > 2) { _mm_storel_pd(dst+2, _23); }
+            if (tail > 3) { _mm_storeh_pd(dst+3, _23); }
+            if (tail > 4) { _mm_storel_pd(dst+4, _45); }
+            if (tail > 5) { _mm_storeh_pd(dst+5, _45); }
+            if (tail > 6) { _mm_storel_pd(dst+6, _67); }
+        } else {
+            _mm_storeu_si128((__m128i*)ptr + 0, _01);
+            _mm_storeu_si128((__m128i*)ptr + 1, _23);
+            _mm_storeu_si128((__m128i*)ptr + 2, _45);
+            _mm_storeu_si128((__m128i*)ptr + 3, _67);
+        }
+    }
 
     SI F from_half(U16 h) {
     #if defined(__AVX2__)
              * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
     #endif
     }
+    SI U16 to_half(F f) {
+    #if defined(__AVX2__)
+        return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
+    #else
+        return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
+                    >> 13);                                            // then line up the mantissa.
+    #endif
+    }
 
 #elif defined(__SSE2__)
     #include <immintrin.h>
         return unaligned_load<U16>(&p);  // We have two copies.  Return (the lower) one.
     }
     SI U8 pack(U16 v) {
-        __m128i r;
-        memcpy(&r, &v, sizeof(v));
+        auto r = widen_cast<__m128i>(v);
         r = _mm_packus_epi16(r,r);
         return unaligned_load<U8>(&r);
     }
         *b = unaligned_load<U16>((uint16_t*)&ba + 0);
         *a = unaligned_load<U16>((uint16_t*)&ba + 4);
     }
+    SI void store4(const void* ptr, size_t tail, U16 r, U16 g, U16 b, U16 a) {
+        auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
+             ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
+        _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
+        _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
+    }
 
     SI F from_half(U16 h) {
-        __m128i v;
-        memcpy(&v, &h, sizeof(h));
+        auto v = widen_cast<__m128i>(h);
 
         // Same deal as AVX: flush denorms and negatives to zero.
         v = _mm_andnot_si128(_mm_cmplt_epi16(v, _mm_set1_epi32(0x04000400_i)), v);
         return bit_cast<F>(w << 13)             // Line up the mantissa,
              * bit_cast<F>(U32(0x77800000_i));  // then fix up the exponent.
     }
+    SI U16 to_half(F f) {
+        return pack(bit_cast<U32>(f * bit_cast<F>(U32(0x07800000_i)))  // Fix up the exponent,
+                    >> 13);                                            // then line up the mantissa.
+    }
 #endif
 
 // We need to be a careful with casts.