simd/x86_64/jidctflt-sse2.asm

   1 ;
   2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ; Copyright (C) 2018, Matthias Räncker.
   7 ;
   8 ; Based on the x86 SIMD extension for IJG JPEG library
   9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11 ;
  12 ; This file should be assembled with NASM (Netwide Assembler),
  13 ; can *not* be assembled with Microsoft's MASM or any compatible
  14 ; assembler (including Borland's Turbo Assembler).
  15 ; NASM is available from http://nasm.sourceforge.net/ or
  16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17 ;
  18 ; This file contains a floating-point implementation of the inverse DCT
  19 ; (Discrete Cosine Transform). The code below is based directly on the
  20 ; IJG's original jidctflt.c; see jidctflt.c for more details.
  21
  22 %include "jsimdext.inc"
  23 %include "jdct.inc"
  24
  25 ; --------------------------------------------------------------------------
  26
  27 %macro unpcklps2 2  ; in: %1=(0 1 2 3), %2=(4 5 6 7)  ->  out: %1=(0 1 4 5)
  28     shufps      %1, %2, 01000100b  ; 2-bit selectors, low to high: %1[0] %1[1] %2[0] %2[1]
  29 %endmacro
  30
  31 %macro unpckhps2 2  ; in: %1=(0 1 2 3), %2=(4 5 6 7)  ->  out: %1=(2 3 6 7)
  32     shufps      %1, %2, 11101110b  ; 2-bit selectors, low to high: %1[2] %1[3] %2[2] %2[3]
  33 %endmacro
  34
  35 ; --------------------------------------------------------------------------
  36     SECTION     SEG_CONST
  37 ; Constant table used by jsimd_idct_float_sse2; every entry is 16 bytes wide.
  38     alignz      32
  39     GLOBAL_DATA(jconst_idct_float_sse2)
  40
  41 EXTN(jconst_idct_float_sse2):
  42 ; PD_* entries replicate one single-precision value into all four lanes.
  43 PD_1_414        times 4  dd  1.414213562373095048801689   ; sqrt(2); scales the tmp12 (even) and tmp11 (odd) terms
  44 PD_1_847        times 4  dd  1.847759065022573512256366   ; z5 = (z10 + z12) * 1.847759065
  45 PD_1_082        times 4  dd  1.082392200292393968799446   ; multiplies z12 when forming tmp10
  46 PD_M2_613       times 4  dd -2.613125929752753055713286   ; multiplies z10 when forming tmp12 (sign folded in)
  47 PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3) = 1.5 * 2^26; adding it rounds x/8 into the low mantissa bits
  48 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE  ; added to every packed byte (paddb) at the end of pass 2
  49
  50     alignz      32
  51
  52 ; --------------------------------------------------------------------------
  53     SECTION     SEG_TEXT
  54     BITS        64
  55 ;
  56 ; Perform dequantization and inverse DCT on one block of coefficients.
  57 ;
  58 ; GLOBAL(void)
  59 ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
  60 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
  61 ;
  62 ; Pass 1 dequantizes and transforms coefficient columns into workspace[];
  63 ; pass 2 transforms workspace rows and stores 8 bytes into each output row.
  63 ; r10 = void *dct_table
  64 ; r11 = JCOEFPTR coef_block
  65 ; r12 = JSAMPARRAY output_buf
  66 ; r13d = JDIMENSION output_col
  67 ; (registers as read after collect_args; that macro lives in the included headers)
  68 %define original_rbp  rbp + 0
  69 %define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
  70                                         ; xmmword wk[WK_NUM]
  71 %define WK_NUM        2
  72 %define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
  73                                         ; FAST_FLOAT workspace[DCTSIZE2]
  74
  75     align       32
  76     GLOBAL_FUNCTION(jsimd_idct_float_sse2)
  77
  78 EXTN(jsimd_idct_float_sse2):
  79     push        rbp
  80     mov         rax, rsp                     ; rax = rsp after the push (points at the saved rbp)
  81     sub         rsp, byte 4                  ; step below the saved rbp before rounding down
  82     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
  83     mov         [rsp], rax                   ; [original_rbp] = pre-alignment rsp (restored by "pop rsp")
  84     mov         rbp, rsp                     ; rbp = aligned rbp
  85     lea         rsp, [workspace]             ; reserve wk[] and workspace[] below rbp
  86     collect_args 4
  87     push        rbx                          ; rbx is used as a row pointer in pass 2
  88
  89     ; ---- Pass 1: process columns from input, store into work array.
  90
  91     mov         rdx, r10                ; quantptr
  92     mov         rsi, r11                ; inptr
  93     lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
  94     mov         rcx, DCTSIZE/4          ; ctr (each iteration handles 4 columns)
  95 .columnloop:
  96 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
  97     mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
  98     or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  99     jnz         near .columnDCT         ; cheap early-out before the full AC test
 100
 101     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 102     movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 103     movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 104     movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 105     movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 106     movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 107     movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 108     por         xmm1, xmm2
 109     por         xmm3, xmm4
 110     por         xmm5, xmm6
 111     por         xmm1, xmm3
 112     por         xmm5, xmm7
 113     por         xmm1, xmm5              ; xmm1 = OR of the rows 1..7 loads
 114     packsswb    xmm1, xmm1              ; saturating pack: a nonzero word stays a nonzero byte
 115     movd        eax, xmm1               ; low 4 bytes cover the 4 words loaded per row (zero-extends rax)
 116     test        rax, rax
 117     jnz         short .columnDCT
 118
 119     ; -- AC terms all zero
 120
 121     movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 122
 123     punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
 124     psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
 125     cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
 126
 127     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 128
 129     movaps      xmm1, xmm0
 130     movaps      xmm2, xmm0
 131     movaps      xmm3, xmm0
 132
 133     shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
 134     shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
 135     shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
 136     shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
 137
 138     movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 139     movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
 140     movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 141     movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
 142     movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
 143     movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
 144     movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 145     movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 146     jmp         near .nextcolumn
 147 %endif
 148 .columnDCT:
 149
 150     ; -- Even part
 151
 152     movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 153     movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 154     movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 155     movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 156
 157     punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
 158     punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
 159     psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
 160     psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
 161     cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
 162     cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
 163
 164     punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
 165     punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
 166     psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
 167     psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
 168     cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
 169     cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
 170
 171     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 172     mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 173     mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 174     mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 175
 176     movaps      xmm4, xmm0
 177     movaps      xmm5, xmm1
 178     subps       xmm0, xmm2              ; xmm0=tmp11
 179     subps       xmm1, xmm3
 180     addps       xmm4, xmm2              ; xmm4=tmp10
 181     addps       xmm5, xmm3              ; xmm5=tmp13
 182
 183     mulps       xmm1, [rel PD_1_414]
 184     subps       xmm1, xmm5              ; xmm1=tmp12
 185
 186     movaps      xmm6, xmm4
 187     movaps      xmm7, xmm0
 188     subps       xmm4, xmm5              ; xmm4=tmp3
 189     subps       xmm0, xmm1              ; xmm0=tmp2
 190     addps       xmm6, xmm5              ; xmm6=tmp0
 191     addps       xmm7, xmm1              ; xmm7=tmp1
 192
 193     movaps      XMMWORD [wk(1)], xmm4   ; tmp3
 194     movaps      XMMWORD [wk(0)], xmm0   ; tmp2
 195
 196     ; -- Odd part
 197
 198     movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 199     movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 200     movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 201     movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 202
 203     punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
 204     punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
 205     psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
 206     psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
 207     cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
 208     cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
 209
 210     punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
 211     punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
 212     psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
 213     psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
 214     cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
 215     cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
 216
 217     mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 218     mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 219     mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 220     mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 221
 222     movaps      xmm4, xmm2
 223     movaps      xmm0, xmm5
 224     addps       xmm2, xmm1              ; xmm2=z11
 225     addps       xmm5, xmm3              ; xmm5=z13
 226     subps       xmm4, xmm1              ; xmm4=z12
 227     subps       xmm0, xmm3              ; xmm0=z10
 228
 229     movaps      xmm1, xmm2
 230     subps       xmm2, xmm5
 231     addps       xmm1, xmm5              ; xmm1=tmp7
 232
 233     mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
 234
 235     movaps      xmm3, xmm0
 236     addps       xmm0, xmm4
 237     mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
 238     mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
 239     mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
 240     addps       xmm3, xmm0              ; xmm3=tmp12
 241     subps       xmm4, xmm0              ; xmm4=tmp10
 242
 243     ; -- Final output stage
 244
 245     subps       xmm3, xmm1              ; xmm3=tmp6
 246     movaps      xmm5, xmm6
 247     movaps      xmm0, xmm7
 248     addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
 249     addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
 250     subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
 251     subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
 252     subps       xmm2, xmm3              ; xmm2=tmp5
 253
 254     movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
 255     unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
 256     unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
 257     movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
 258     unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
 259     unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
 260
 261     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
 262     movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
 263
 264     movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
 265     movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
 266
 267     addps       xmm4, xmm2              ; xmm4=tmp4
 268     movaps      xmm0, xmm7
 269     movaps      xmm3, xmm5
 270     addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
 271     addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
 272     subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
 273     subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
 274
 275     movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
 276     unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
 277     unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
 278     movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
 279     unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
 280     unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
 281
 282     movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
 283     unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
 284     unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
 285     movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
 286     unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
 287     unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
 288
 289     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
 290     movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
 291
 292     movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
 293     movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 294     movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 295     movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 296
 297     movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
 298     unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
 299     unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
 300     movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
 301     unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
 302     unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
 303
 304     movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
 305     movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
 306     movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
 307     movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 308
 309 .nextcolumn:
 310     add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
 311     add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
 312     add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
 313     dec         rcx                                    ; ctr
 314     jnz         near .columnloop
 315
 316     ; -- Prefetch the next coefficient block
 317
 318     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
 319     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
 320     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
 321     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
 322
 323     ; ---- Pass 2: process rows from work array, store into output array.
 324
 326     lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
 327     mov         rdi, r12                ; (JSAMPROW *)
 328     mov         eax, r13d               ; rax = output_col (32-bit move zero-extends)
 329     mov         rcx, DCTSIZE/4          ; ctr (each iteration produces 4 output rows)
 330 .rowloop:
 331
 332     ; -- Even part
 333
 334     movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
 335     movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
 336     movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
 337     movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
 338
 339     movaps      xmm4, xmm0
 340     movaps      xmm5, xmm1
 341     subps       xmm0, xmm2              ; xmm0=tmp11
 342     subps       xmm1, xmm3
 343     addps       xmm4, xmm2              ; xmm4=tmp10
 344     addps       xmm5, xmm3              ; xmm5=tmp13
 345
 346     mulps       xmm1, [rel PD_1_414]
 347     subps       xmm1, xmm5              ; xmm1=tmp12
 348
 349     movaps      xmm6, xmm4
 350     movaps      xmm7, xmm0
 351     subps       xmm4, xmm5              ; xmm4=tmp3
 352     subps       xmm0, xmm1              ; xmm0=tmp2
 353     addps       xmm6, xmm5              ; xmm6=tmp0
 354     addps       xmm7, xmm1              ; xmm7=tmp1
 355
 356     movaps      XMMWORD [wk(1)], xmm4   ; tmp3
 357     movaps      XMMWORD [wk(0)], xmm0   ; tmp2
 358
 359     ; -- Odd part
 360
 361     movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
 362     movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
 363     movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
 364     movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
 365
 366     movaps      xmm4, xmm2
 367     movaps      xmm0, xmm5
 368     addps       xmm2, xmm1              ; xmm2=z11
 369     addps       xmm5, xmm3              ; xmm5=z13
 370     subps       xmm4, xmm1              ; xmm4=z12
 371     subps       xmm0, xmm3              ; xmm0=z10
 372
 373     movaps      xmm1, xmm2
 374     subps       xmm2, xmm5
 375     addps       xmm1, xmm5              ; xmm1=tmp7
 376
 377     mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
 378
 379     movaps      xmm3, xmm0
 380     addps       xmm0, xmm4
 381     mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
 382     mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
 383     mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
 384     addps       xmm3, xmm0              ; xmm3=tmp12
 385     subps       xmm4, xmm0              ; xmm4=tmp10
 386
 387     ; -- Final output stage
 388
 389     subps       xmm3, xmm1              ; xmm3=tmp6
 390     movaps      xmm5, xmm6
 391     movaps      xmm0, xmm7
 392     addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
 393     addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
 394     subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
 395     subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
 396     subps       xmm2, xmm3              ; xmm2=tmp5
 397
 398     movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; adding this leaves round(x/8) in the low mantissa bits
 399     pcmpeqd     xmm3, xmm3
 400     psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 401
 402     addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
 403     addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
 404     addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
 405     addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
 406
 407     pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
 408     pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
 409     pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
 410     pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
 411     por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
 412     por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
 413
 414     movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
 415     movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
 416
 417     addps       xmm4, xmm2              ; xmm4=tmp4
 418     movaps      xmm7, xmm1
 419     movaps      xmm5, xmm3
 420     addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
 421     addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
 422     subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
 423     subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
 424
 425     movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; same rounding constant, reloaded because xmm1 was reused
 426     pcmpeqd     xmm4, xmm4
 427     psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 428
 429     addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
 430     addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
 431     addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
 432     addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
 433
 434     pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
 435     pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
 436     pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
 437     pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
 438     por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
 439     por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
 440
 441     movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
 442
 443     packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
 444     packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
 445     paddb       xmm6, xmm2        ; re-center: add CENTERJSAMPLE to every saturated byte
 446     paddb       xmm1, xmm2
 447
 448     movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
 449     punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
 450     punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
 451
 452     movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
 453     punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
 454     punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
 455
 456     pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
 457     pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 458
 459     mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]      ; row 0 pointer
 460     mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]      ; row 2 pointer
 461     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6   ; store (00..07)
 462     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7   ; store (20..27)
 463     mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]      ; row 1 pointer
 464     mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]      ; row 3 pointer
 465     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5   ; store (10..17)
 466     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3   ; store (30..37)
 467
 468     add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
 469     add         rdi, byte 4*SIZEOF_JSAMPROW
 470     dec         rcx                            ; ctr
 471     jnz         near .rowloop
 472
 473     pop         rbx
 474     uncollect_args 4
 475     mov         rsp, rbp                ; rsp <- aligned rbp
 476     pop         rsp                     ; rsp <- pre-alignment rsp saved at [original_rbp]
 477     pop         rbp
 478     ret
 479
 480 ; For some reason, the OS X linker does not honor the request to align the
 481 ; segment unless we do this.
 482     align       32