simd/x86_64/jidctflt-sse2.asm

   1 ;
   2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ;
   7 ; Based on the x86 SIMD extension for IJG JPEG library
   8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
   9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10 ;
  11 ; This file should be assembled with NASM (Netwide Assembler); it can
  12 ; *not* be assembled with Microsoft's MASM or any compatible
  13 ; assembler (including Borland's Turbo Assembler).
  14 ; NASM is available from http://nasm.sourceforge.net/ or
  15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16 ;
  17 ; This file contains a floating-point implementation of the inverse DCT
  18 ; (Discrete Cosine Transform). The following code is based directly on
  19 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
  20 ;
  21 ; [TAB8]
  22
  23 %include "jsimdext.inc"
  24 %include "jdct.inc"
  25
  26 ; --------------------------------------------------------------------------
  27
  28 %macro unpcklps2 2  ; %1=(0 1 2 3), %2=(4 5 6 7)  =>  %1=(0 1 4 5): low pair of each operand
  29     shufps      %1, %2, 01000100b   ; selectors, high to low: 01 00 (from %2) | 01 00 (from %1)
  30 %endmacro
  31
  32 %macro unpckhps2 2  ; %1=(0 1 2 3), %2=(4 5 6 7)  =>  %1=(2 3 6 7): high pair of each operand
  33     shufps      %1, %2, 11101110b   ; selectors, high to low: 11 10 (from %2) | 11 10 (from %1)
  34 %endmacro
  35
  36 ; --------------------------------------------------------------------------
  37     SECTION     SEG_CONST
  38     ; Constants used by jsimd_idct_float_sse2, each replicated to fill 16 bytes.
  39     alignz      32
  40     GLOBAL_DATA(jconst_idct_float_sse2)
  41
  42 EXTN(jconst_idct_float_sse2):
  43
  44 PD_1_414        times 4  dd  1.414213562373095048801689  ; multiplier producing tmp12 (even part) and tmp11 (odd part)
  45 PD_1_847        times 4  dd  1.847759065022573512256366  ; z5 = (z10 + z12) * this
  46 PD_1_082        times 4  dd  1.082392200292393968799446  ; multiplier applied to z12
  47 PD_M2_613       times 4  dd -2.613125929752753055713286  ; multiplier applied to z10
  48 PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3); added in pass 2 before the low 16 bits are extracted
  49 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE  ; added byte-wise (paddb) to the packed samples in pass 2
  50
  51     alignz      32
  52
  53 ; --------------------------------------------------------------------------
  54     SECTION     SEG_TEXT
  55     BITS        64
  56 ;
  57 ; Perform dequantization and inverse DCT on one block of coefficients.
  58 ;
  59 ; GLOBAL(void)
  60 ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
  61 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
  62 ;
  63
  64 ; r10 = void *dct_table
  65 ; r11 = JCOEFPTR coef_block
  66 ; r12 = JSAMPARRAY output_buf
  67 ; r13d = JDIMENSION output_col
  68
  69 %define original_rbp  rbp + 0
  70 %define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
  71                                         ; xmmword wk[WK_NUM]: scratch slots just below rbp ([rbp] = saved stack ptr)
  72 %define WK_NUM        2
  73 %define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
  74                                         ; FAST_FLOAT workspace[DCTSIZE2]: pass-1 output, just below wk[]
  75
  76     align       32
  77     GLOBAL_FUNCTION(jsimd_idct_float_sse2)
  78     ; Prologue: build a 16-byte-aligned frame holding wk[] and workspace[].
  79 EXTN(jsimd_idct_float_sse2):
  80     push        rbp                          ; save caller's rbp
  81     mov         rax, rsp                     ; rax = rsp after the push (points at saved rbp)
  82     sub         rsp, byte 4                  ; step below the saved rbp before aligning
  83     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
  84     mov         [rsp], rax                   ; [original_rbp] = pre-alignment stack pointer
  85     mov         rbp, rsp                     ; rbp = aligned frame base
  86     lea         rsp, [workspace]             ; reserve wk[] and workspace[] below rbp
  87     collect_args 4                           ; macro from the includes; presumably leaves the 4 args in r10-r13
  88     push        rbx                          ; rbx is used as a row pointer in pass 2
  89
  90     ; ---- Pass 1: process columns from input, store into work array.
  91     ; Four columns are handled per iteration, so the loop runs DCTSIZE/4 times.
  92     mov         rdx, r10                ; quantptr
  93     mov         rsi, r11                ; inptr
  94     lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
  95     mov         rcx, DCTSIZE/4          ; ctr
  96 .columnloop:
  97 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
  98     mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]  ; cheap pre-check: one dword from row 1
  99     or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]  ;   OR'ed with one dword from row 2
 100     jnz         near .columnDCT                             ; nonzero AC term -> full IDCT
 101     ; Full check: OR together the coefficients of rows 1..7 for these 4 columns.
 102     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 103     movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 104     movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 105     movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 106     movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 107     movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 108     movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 109     por         xmm1, xmm2
 110     por         xmm3, xmm4
 111     por         xmm5, xmm6
 112     por         xmm1, xmm3
 113     por         xmm5, xmm7
 114     por         xmm1, xmm5              ; xmm1 = OR of rows 1..7
 115     packsswb    xmm1, xmm1              ; saturating pack: a nonzero word stays a nonzero byte
 116     movd        eax, xmm1               ; eax = one byte per column (4 columns)
 117     test        rax, rax
 118     jnz         short .columnDCT        ; at least one AC term is nonzero
 119
 120     ; -- AC terms all zero
 121     ; Only the dequantized DC term is computed; it is replicated into the workspace.
 122     movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 123
 124     punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
 125     psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
 126     cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
 127
 128     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]  ; dequantize the DC terms
 129
 130     movaps      xmm1, xmm0
 131     movaps      xmm2, xmm0
 132     movaps      xmm3, xmm0
 133     ; Broadcast each column's DC value across a whole register.
 134     shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
 135     shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
 136     shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
 137     shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
 138     ; Each broadcast register is written to both halves of its workspace row.
 139     movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 140     movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
 141     movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 142     movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
 143     movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
 144     movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
 145     movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 146     movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 147     jmp         near .nextcolumn        ; skip the full IDCT for these columns
 148 %endif
 149 .columnDCT:
 150     ; Full 1-D IDCT on 4 columns at once, one column per float lane.
 151     ; -- Even part
 152
 153     movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 154     movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 155     movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 156     movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 157     ; Sign-extend each 16-bit coefficient to 32 bits, then convert to float.
 158     punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
 159     punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
 160     psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
 161     psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
 162     cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
 163     cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
 164
 165     punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
 166     punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
 167     psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
 168     psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
 169     cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
 170     cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
 171     ; Dequantize: multiply by the matching rows of the multiplier table (rdx).
 172     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 173     mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 174     mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 175     mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 176
 177     movaps      xmm4, xmm0
 178     movaps      xmm5, xmm1
 179     subps       xmm0, xmm2              ; xmm0=tmp11
 180     subps       xmm1, xmm3              ; xmm1=in2-in6
 181     addps       xmm4, xmm2              ; xmm4=tmp10
 182     addps       xmm5, xmm3              ; xmm5=tmp13
 183
 184     mulps       xmm1, [rel PD_1_414]    ; xmm1=(in2-in6)*1.414213562
 185     subps       xmm1, xmm5              ; xmm1=tmp12
 186
 187     movaps      xmm6, xmm4
 188     movaps      xmm7, xmm0
 189     subps       xmm4, xmm5              ; xmm4=tmp3
 190     subps       xmm0, xmm1              ; xmm0=tmp2
 191     addps       xmm6, xmm5              ; xmm6=tmp0
 192     addps       xmm7, xmm1              ; xmm7=tmp1
 193     ; Spill tmp3/tmp2 to wk[]; xmm0-xmm5 are reused by the odd part.
 194     movaps      XMMWORD [wk(1)], xmm4   ; tmp3
 195     movaps      XMMWORD [wk(0)], xmm0   ; tmp2
 196
 197     ; -- Odd part
 198
 199     movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 200     movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 201     movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 202     movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 203
 204     punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
 205     punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
 206     psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
 207     psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
 208     cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
 209     cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
 210
 211     punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
 212     punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
 213     psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
 214     psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
 215     cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
 216     cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
 217     ; Dequantize the odd rows.
 218     mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 219     mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 220     mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 221     mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 222
 223     movaps      xmm4, xmm2
 224     movaps      xmm0, xmm5
 225     addps       xmm2, xmm1              ; xmm2=z11
 226     addps       xmm5, xmm3              ; xmm5=z13
 227     subps       xmm4, xmm1              ; xmm4=z12
 228     subps       xmm0, xmm3              ; xmm0=z10
 229
 230     movaps      xmm1, xmm2
 231     subps       xmm2, xmm5              ; xmm2=z11-z13
 232     addps       xmm1, xmm5              ; xmm1=tmp7
 233
 234     mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
 235
 236     movaps      xmm3, xmm0
 237     addps       xmm0, xmm4              ; xmm0=z10+z12
 238     mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
 239     mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
 240     mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
 241     addps       xmm3, xmm0              ; xmm3=tmp12
 242     subps       xmm4, xmm0              ; xmm4=tmp10
 243
 244     ; -- Final output stage
 245
 246     subps       xmm3, xmm1              ; xmm3=tmp6
 247     movaps      xmm5, xmm6
 248     movaps      xmm0, xmm7
 249     addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
 250     addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
 251     subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
 252     subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
 253     subps       xmm2, xmm3              ; xmm2=tmp5
 254
 255     movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
 256     unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
 257     unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
 258     movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
 259     unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
 260     unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
 261     ; Swap: reload the spilled tmp2/tmp3 and park the data6/data7 pairs in wk[].
 262     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
 263     movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
 264
 265     movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
 266     movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
 267
 268     addps       xmm4, xmm2              ; xmm4=tmp4
 269     movaps      xmm0, xmm7
 270     movaps      xmm3, xmm5
 271     addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
 272     addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
 273     subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
 274     subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
 275
 276     movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
 277     unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
 278     unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
 279     movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
 280     unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
 281     unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
 282
 283     movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
 284     unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
 285     unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
 286     movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
 287     unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
 288     unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
 289
 290     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
 291     movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
 292     ; Store the transposed results: first half (data0..3) of each column.
 293     movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
 294     movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 295     movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 296     movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 297
 298     movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
 299     unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
 300     unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
 301     movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
 302     unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
 303     unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
 304     ; Second half (data4..7) of each column.
 305     movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
 306     movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
 307     movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
 308     movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 309
 310 .nextcolumn:
 311     add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block: advance 4 columns
 312     add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr: advance 4 columns
 313     add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr: advance 4 workspace rows
 314     dec         rcx                                    ; ctr
 315     jnz         near .columnloop
 316
 317     ; -- Prefetch the next coefficient block
 318     ; rsi advanced by DCTSIZE coefficients during pass 1; four prefetches, 32 bytes apart.
 319     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
 320     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
 321     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
 322     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
 323
 324     ; ---- Pass 2: process rows from work array, store into output array.
 325
 326     ; Row-loop registers: rsi = workspace, rdi = output row pointers, rax = output_col.
 327     lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
 328     mov         rdi, r12                ; (JSAMPROW *)
 329     mov         eax, r13d               ; rax = output_col (a 32-bit mov zero-extends into rax)
 330     mov         rcx, DCTSIZE/4          ; ctr
 331 .rowloop:
 332     ; One iteration produces 4 output rows, one row per float lane.
 333     ; -- Even part
 334
 335     movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
 336     movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
 337     movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
 338     movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
 339
 340     movaps      xmm4, xmm0
 341     movaps      xmm5, xmm1
 342     subps       xmm0, xmm2              ; xmm0=tmp11
 343     subps       xmm1, xmm3              ; xmm1=(block 2)-(block 6)
 344     addps       xmm4, xmm2              ; xmm4=tmp10
 345     addps       xmm5, xmm3              ; xmm5=tmp13
 346
 347     mulps       xmm1, [rel PD_1_414]    ; scale the difference by 1.414213562
 348     subps       xmm1, xmm5              ; xmm1=tmp12
 349
 350     movaps      xmm6, xmm4
 351     movaps      xmm7, xmm0
 352     subps       xmm4, xmm5              ; xmm4=tmp3
 353     subps       xmm0, xmm1              ; xmm0=tmp2
 354     addps       xmm6, xmm5              ; xmm6=tmp0
 355     addps       xmm7, xmm1              ; xmm7=tmp1
 356     ; Spill tmp3/tmp2 to wk[]; xmm0-xmm5 are reused by the odd part.
 357     movaps      XMMWORD [wk(1)], xmm4   ; tmp3
 358     movaps      XMMWORD [wk(0)], xmm0   ; tmp2
 359
 360     ; -- Odd part
 361
 362     movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
 363     movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
 364     movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
 365     movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
 366
 367     movaps      xmm4, xmm2
 368     movaps      xmm0, xmm5
 369     addps       xmm2, xmm1              ; xmm2=z11
 370     addps       xmm5, xmm3              ; xmm5=z13
 371     subps       xmm4, xmm1              ; xmm4=z12
 372     subps       xmm0, xmm3              ; xmm0=z10
 373
 374     movaps      xmm1, xmm2
 375     subps       xmm2, xmm5              ; xmm2=z11-z13
 376     addps       xmm1, xmm5              ; xmm1=tmp7
 377
 378     mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
 379
 380     movaps      xmm3, xmm0
 381     addps       xmm0, xmm4              ; xmm0=z10+z12
 382     mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
 383     mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
 384     mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
 385     addps       xmm3, xmm0              ; xmm3=tmp12
 386     subps       xmm4, xmm0              ; xmm4=tmp10
 387
 388     ; -- Final output stage
 389
 390     subps       xmm3, xmm1              ; xmm3=tmp6
 391     movaps      xmm5, xmm6
 392     movaps      xmm0, xmm7
 393     addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
 394     addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
 395     subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
 396     subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
 397     subps       xmm2, xmm3              ; xmm2=tmp5
 398     ; Adding the magic constant leaves roundint(x/8) in the low 16 bits of each dword.
 399     movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
 400     pcmpeqd     xmm3, xmm3              ; xmm3=all ones
 401     psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 402
 403     addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
 404     addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
 405     addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
 406     addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
 407     ; Keep the low word of one register, shift the other into the high word, merge.
 408     pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
 409     pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
 410     pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
 411     pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
 412     por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
 413     por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
 414
 415     movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
 416     movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
 417
 418     addps       xmm4, xmm2              ; xmm4=tmp4
 419     movaps      xmm7, xmm1
 420     movaps      xmm5, xmm3
 421     addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
 422     addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
 423     subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
 424     subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
 425     ; Same rounding and word-merge steps for data2..data5.
 426     movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
 427     pcmpeqd     xmm4, xmm4              ; xmm4=all ones
 428     psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 429
 430     addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
 431     addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
 432     addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
 433     addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
 434
 435     pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
 436     pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
 437     pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
 438     pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
 439     por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
 440     por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
 441     ; Saturate the words to signed bytes, then add CENTERJSAMPLE to every byte.
 442     movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
 443
 444     packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
 445     packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
 446     paddb       xmm6, xmm2
 447     paddb       xmm1, xmm2
 448
 449     movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
 450     punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
 451     punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
 452
 453     movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
 454     punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
 455     punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
 456     ; Swap 64-bit halves so rows 1 and 3 sit in the low half for the movq stores.
 457     pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
 458     pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 459     ; rdx/rbx = row pointers read from the array at rdi; rax = output_col.
 460     mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
 461     mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
 462     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6  ; row 0: 8 samples
 463     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7  ; row 2: 8 samples
 464     mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
 465     mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
 466     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5  ; row 1: 8 samples
 467     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3  ; row 3: 8 samples
 468
 469     add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
 470     add         rdi, byte 4*SIZEOF_JSAMPROW    ; advance to the next 4 row pointers
 471     dec         rcx                            ; ctr
 472     jnz         near .rowloop
 473
 474     pop         rbx                     ; restore rbx saved by the prologue
 475     uncollect_args 4                    ; macro from the includes; counterpart of collect_args
 476     mov         rsp, rbp                ; rsp <- aligned rbp
 477     pop         rsp                     ; rsp <- stack pointer saved at [rbp] (points at saved rbp)
 478     pop         rbp                     ; restore caller's rbp
 479     ret
 480
 481 ; For some reason, the OS X linker does not honor the request to align the
 482 ; segment unless we do this.
 483     align       32