simd/jiss2flt-64.asm

   1 ;
   2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright 2009 D. R. Commander
   6 ;
   7 ; Based on
   8 ; x86 SIMD extension for IJG JPEG library
   9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11 ;
  12 ; This file should be assembled with NASM (Netwide Assembler),
  13 ; can *not* be assembled with Microsoft's MASM or any compatible
  14 ; assembler (including Borland's Turbo Assembler).
  15 ; NASM is available from http://nasm.sourceforge.net/ or
  16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17 ;
  18 ; This file contains a floating-point implementation of the inverse DCT
  19 ; (Discrete Cosine Transform). The following code is based directly on
  20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
  21 ;
  22 ; [TAB8]
  23
  24 %include "jsimdext.inc"
  25 %include "jdct.inc"
  26
  27 ; --------------------------------------------------------------------------
  28
  29 %macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
  30         shufps  %1,%2,0x44      ; low float pair of dest, then low float pair of src
  31 %endmacro
  32 ; Both macros move floats in pairs (64 bits at a time); they are shufps wrappers.
  33 %macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
  34         shufps  %1,%2,0xEE      ; high float pair of dest, then high float pair of src
  35 %endmacro
  36
  37 ; --------------------------------------------------------------------------
  38         SECTION SEG_CONST
  39 ; Constant table for the float IDCT; every entry below is one 16-byte vector.
  40         alignz  16
  41         global  EXTN(jconst_idct_float_sse2)
  42
  43 EXTN(jconst_idct_float_sse2):
  44 ; PD_* = one float replicated in all four lanes; PB_* = one byte replicated 16 times.
  45 PD_1_414        times 4 dd  1.414213562373095048801689
  46 PD_1_847        times 4 dd  1.847759065022573512256366
  47 PD_1_082        times 4 dd  1.082392200292393968799446
  48 PD_M2_613       times 4 dd -2.613125929752753055713286
  49 PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3) = 1.5 * 2^26; added to round x/8 to an integer
  50 PB_CENTERJSAMP  times 16 db CENTERJSAMPLE ; bias added to every output byte in pass 2
  51
  52         alignz  16
  53
  53
  54 ; --------------------------------------------------------------------------
  55         SECTION SEG_TEXT
  56         BITS    64
  57 ;
  58 ; Perform dequantization and inverse DCT on one block of coefficients.
  59 ;
  60 ; GLOBAL(void)
  61 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
  62 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
  63 ;
  64
  65 ; r10 = void * dct_table
  66 ; r11 = JCOEFPTR coef_block
  67 ; r12 = JSAMPARRAY output_buf
  68 ; r13 = JDIMENSION output_col
  69
  70 %define original_rbp    rbp+0   ; slot where the prologue stores the pre-alignment rsp
  71 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM], just below rbp
  72 %define WK_NUM          2       ; number of xmmword spill slots
  73 %define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
  74                                         ; FAST_FLOAT workspace[DCTSIZE2], directly below wk(0)
  75
  76         align   16
  77         global  EXTN(jsimd_idct_float_sse2)
  78 ; Prologue: build a 16-byte-aligned frame (rbp) with wk[] and workspace[] below it.
  79 EXTN(jsimd_idct_float_sse2):
  80         push    rbp
  81         mov     rax,rsp                         ; rax = rsp after the push (points at saved rbp)
  82         sub     rsp, byte 4                     ; step below the saved rbp before rounding down
  83         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
  84         mov     [rsp],rax                       ; [original_rbp] = unaligned rsp, reloaded by the epilogue
  85         mov     rbp,rsp                         ; rbp = aligned rbp
  86         lea     rsp, [workspace]                ; reserve wk[] and workspace[] in one step
  87         collect_args                            ; macro from an include file; presumably loads r10-r13 (see register list above)
  88         push    rbx                             ; rbx is used as a row pointer in pass 2
  89
  90         ; ---- Pass 1: process columns from input, store into work array.
  91 ; Each trip through .columnloop handles four columns, one per float lane.
  92         mov     rdx, r10        ; quantptr
  93         mov     rsi, r11                ; inptr
  94         lea     rdi, [workspace]                        ; FAST_FLOAT * wsptr
  95         mov     rcx, DCTSIZE/4                          ; ctr
  96 .columnloop:
  97 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
  98         mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]      ; cheap early test: one dword from row 1 ...
  99         or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]      ; ... ORed with one dword from row 2
 100         jnz     near .columnDCT
 101 ; Thorough test: OR together the four coefficients of each of rows 1..7.
 102         movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 103         movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 104         movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 105         movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 106         movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 107         movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 108         movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 109         por     xmm1,xmm2
 110         por     xmm3,xmm4
 111         por     xmm5,xmm6
 112         por     xmm1,xmm3
 113         por     xmm5,xmm7
 114         por     xmm1,xmm5
 115         packsswb xmm1,xmm1      ; four words -> four bytes; saturation keeps nonzero values nonzero
 116         movd    eax,xmm1        ; writing eax also clears the upper half of rax
 117         test    rax,rax
 118         jnz     short .columnDCT
 119
 120         ; -- AC terms all zero
 121 ; Only row 0 (DC) is nonzero, so each column's dequantized DC value is replicated.
 122         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 123
 124         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
 125         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
 126         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
 127 ; Dequantize with row 0 of the float multiplier table.
 128         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 129
 130         movaps  xmm1,xmm0
 131         movaps  xmm2,xmm0
 132         movaps  xmm3,xmm0
 133
 134         shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
 135         shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
 136         shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
 137         shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
 138 ; Each broadcast value is written twice, i.e. eight floats per source column.
 139         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 140         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
 141         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 142         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
 143         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
 144         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
 145         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 146         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 147         jmp     near .nextcolumn
 148 %endif
 149 .columnDCT:
 150 ; General case: full 1-D IDCT down four columns at once (one column per float lane).
 151         ; -- Even part
 152
 153         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 154         movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 155         movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 156         movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 157 ; punpcklwd + psrad sign-extends each 16-bit coefficient to a dword; cvtdq2ps makes it a float.
 158         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
 159         punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
 160         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
 161         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
 162         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
 163         cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
 164
 165         punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
 166         punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
 167         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
 168         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
 169         cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
 170         cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
 171 ; Dequantize the even rows with the matching rows of the multiplier table.
 172         mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 173         mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 174         mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 175         mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 176
 177         movaps  xmm4,xmm0
 178         movaps  xmm5,xmm1
 179         subps   xmm0,xmm2               ; xmm0=tmp11
 180         subps   xmm1,xmm3
 181         addps   xmm4,xmm2               ; xmm4=tmp10
 182         addps   xmm5,xmm3               ; xmm5=tmp13
 183
 184         mulps   xmm1,[rel PD_1_414]     ; (in2 - in6) * 1.414213562
 185         subps   xmm1,xmm5               ; xmm1=tmp12
 186
 187         movaps  xmm6,xmm4
 188         movaps  xmm7,xmm0
 189         subps   xmm4,xmm5               ; xmm4=tmp3
 190         subps   xmm0,xmm1               ; xmm0=tmp2
 191         addps   xmm6,xmm5               ; xmm6=tmp0
 192         addps   xmm7,xmm1               ; xmm7=tmp1
 193 ; Spill tmp2/tmp3: the odd part reuses xmm0-xmm5 while xmm6/xmm7 keep tmp0/tmp1.
 194         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
 195         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
 196
 197         ; -- Odd part
 198
 199         movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 200         movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 201         movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 202         movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 203 ; Same widen-and-convert sequence as in the even part.
 204         punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
 205         punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
 206         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
 207         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
 208         cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
 209         cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
 210
 211         punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
 212         punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
 213         psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
 214         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
 215         cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
 216         cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
 217 ; Dequantize the odd rows.
 218         mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 219         mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 220         mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 221         mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 222
 223         movaps  xmm4,xmm2
 224         movaps  xmm0,xmm5
 225         addps   xmm2,xmm1               ; xmm2=z11
 226         addps   xmm5,xmm3               ; xmm5=z13
 227         subps   xmm4,xmm1               ; xmm4=z12
 228         subps   xmm0,xmm3               ; xmm0=z10
 229
 230         movaps  xmm1,xmm2
 231         subps   xmm2,xmm5
 232         addps   xmm1,xmm5               ; xmm1=tmp7
 233
 234         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
 235
 236         movaps  xmm3,xmm0
 237         addps   xmm0,xmm4
 238         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
 239         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
 240         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
 241         addps   xmm3,xmm0               ; xmm3=tmp12
 242         subps   xmm4,xmm0               ; xmm4=tmp10
 243
 244         ; -- Final output stage
 245
 246         subps   xmm3,xmm1               ; xmm3=tmp6
 247         movaps  xmm5,xmm6
 248         movaps  xmm0,xmm7
 249         addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
 250         addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
 251         subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
 252         subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
 253         subps   xmm2,xmm3               ; xmm2=tmp5
 254 ; Results are transposed on the way out so that pass 2 can again work one row per lane.
 255         movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
 256         unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
 257         unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
 258         movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
 259         unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
 260         unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
 261
 262         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
 263         movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
 264 ; The wk slots are free again; reuse them for the half-transposed rows 6/7.
 265         movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
 266         movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
 267
 268         addps   xmm4,xmm2               ; xmm4=tmp4
 269         movaps  xmm0,xmm7
 270         movaps  xmm3,xmm5
 271         addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
 272         addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
 273         subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
 274         subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
 275
 276         movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
 277         unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
 278         unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
 279         movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
 280         unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
 281         unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
 282
 283         movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
 284         unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
 285         unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
 286         movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
 287         unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
 288         unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
 289
 290         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
 291         movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
 292 ; Store the rows 0-3 part of each of the four columns.
 293         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
 294         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 295         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 296         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 297
 298         movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
 299         unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
 300         unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
 301         movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
 302         unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
 303         unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
 304 ; Store the rows 4-7 part of each of the four columns.
 305         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
 306         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
 307         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
 308         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 309
 310 .nextcolumn:
 311         add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block: next four columns
 312         add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr: next four columns
 313         add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr: skip the four rows just written
 314         dec     rcx                                     ; ctr
 315         jnz     near .columnloop
 316
 317         ; -- Prefetch the next coefficient block
 318 ; rsi has advanced DCTSIZE coefficients, so (assuming DCTSIZE is 8) these offsets start at inptr + DCTSIZE2 coefficients.
 319         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
 320         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
 321         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
 322         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
 323
 324         ; ---- Pass 2: process rows from work array, store into output array.
 325         ; rsi = wsptr, rdi = output row-pointer array, rax = output_col, rcx = ctr.
 326         ; output_col is a JDIMENSION (assumed 32-bit unsigned): read only r13d.
 327         lea     rsi, [workspace]                        ; FAST_FLOAT * wsptr
 328         mov     rdi, r12        ; (JSAMPROW *)
 329         mov     eax, r13d       ; zero-extends to rax; bits 63:32 of r13 are undefined for a 32-bit argument
 330         mov     rcx, DCTSIZE/4                          ; ctr
 331 .rowloop:
 332 ; Each iteration produces four output rows; lane n of every register belongs to row n.
 333         ; -- Even part
 334
 335         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
 336         movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
 337         movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
 338         movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
 339 ; Same butterfly as pass 1, minus dequantization (workspace values are already floats).
 340         movaps  xmm4,xmm0
 341         movaps  xmm5,xmm1
 342         subps   xmm0,xmm2               ; xmm0=tmp11
 343         subps   xmm1,xmm3
 344         addps   xmm4,xmm2               ; xmm4=tmp10
 345         addps   xmm5,xmm3               ; xmm5=tmp13
 346
 347         mulps   xmm1,[rel PD_1_414]
 348         subps   xmm1,xmm5               ; xmm1=tmp12
 349
 350         movaps  xmm6,xmm4
 351         movaps  xmm7,xmm0
 352         subps   xmm4,xmm5               ; xmm4=tmp3
 353         subps   xmm0,xmm1               ; xmm0=tmp2
 354         addps   xmm6,xmm5               ; xmm6=tmp0
 355         addps   xmm7,xmm1               ; xmm7=tmp1
 356 ; Spill tmp2/tmp3 while the odd part reuses xmm0-xmm5.
 357         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
 358         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
 359
 360         ; -- Odd part
 361
 362         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
 363         movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
 364         movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
 365         movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
 366
 367         movaps  xmm4,xmm2
 368         movaps  xmm0,xmm5
 369         addps   xmm2,xmm1               ; xmm2=z11
 370         addps   xmm5,xmm3               ; xmm5=z13
 371         subps   xmm4,xmm1               ; xmm4=z12
 372         subps   xmm0,xmm3               ; xmm0=z10
 373
 374         movaps  xmm1,xmm2
 375         subps   xmm2,xmm5
 376         addps   xmm1,xmm5               ; xmm1=tmp7
 377
 378         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
 379
 380         movaps  xmm3,xmm0
 381         addps   xmm0,xmm4
 382         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
 383         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
 384         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
 385         addps   xmm3,xmm0               ; xmm3=tmp12
 386         subps   xmm4,xmm0               ; xmm4=tmp10
 387
 388         ; -- Final output stage
 389
 390         subps   xmm3,xmm1               ; xmm3=tmp6
 391         movaps  xmm5,xmm6
 392         movaps  xmm0,xmm7
 393         addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
 394         addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
 395         subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
 396         subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
 397         subps   xmm2,xmm3               ; xmm2=tmp5
 398 ; Adding the magic constant leaves round(data/8) in the low 16 bits of each dword.
 399         movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
 400         pcmpeqd xmm3,xmm3               ; all ones
 401         psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 402
 403         addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
 404         addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
 405         addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
 406         addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
 407 ; Merge pairs: one result stays in the low word, its neighbour is shifted into the high word.
 408         pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
 409         pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
 410         pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
 411         pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
 412         por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
 413         por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
 414
 415         movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
 416         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
 417
 418         addps   xmm4,xmm2               ; xmm4=tmp4
 419         movaps  xmm7,xmm1
 420         movaps  xmm5,xmm3
 421         addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
 422         addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
 423         subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
 424         subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
 425 ; Same round-and-merge sequence for data2..data5; constant and mask are rebuilt here.
 426         movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
 427         pcmpeqd xmm4,xmm4               ; all ones
 428         psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 429
 430         addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
 431         addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
 432         addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
 433         addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
 434
 435         pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
 436         pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
 437         pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
 438         pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
 439         por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
 440         por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
 441 ; packsswb saturates each word to a signed byte; paddb then adds the centre bias with byte wraparound.
 442         movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
 443
 444         packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
 445         packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
 446         paddb     xmm6,xmm2
 447         paddb     xmm1,xmm2
 448
 449         movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
 450         punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
 451         punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
 452
 453         movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
 454         punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
 455         punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
 456 ; pshufd 0x4E swaps the 64-bit halves so that rows 1 and 3 land in the low qword for movq.
 457         pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
 458         pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 459 ; Each movq writes one row of eight samples at (row pointer + rax), rax being output_col.
 460         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
 461         mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
 462         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
 463         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
 464         mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
 465         mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
 466         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
 467         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
 468
 469         add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
 470         add     rdi, byte 4*SIZEOF_JSAMPROW     ; next four row pointers
 471         dec     rcx                             ; ctr
 472         jnz     near .rowloop
 473
 474         pop     rbx             ; matches the push rbx at the end of the prologue
 475         uncollect_args          ; macro from an include file; presumably undoes collect_args
 476         mov     rsp,rbp         ; rsp <- aligned rbp
 477         pop     rsp             ; rsp <- pre-alignment rsp saved at [original_rbp]
 478         pop     rbp             ; restore the caller's rbp
 479         ret
 480
 481 ; For some reason, the OS X linker does not honor the request to align the
 482 ; segment unless we do this.
 483         align   16