simd/jidctflt-sse2-64.asm

   1 ;
   2 ; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, D. R. Commander.
   6 ;
   7 ; Based on the x86 SIMD extension for IJG JPEG library
   8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
   9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10 ;
  11 ; This file should be assembled with NASM (Netwide Assembler),
  12 ; can *not* be assembled with Microsoft's MASM or any compatible
  13 ; assembler (including Borland's Turbo Assembler).
  14 ; NASM is available from http://nasm.sourceforge.net/ or
  15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16 ;
  17 ; This file contains a floating-point implementation of the inverse DCT
  18 ; (Discrete Cosine Transform). The following code is based directly on
  19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
  20 ;
  21 ; [TAB8]
  22
  23 %include "jsimdext.inc"
  24 %include "jdct.inc"
  25
  26 ; --------------------------------------------------------------------------
  27
  28 %macro  unpcklps2 2     ; dst %1=(a0 a1 a2 a3), src %2=(b0 b1 b2 b3) => %1=(a0 a1 b0 b1)
  29         shufps  %1, %2, 01000100b       ; 2-bit selectors, high to low: src[1] src[0] dst[1] dst[0]
  30 %endmacro
  31
  32 %macro  unpckhps2 2     ; dst %1=(a0 a1 a2 a3), src %2=(b0 b1 b2 b3) => %1=(a2 a3 b2 b3)
  33         shufps  %1, %2, 11101110b       ; 2-bit selectors, high to low: src[3] src[2] dst[3] dst[2]
  34 %endmacro
  35
  36 ; --------------------------------------------------------------------------
  37         SECTION SEG_CONST
  38         ; Packed constants for jsimd_idct_float_sse2; each PD_* value is replicated into all 4 float lanes.
  39         alignz  16      ; presumably pads to a 16-byte boundary (macro from jsimdext.inc); the table is used as SSE memory operands
  40         global  EXTN(jconst_idct_float_sse2)
  41
  42 EXTN(jconst_idct_float_sse2):
  43
  44 PD_1_414        times 4 dd  1.414213562373095048801689 ; sqrt(2)
  45 PD_1_847        times 4 dd  1.847759065022573512256366 ; 2*cos(pi/8)
  46 PD_1_082        times 4 dd  1.082392200292393968799446 ; 2*(cos(pi/8)-cos(3*pi/8))
  47 PD_M2_613       times 4 dd -2.613125929752753055713286 ; -2*(cos(pi/8)+cos(3*pi/8))
  48 PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3) = 1.5*2^26; float spacing here is 8, so x+magic keeps round(x/8) in the low mantissa bits
  49 PB_CENTERJSAMP  times 16 db CENTERJSAMPLE ; byte-wise bias added to the packed samples in pass 2
  50
  51         alignz  16
  52
  53 ; --------------------------------------------------------------------------
  54         SECTION SEG_TEXT
  55         BITS    64
  56 ; --------------------------------------------------------------------------
  57 ; Perform dequantization and inverse DCT on one block of coefficients.
  58 ; Two passes: columns -> float workspace on the stack, then rows -> samples.
  59 ; GLOBAL(void)
  60 ; jsimd_idct_float_sse2 (void *dct_table, JCOEFPTR coef_block,
  61 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
  62 ;
  63 ; Argument registers as used below (presumably loaded by collect_args):
  64 ; r10 = void *dct_table
  65 ; r11 = JCOEFPTR coef_block
  66 ; r12 = JSAMPARRAY output_buf
  67 ; r13 = JDIMENSION output_col
  68 ; Writes rax, rcx, rdx, rsi, rdi, xmm0-xmm7; rbx is saved and restored here.
  69 %define original_rbp    rbp+0   ; slot holding the pre-alignment rsp (consumed by "pop rsp" in the epilogue)
  70 %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
  71 %define WK_NUM          2
  72 %define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
  73                                         ; FAST_FLOAT workspace[DCTSIZE2]
  74
  75         align   16
  76         global  EXTN(jsimd_idct_float_sse2)
  77
  78 EXTN(jsimd_idct_float_sse2):
  79         push    rbp
  80         mov     rax,rsp                         ; rax = rsp after saving rbp (unaligned frame base)
  81         sub     rsp, byte 4
  82         and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
  83         mov     [rsp],rax                       ; keep the unaligned rsp for the epilogue
  84         mov     rbp,rsp                         ; rbp = aligned rbp
  85         lea     rsp, [workspace]                ; reserve wk[] and workspace[] below rbp
  86         collect_args
  87         push    rbx                             ; rbx serves as a second row pointer in pass 2
  88
  89         ; ---- Pass 1: process columns from input, store into work array.
  90         ; Each iteration handles 4 columns and stores them transposed.
  91         mov     rdx, r10                ; quantptr
  92         mov     rsi, r11                ; inptr
  93         lea     rdi, [workspace]                        ; FAST_FLOAT *wsptr
  94         mov     rcx, DCTSIZE/4                          ; ctr
  95 .columnloop:
  96 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
  97         mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]      ; cheap early check on rows 1 and 2
  98         or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  99         jnz     near .columnDCT
 100         ; OR together rows 1..7 of these 4 columns to see whether every AC term is zero
 101         movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 102         movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 103         movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 104         movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 105         movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 106         movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 107         movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 108         por     xmm1,xmm2
 109         por     xmm3,xmm4
 110         por     xmm5,xmm6
 111         por     xmm1,xmm3
 112         por     xmm5,xmm7
 113         por     xmm1,xmm5
 114         packsswb xmm1,xmm1              ; saturating pack: a nonzero word stays a nonzero byte
 115         movd    eax,xmm1                ; 4 packed bytes = the 4 columns under test
 116         test    eax,eax                 ; movd wrote 32 bits, so test at that width
 117         jnz     short .columnDCT
 118
 119         ; -- AC terms all zero
 120         ; Only the DC terms matter: dequantize them and fill each column's workspace row with its DC value.
 121         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 122
 123         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
 124         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
 125         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
 126
 127         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 128
 129         movaps  xmm1,xmm0
 130         movaps  xmm2,xmm0
 131         movaps  xmm3,xmm0
 132
 133         shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
 134         shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
 135         shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
 136         shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
 137
 138         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 139         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
 140         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 141         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
 142         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
 143         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
 144         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 145         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 146         jmp     near .nextcolumn
 147 %endif
 148 .columnDCT:
 149
 150         ; -- Even part
 151         ; Sign-extend the 16-bit coefficients to dwords, convert to float, then dequantize.
 152         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 153         movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 154         movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 155         movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 156
 157         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
 158         punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
 159         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
 160         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
 161         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
 162         cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
 163
 164         punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
 165         punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
 166         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
 167         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
 168         cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
 169         cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
 170
 171         mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 172         mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 173         mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 174         mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 175
 176         movaps  xmm4,xmm0
 177         movaps  xmm5,xmm1
 178         subps   xmm0,xmm2               ; xmm0=tmp11
 179         subps   xmm1,xmm3
 180         addps   xmm4,xmm2               ; xmm4=tmp10
 181         addps   xmm5,xmm3               ; xmm5=tmp13
 182
 183         mulps   xmm1,[rel PD_1_414]
 184         subps   xmm1,xmm5               ; xmm1=tmp12
 185
 186         movaps  xmm6,xmm4
 187         movaps  xmm7,xmm0
 188         subps   xmm4,xmm5               ; xmm4=tmp3
 189         subps   xmm0,xmm1               ; xmm0=tmp2
 190         addps   xmm6,xmm5               ; xmm6=tmp0
 191         addps   xmm7,xmm1               ; xmm7=tmp1
 192
 193         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
 194         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
 195
 196         ; -- Odd part
 197
 198         movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 199         movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 200         movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 201         movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 202
 203         punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
 204         punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
 205         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
 206         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
 207         cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
 208         cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
 209
 210         punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
 211         punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
 212         psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
 213         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
 214         cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
 215         cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
 216
 217         mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 218         mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 219         mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 220         mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 221
 222         movaps  xmm4,xmm2
 223         movaps  xmm0,xmm5
 224         addps   xmm2,xmm1               ; xmm2=z11
 225         addps   xmm5,xmm3               ; xmm5=z13
 226         subps   xmm4,xmm1               ; xmm4=z12
 227         subps   xmm0,xmm3               ; xmm0=z10
 228
 229         movaps  xmm1,xmm2
 230         subps   xmm2,xmm5
 231         addps   xmm1,xmm5               ; xmm1=tmp7
 232
 233         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
 234
 235         movaps  xmm3,xmm0
 236         addps   xmm0,xmm4
 237         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
 238         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
 239         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
 240         addps   xmm3,xmm0               ; xmm3=tmp12
 241         subps   xmm4,xmm0               ; xmm4=tmp10
 242
 243         ; -- Final output stage
 244
 245         subps   xmm3,xmm1               ; xmm3=tmp6
 246         movaps  xmm5,xmm6
 247         movaps  xmm0,xmm7
 248         addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
 249         addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
 250         subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
 251         subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
 252         subps   xmm2,xmm3               ; xmm2=tmp5
 253
 254         movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
 255         unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
 256         unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
 257         movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
 258         unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
 259         unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
 260
 261         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
 262         movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
 263
 264         movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
 265         movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
 266
 267         addps   xmm4,xmm2               ; xmm4=tmp4
 268         movaps  xmm0,xmm7
 269         movaps  xmm3,xmm5
 270         addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
 271         addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
 272         subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
 273         subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
 274
 275         movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
 276         unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
 277         unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
 278         movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
 279         unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
 280         unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
 281
 282         movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
 283         unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
 284         unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
 285         movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
 286         unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
 287         unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
 288
 289         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
 290         movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
 291
 292         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
 293         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 294         movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 295         movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 296
 297         movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
 298         unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
 299         unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
 300         movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
 301         unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
 302         unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
 303
 304         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
 305         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
 306         movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
 307         movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 308
 309 .nextcolumn:
 310         add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
 311         add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
 312         add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
 313         dec     rcx                                     ; ctr
 314         jnz     near .columnloop
 315
 316         ; -- Prefetch the next coefficient block
 317         ; rsi has advanced by 8 coefficients, so +(DCTSIZE2-8) reaches the end of this block.
 318         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
 319         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
 320         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
 321         prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
 322
 323         ; ---- Pass 2: process rows from work array, store into output array.
 324         ; Each iteration produces 4 output rows of 8 samples.
 325         ; rax carries output_col for the stores below; nothing from the saved frame is needed here.
 326         lea     rsi, [workspace]                        ; FAST_FLOAT *wsptr
 327         mov     rdi, r12        ; (JSAMPROW *)
 328         mov     eax, r13d       ; output_col; the 32-bit write zero-extends into rax
 329         mov     rcx, DCTSIZE/4                          ; ctr
 330 .rowloop:
 331
 332         ; -- Even part
 333
 334         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
 335         movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
 336         movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
 337         movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
 338
 339         movaps  xmm4,xmm0
 340         movaps  xmm5,xmm1
 341         subps   xmm0,xmm2               ; xmm0=tmp11
 342         subps   xmm1,xmm3
 343         addps   xmm4,xmm2               ; xmm4=tmp10
 344         addps   xmm5,xmm3               ; xmm5=tmp13
 345
 346         mulps   xmm1,[rel PD_1_414]
 347         subps   xmm1,xmm5               ; xmm1=tmp12
 348
 349         movaps  xmm6,xmm4
 350         movaps  xmm7,xmm0
 351         subps   xmm4,xmm5               ; xmm4=tmp3
 352         subps   xmm0,xmm1               ; xmm0=tmp2
 353         addps   xmm6,xmm5               ; xmm6=tmp0
 354         addps   xmm7,xmm1               ; xmm7=tmp1
 355
 356         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
 357         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
 358
 359         ; -- Odd part
 360
 361         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
 362         movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
 363         movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
 364         movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
 365
 366         movaps  xmm4,xmm2
 367         movaps  xmm0,xmm5
 368         addps   xmm2,xmm1               ; xmm2=z11
 369         addps   xmm5,xmm3               ; xmm5=z13
 370         subps   xmm4,xmm1               ; xmm4=z12
 371         subps   xmm0,xmm3               ; xmm0=z10
 372
 373         movaps  xmm1,xmm2
 374         subps   xmm2,xmm5
 375         addps   xmm1,xmm5               ; xmm1=tmp7
 376
 377         mulps   xmm2,[rel PD_1_414]     ; xmm2=tmp11
 378
 379         movaps  xmm3,xmm0
 380         addps   xmm0,xmm4
 381         mulps   xmm0,[rel PD_1_847]     ; xmm0=z5
 382         mulps   xmm3,[rel PD_M2_613]    ; xmm3=(z10 * -2.613125930)
 383         mulps   xmm4,[rel PD_1_082]     ; xmm4=(z12 * 1.082392200)
 384         addps   xmm3,xmm0               ; xmm3=tmp12
 385         subps   xmm4,xmm0               ; xmm4=tmp10
 386
 387         ; -- Final output stage
 388
 389         subps   xmm3,xmm1               ; xmm3=tmp6
 390         movaps  xmm5,xmm6
 391         movaps  xmm0,xmm7
 392         addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
 393         addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
 394         subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
 395         subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
 396         subps   xmm2,xmm3               ; xmm2=tmp5
 397         ; Adding the magic constant leaves round(x/8) in the low 16 bits of each dword; "**" marks the unused high half.
 398         movaps  xmm1,[rel PD_RNDINT_MAGIC]      ; xmm1=[rel PD_RNDINT_MAGIC]
 399         pcmpeqd xmm3,xmm3
 400         psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 401
 402         addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
 403         addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
 404         addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
 405         addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
 406
 407         pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
 408         pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
 409         pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
 410         pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
 411         por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
 412         por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
 413
 414         movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
 415         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
 416
 417         addps   xmm4,xmm2               ; xmm4=tmp4
 418         movaps  xmm7,xmm1
 419         movaps  xmm5,xmm3
 420         addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
 421         addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
 422         subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
 423         subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
 424
 425         movaps  xmm2,[rel PD_RNDINT_MAGIC]      ; xmm2=[rel PD_RNDINT_MAGIC]
 426         pcmpeqd xmm4,xmm4
 427         psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 428
 429         addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
 430         addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
 431         addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
 432         addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
 433
 434         pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
 435         pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
 436         pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
 437         pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
 438         por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
 439         por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
 440
 441         movdqa    xmm2,[rel PB_CENTERJSAMP]     ; xmm2=[rel PB_CENTERJSAMP]
 442         ; Saturate the words to signed bytes, then add the centering bias byte-wise.
 443         packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
 444         packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
 445         paddb     xmm6,xmm2
 446         paddb     xmm1,xmm2
 447
 448         movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
 449         punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
 450         punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
 451
 452         movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
 453         punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
 454         punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
 455
 456         pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
 457         pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 458         ; Store the low 8 bytes of each register at column offset rax of output rows 0..3.
 459         mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
 460         mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
 461         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
 462         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
 463         mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
 464         mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
 465         movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
 466         movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
 467
 468         add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
 469         add     rdi, byte 4*SIZEOF_JSAMPROW
 470         dec     rcx                             ; ctr
 471         jnz     near .rowloop
 472
 473         pop     rbx
 474         uncollect_args
 475         mov     rsp,rbp         ; rsp <- aligned rbp
 476         pop     rsp             ; rsp <- unaligned rsp saved in the prologue
 477         pop     rbp
 478         ret
 479
 480 ; For some reason, the OS X linker does not honor the request to align the
 481 ; segment unless we do this.
 482         align   16