simd/x86_64/jidctflt-sse2.asm

   1 ;
   2 ; jidctflt-sse2.asm - floating-point IDCT (64-bit SSE & SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ; Copyright (C) 2018, Matthias Räncker.
   7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
   8 ;
   9 ; Based on the x86 SIMD extension for IJG JPEG library
  10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  12 ;
  13 ; This file should be assembled with NASM (Netwide Assembler); it
  14 ; can *not* be assembled with Microsoft's MASM or any compatible
  15 ; assembler (including Borland's Turbo Assembler).
  16 ; NASM is available from http://nasm.sourceforge.net/ or
  17 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  18 ;
  19 ; This file contains a floating-point implementation of the inverse DCT
  20 ; (Discrete Cosine Transform). The following code is based directly on
  21 ; the IJG's original jidctflt.c; see jidctflt.c for more details.
  22
  23 %include "jsimdext.inc"
  24 %include "jdct.inc"
  25
  26 ; --------------------------------------------------------------------------
  27
  28 %macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
         ; Two-operand helper used by the 4x4 float transposes below.
         ; shufps imm8 0x44 keeps elements 0,1 of %1 in the low half of %1 and
         ; copies elements 0,1 of %2 into the high half.  %2 is not modified.
  29     shufps      %1, %2, 0x44
  30 %endmacro
  31
  32 %macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
         ; Counterpart of unpcklps2 for the upper halves.
         ; shufps imm8 0xEE moves elements 2,3 of %1 into the low half of %1 and
         ; copies elements 2,3 of %2 into the high half.  %2 is not modified.
  33     shufps      %1, %2, 0xEE
  34 %endmacro
  35
  36 ; --------------------------------------------------------------------------
  37     SECTION     SEG_CONST
  38
  39     alignz      32
  40     GLOBAL_DATA(jconst_idct_float_sse2)
  41
  42 EXTN(jconst_idct_float_sse2):
  43
     ; Constant table for jsimd_idct_float_sse2.  Every entry is 16 bytes wide
     ; (4 packed floats or 16 bytes) so it can be used directly as an XMM memory
     ; operand; the code below addresses each entry RIP-relative ([rel ...]).
     ;
     ;   PD_1_414        sqrt(2); scales (in2 - in6) in the even part and
     ;                   (z11 - z13) in the odd part.
     ;   PD_1_847        multiplier applied to (z10 + z12) to form z5.
     ;   PD_1_082        multiplier applied to z12 when forming the odd-part tmp10.
     ;   PD_M2_613       (negative) multiplier applied to z10 when forming the
     ;                   odd-part tmp12.
     ;   PD_RNDINT_MAGIC 1.5 * 2^26.  At this magnitude one single-precision ulp
     ;                   is 8, so adding the constant to a sample x rounds the
     ;                   sum to a multiple of 8 and leaves round(x/8) in the low
     ;                   bits of the float's bit pattern.  Pass 2 extracts the
     ;                   low 16 bits of each lane to obtain that integer.
     ;   PB_CENTERJSAMP  CENTERJSAMPLE replicated in every byte; added with
     ;                   paddb after the signed-saturating pack in pass 2.
  44 PD_1_414        times 4  dd  1.414213562373095048801689
  45 PD_1_847        times 4  dd  1.847759065022573512256366
  46 PD_1_082        times 4  dd  1.082392200292393968799446
  47 PD_M2_613       times 4  dd -2.613125929752753055713286
  48 PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
  49 PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
  50
  51     alignz      32
  52
  53 ; --------------------------------------------------------------------------
  54     SECTION     SEG_TEXT
  55     BITS        64
  56 ;
  57 ; Perform dequantization and inverse DCT on one block of coefficients.
  58 ;
  59 ; GLOBAL(void)
  60 ; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
  61 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
  62 ;
     ; Overview of the implementation below:
     ;
     ;   Pass 1 walks the coefficient block four columns at a time
     ;   (DCTSIZE/4 iterations).  Each group of four coefficients is widened
     ;   from words to sign-extended dwords, converted to float, multiplied by
     ;   the matching dct_table entries, run through a 1-D IDCT and stored
     ;   *transposed* into a float workspace on the stack.
     ;
     ;   Pass 2 walks that workspace four output rows at a time
     ;   (DCTSIZE/4 iterations), runs the same 1-D IDCT, converts the results
     ;   to integers via PD_RNDINT_MAGIC, packs them to bytes with signed
     ;   saturation, adds PB_CENTERJSAMP and stores 8 samples per row at
     ;   output_buf[row] + output_col.
     ;
     ; Stack frame (addresses grow downward):
     ;
     ;   [rbp]        saved rbp
     ;   [rbp-8]      saved r15
     ;   r15          rsp after 16-byte alignment; base for wk()/workspace
     ;   wk(1), wk(0) two XMMWORD scratch slots directly below r15
     ;   workspace    DCTSIZE2 FAST_FLOATs directly below wk(0)
     ;
     ; rbx and r15 are saved and restored here.  Argument registers are set up
     ; and torn down by collect_args / uncollect_args, which are defined in the
     ; included files and not visible in this listing.
  63
  64 ; r10 = void *dct_table
  65 ; r11 = JCOEFPTR coef_block
  66 ; r12 = JSAMPARRAY output_buf
  67 ; r13d = JDIMENSION output_col
  68
  69 %define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
  70                                         ; xmmword wk[WK_NUM]
  71 %define WK_NUM        2
  72 %define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
  73                                         ; FAST_FLOAT workspace[DCTSIZE2]
  74
  75     align       32
  76     GLOBAL_FUNCTION(jsimd_idct_float_sse2)
  77
  78 EXTN(jsimd_idct_float_sse2):
  79     push        rbp
  80     mov         rbp, rsp
  81     push        r15
  82     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
  83     ; Allocate stack space for wk array.  r15 is used to access it.
  84     mov         r15, rsp
         ; Move rsp below both wk[] and the workspace so later pushes cannot
         ; overwrite them.
  85     lea         rsp, [workspace]
  86     collect_args 4
         ; rbx is used in pass 2 as a second output-row pointer.
  87     push        rbx
  88
  89     ; ---- Pass 1: process columns from input, store into work array.
  90
  91     mov         rdx, r10                ; quantptr
  92     mov         rsi, r11                ; inptr
  93     lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
  94     mov         rcx, DCTSIZE/4          ; ctr
  95 .columnloop:
  96 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
         ; Shortcut for column groups whose AC terms are all zero.
         ; First a cheap scalar pre-check on one dword from each of rows 1 and 2
         ; (DWBLOCK is an addressing macro from the include files; presumably it
         ; selects the first dword of the given row -- confirm in jsimdext.inc).
  97     mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
  98     or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
  99     jnz         near .columnDCT
 100
         ; Full check: OR together the four coefficients of rows 1..7.
 101     movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 102     movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 103     movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 104     movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 105     movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 106     movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 107     movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 108     por         xmm1, xmm2
 109     por         xmm3, xmm4
 110     por         xmm5, xmm6
 111     por         xmm1, xmm3
 112     por         xmm5, xmm7
 113     por         xmm1, xmm5
         ; Saturating pack maps a non-zero word to a non-zero byte, so the four
         ; words fit into the dword read by movd.  movd into eax zero-extends
         ; into rax, so the 64-bit test sees only those four bytes.
 114     packsswb    xmm1, xmm1
 115     movd        eax, xmm1
 116     test        rax, rax
 117     jnz         short .columnDCT
 118
 119     ; -- AC terms all zero
 120
         ; Only the DC term contributes: dequantize it for the four columns and
         ; replicate each column's value across all eight entries of that
         ; column's (transposed) workspace row.
 121     movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 122
 123     punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
 124     psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
 125     cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
 126
 127     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 128
 129     movaps      xmm1, xmm0
 130     movaps      xmm2, xmm0
 131     movaps      xmm3, xmm0
 132
 133     shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
 134     shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
 135     shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
 136     shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
 137
 138     movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 139     movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
 140     movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 141     movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
 142     movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
 143     movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
 144     movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 145     movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 146     jmp         near .nextcolumn
 147 %endif
 148 .columnDCT:
 149
 150     ; -- Even part
 151
         ; Load rows 0,2,4,6 (four columns each), widen the words to
         ; sign-extended dwords (duplicate each word, then arithmetic shift
         ; right), convert to float and dequantize.
 152     movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
 153     movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
 154     movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
 155     movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
 156
 157     punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
 158     punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
 159     psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
 160     psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
 161     cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
 162     cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
 163
 164     punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
 165     punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
 166     psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
 167     psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
 168     cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
 169     cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
 170
 171     mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 172     mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 173     mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 174     mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 175
 176     movaps      xmm4, xmm0
 177     movaps      xmm5, xmm1
 178     subps       xmm0, xmm2              ; xmm0=tmp11
 179     subps       xmm1, xmm3
 180     addps       xmm4, xmm2              ; xmm4=tmp10
 181     addps       xmm5, xmm3              ; xmm5=tmp13
 182
 183     mulps       xmm1, [rel PD_1_414]
 184     subps       xmm1, xmm5              ; xmm1=tmp12
 185
 186     movaps      xmm6, xmm4
 187     movaps      xmm7, xmm0
 188     subps       xmm4, xmm5              ; xmm4=tmp3
 189     subps       xmm0, xmm1              ; xmm0=tmp2
 190     addps       xmm6, xmm5              ; xmm6=tmp0
 191     addps       xmm7, xmm1              ; xmm7=tmp1
 192
         ; All eight XMM registers are needed for the odd part; spill tmp2/tmp3
         ; to the wk[] scratch slots.  tmp0/tmp1 stay live in xmm6/xmm7.
 193     movaps      XMMWORD [wk(1)], xmm4   ; tmp3
 194     movaps      XMMWORD [wk(0)], xmm0   ; tmp2
 195
 196     ; -- Odd part
 197
 198     movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
 199     movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
 200     movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
 201     movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
 202
 203     punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
 204     punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
 205     psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
 206     psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
 207     cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
 208     cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
 209
 210     punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
 211     punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
 212     psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
 213     psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
 214     cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
 215     cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
 216
 217     mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 218     mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 219     mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 220     mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
 221
 222     movaps      xmm4, xmm2
 223     movaps      xmm0, xmm5
 224     addps       xmm2, xmm1              ; xmm2=z11
 225     addps       xmm5, xmm3              ; xmm5=z13
 226     subps       xmm4, xmm1              ; xmm4=z12
 227     subps       xmm0, xmm3              ; xmm0=z10
 228
 229     movaps      xmm1, xmm2
 230     subps       xmm2, xmm5
 231     addps       xmm1, xmm5              ; xmm1=tmp7
 232
 233     mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
 234
 235     movaps      xmm3, xmm0
 236     addps       xmm0, xmm4
 237     mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
 238     mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
 239     mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
 240     addps       xmm3, xmm0              ; xmm3=tmp12
 241     subps       xmm4, xmm0              ; xmm4=tmp10
 242
 243     ; -- Final output stage
 244
         ; Combine even and odd halves into data0..data7, then transpose the
         ; 8x4 result so that each column's eight outputs become one contiguous
         ; workspace row.
 245     subps       xmm3, xmm1              ; xmm3=tmp6
 246     movaps      xmm5, xmm6
 247     movaps      xmm0, xmm7
 248     addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
 249     addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
 250     subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
 251     subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
 252     subps       xmm2, xmm3              ; xmm2=tmp5
 253
 254     movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
 255     unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
 256     unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
 257     movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
 258     unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
 259     unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
 260
         ; Swap the spilled tmp2/tmp3 back in and park the half-transposed
         ; rows 6/7 in the same two scratch slots.
 261     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
 262     movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
 263
 264     movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
 265     movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
 266
 267     addps       xmm4, xmm2              ; xmm4=tmp4
 268     movaps      xmm0, xmm7
 269     movaps      xmm3, xmm5
 270     addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
 271     addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
 272     subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
 273     subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
 274
 275     movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
 276     unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
 277     unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
 278     movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
 279     unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
 280     unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
 281
 282     movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
 283     unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
 284     unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
 285     movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
 286     unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
 287     unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
 288
 289     movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
 290     movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
 291
         ; Workspace rows 0..3 (relative to rdi), first four entries each.
 292     movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
 293     movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
 294     movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
 295     movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
 296
 297     movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
 298     unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
 299     unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
 300     movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
 301     unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
 302     unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
 303
         ; Workspace rows 0..3 (relative to rdi), last four entries each.
 304     movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
 305     movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
 306     movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
 307     movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
 308
 309 .nextcolumn:
         ; Advance input and quantization pointers by four columns and the
         ; workspace pointer by four full rows (the output is transposed).
 310     add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
 311     add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
 312     add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
 313     dec         rcx                                    ; ctr
 314     jnz         near .columnloop
 315
 316     ; -- Prefetch the next coefficient block
 317
         ; rsi has advanced by 8 coefficients in total, so adding
         ; (DCTSIZE2-8)*SIZEOF_JCOEF addresses the first byte past this block.
         ; Four hints at 32-byte steps cover 128 bytes from there.
 318     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
 319     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
 320     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
 321     prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
 322
 323     ; ---- Pass 2: process rows from work array, store into output array.
 324
 325     lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
 326     mov         rdi, r12                ; (JSAMPROW *)
         ; Writing eax zero-extends output_col into rax, which is used as a
         ; 64-bit index in the stores at the end of the loop.
 327     mov         eax, r13d
 328     mov         rcx, DCTSIZE/4          ; ctr
 329 .rowloop:
 330
 331     ; -- Even part
 332
         ; Workspace row k holds column k from pass 1, so each load below
         ; fetches element k of four consecutive output rows (one per lane).
 333     movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
 334     movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
 335     movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
 336     movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
 337
 338     movaps      xmm4, xmm0
 339     movaps      xmm5, xmm1
 340     subps       xmm0, xmm2              ; xmm0=tmp11
 341     subps       xmm1, xmm3
 342     addps       xmm4, xmm2              ; xmm4=tmp10
 343     addps       xmm5, xmm3              ; xmm5=tmp13
 344
 345     mulps       xmm1, [rel PD_1_414]
 346     subps       xmm1, xmm5              ; xmm1=tmp12
 347
 348     movaps      xmm6, xmm4
 349     movaps      xmm7, xmm0
 350     subps       xmm4, xmm5              ; xmm4=tmp3
 351     subps       xmm0, xmm1              ; xmm0=tmp2
 352     addps       xmm6, xmm5              ; xmm6=tmp0
 353     addps       xmm7, xmm1              ; xmm7=tmp1
 354
         ; Spill tmp2/tmp3 while the odd part uses every XMM register.
 355     movaps      XMMWORD [wk(1)], xmm4   ; tmp3
 356     movaps      XMMWORD [wk(0)], xmm0   ; tmp2
 357
 358     ; -- Odd part
 359
 360     movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
 361     movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
 362     movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
 363     movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
 364
 365     movaps      xmm4, xmm2
 366     movaps      xmm0, xmm5
 367     addps       xmm2, xmm1              ; xmm2=z11
 368     addps       xmm5, xmm3              ; xmm5=z13
 369     subps       xmm4, xmm1              ; xmm4=z12
 370     subps       xmm0, xmm3              ; xmm0=z10
 371
 372     movaps      xmm1, xmm2
 373     subps       xmm2, xmm5
 374     addps       xmm1, xmm5              ; xmm1=tmp7
 375
 376     mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
 377
 378     movaps      xmm3, xmm0
 379     addps       xmm0, xmm4
 380     mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
 381     mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
 382     mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
 383     addps       xmm3, xmm0              ; xmm3=tmp12
 384     subps       xmm4, xmm0              ; xmm4=tmp10
 385
 386     ; -- Final output stage
 387
 388     subps       xmm3, xmm1              ; xmm3=tmp6
 389     movaps      xmm5, xmm6
 390     movaps      xmm0, xmm7
 391     addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
 392     addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
 393     subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
 394     subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
 395     subps       xmm2, xmm3              ; xmm2=tmp5
 396
         ; Float -> integer without a convert instruction: adding the magic
         ; constant leaves round(x/8) in the low bits of each lane's bit
         ; pattern.  The mask keeps the low word of the even-numbered samples;
         ; the shift moves the low word of the odd-numbered samples into the
         ; high word, so one por interleaves two samples per dword.
 397     movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
 398     pcmpeqd     xmm3, xmm3
 399     psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 400
 401     addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
 402     addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
 403     addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
 404     addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
 405
 406     pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
 407     pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
 408     pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
 409     pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
 410     por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
 411     por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
 412
 413     movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
 414     movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
 415
 416     addps       xmm4, xmm2              ; xmm4=tmp4
 417     movaps      xmm7, xmm1
 418     movaps      xmm5, xmm3
 419     addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
 420     addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
 421     subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
 422     subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
 423
         ; Same float -> word conversion for samples 2..5; the constant and the
         ; mask are rebuilt because xmm1/xmm3 were reused above.
 424     movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
 425     pcmpeqd     xmm4, xmm4
 426     psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
 427
 428     addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
 429     addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
 430     addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
 431     addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
 432
 433     pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
 434     pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
 435     pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
 436     pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
 437     por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
 438     por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
 439
 440     movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
 441
         ; packsswb saturates each word to a signed byte; adding CENTERJSAMPLE
         ; to every byte (wrapping paddb) then re-centers the samples.
 442     packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
 443     packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
 444     paddb       xmm6, xmm2
 445     paddb       xmm1, xmm2
 446
         ; Byte-level transpose so that each 8-byte half holds one output row.
 447     movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
 448     punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
 449     punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
 450
 451     movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
 452     punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
 453     punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
 454
         ; 0x4E swaps the two 64-bit halves, bringing rows 1 and 3 into the
         ; low quadword for the movq stores below.
 455     pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
 456     pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 457
         ; Fetch each row pointer from output_buf and store 8 samples at
         ; row + output_col (rax).  Rows 0 and 2 first, then rows 1 and 3.
 458     mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
 459     mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
 460     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
 461     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
 462     mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
 463     mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
 464     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
 465     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
 466
         ; Next group of four rows: four floats further along every workspace
         ; row, four pointers further along output_buf.
 467     add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
 468     add         rdi, byte 4*SIZEOF_JSAMPROW
 469     dec         rcx                            ; ctr
 470     jnz         near .rowloop
 471
         ; Epilogue: undo the prologue in reverse order.  rsp is reloaded from
         ; rbp-8 (the saved-r15 slot) because the aligned frame size is not a
         ; fixed offset from the entry rsp.
 472     pop         rbx
 473     uncollect_args 4
 474     lea         rsp, [rbp-8]
 475     pop         r15
 476     pop         rbp
 477     ret
 478
 479 ; For some reason, the OS X linker does not honor the request to align the
 480 ; segment unless we do this.
 481     align       32