simd/jiss2flt.asm

   1 ;
   2 ; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ;
   6 ; Based on
   7 ; x86 SIMD extension for IJG JPEG library
   8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
   9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10 ;
   11 ; This file should be assembled with NASM (Netwide Assembler); it
   12 ; can *not* be assembled with Microsoft's MASM or any compatible
   13 ; assembler (including Borland's Turbo Assembler).
  14 ; NASM is available from http://nasm.sourceforge.net/ or
  15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16 ;
  17 ; This file contains a floating-point implementation of the inverse DCT
  18 ; (Discrete Cosine Transform). The following code is based directly on
  19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
  20 ;
  21 ; [TAB8]
  22
  23 %include "jsimdext.inc"
  24 %include "jdct.inc"
  25
  26 ; --------------------------------------------------------------------------
  27
   28 %macro  unpcklps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
      ; Combine the low float pairs of both operands into %1:
      ; lanes 0-1 of the result come from %1, lanes 2-3 from lanes 0-1 of %2.
      ; Only %1 is written. Used below as the second transpose step.
   29         shufps  %1,%2,0x44      ; imm 0x44 = selectors (0,1) from %1, then (0,1) from %2
   30 %endmacro
  31
   32 %macro  unpckhps2 2     ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
      ; Combine the high float pairs of both operands into %1:
      ; lanes 0-1 of the result come from lanes 2-3 of %1, lanes 2-3 from
      ; lanes 2-3 of %2. Only %1 is written.
   33         shufps  %1,%2,0xEE      ; imm 0xEE = selectors (2,3) from %1, then (2,3) from %2
   34 %endmacro
  35
  36 ; --------------------------------------------------------------------------
   37         SECTION SEG_CONST
   38
      ; Constant table for jsimd_idct_float_sse2. The code reads every entry
      ; as a full 16-byte memory operand through GOTOFF(ebx,...), so each
      ; float constant is replicated four times (one per XMM lane).
   39         alignz  16
   40         global  EXTN(jconst_idct_float_sse2)
   41
   42 EXTN(jconst_idct_float_sse2):
   43
      ; Multipliers of the 1-D IDCT (same values in both passes):
      ;   PD_1_414  (~sqrt(2))    scales the even-part and odd-part differences
      ;   PD_1_847                scales (z10 + z12) to form z5
      ;   PD_1_082                scales z12
      ;   PD_M2_613 (negative)    scales z10
   44 PD_1_414        times 4 dd  1.414213562373095048801689
   45 PD_1_847        times 4 dd  1.847759065022573512256366
   46 PD_1_082        times 4 dd  1.082392200292393968799446
   47 PD_M2_613       times 4 dd -2.613125929752753055713286
      ; Added to each result in pass 2 so that the low 16 bits of the float's
      ; bit pattern hold roundint(value/8); see the masking after the addps
      ; instructions in the row loop.
   48 PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
      ; Per-byte bias added with paddb after the results are packed to bytes.
   49 PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
   50
   51         alignz  16
  52
  53 ; --------------------------------------------------------------------------
   54         SECTION SEG_TEXT
   55         BITS    32
   56 ;
   57 ; Perform dequantization and inverse DCT on one block of coefficients.
   58 ;
   59 ; GLOBAL(void)
   60 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
   61 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
   62 ;
      ; Overview:
      ;   Pass 1 takes four columns of 16-bit coefficients per iteration,
      ;   converts them to float, multiplies them by the matching dct_table
      ;   entries, runs the 1-D IDCT and stores the result transposed into an
      ;   on-stack FAST_FLOAT workspace.
      ;   Pass 2 takes four workspace rows per iteration, runs the 1-D IDCT
      ;   again, rounds with a /8 descale, packs to bytes with signed
      ;   saturation, adds CENTERJSAMPLE and stores 8 samples into each of
      ;   four output rows, starting at column output_col.
      ;
      ; Arguments are read from the stack (32-bit code), see the %defines below.
      ; dct_table is used directly as a 16-byte mulps memory operand.
      ;
      ; Registers: ebx, esi and edi are saved and restored. eax, ecx, edx and
      ; xmm0-xmm7 are overwritten and not restored.
      ;
   63
      ; Argument offsets relative to the frame base b ("original ebp"):
      ; [b+0] = caller's ebp (pushed at entry), [b+4] = return address.
   64 %define dct_table(b)    (b)+8                   ; void * dct_table
   65 %define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
   66 %define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
   67 %define output_col(b)   (b)+20          ; JDIMENSION output_col
   68
      ; Locals, addressed from the 16-byte-aligned ebp set up in the prologue:
      ;   [ebp+0]          saved frame base (original_ebp)
      ;   wk(0) .. wk(1)   two XMM-sized spill slots directly below ebp
      ;   workspace        DCTSIZE2 FAST_FLOATs directly below wk(0)
   69 %define original_ebp    ebp+0
   70 %define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
   71 %define WK_NUM          2
   72 %define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
   73                                         ; FAST_FLOAT workspace[DCTSIZE2]
   74
   75         align   16
   76         global  EXTN(jsimd_idct_float_sse2)
   77
   78 EXTN(jsimd_idct_float_sse2):
              ; Prologue: build an aligned frame while remembering the unaligned
              ; frame base so that arguments and the caller's stack can be found.
   79         push    ebp
   80         mov     eax,esp                         ; eax = "original ebp": frame base, [eax] = caller's ebp
   81         sub     esp, byte 4                     ; room for the saved frame base
   82         and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
   83         mov     [esp],eax                       ; [original_ebp] = frame base
   84         mov     ebp,esp                         ; ebp = aligned ebp
   85         lea     esp, [workspace]                ; reserve wk[] and workspace below ebp
   86         push    ebx
   87 ;       push    ecx             ; need not be preserved
   88 ;       push    edx             ; need not be preserved
   89         push    esi
   90         push    edi
   91
   92         get_GOT ebx             ; get GOT address
   93
   94         ; ---- Pass 1: process columns from input, store into work array.
   95
              ; eax still holds the frame base from the prologue, so the reload
              ; below is disabled (this relies on get_GOT leaving eax intact;
              ; get_GOT is defined in an include file).
   96 ;       mov     eax, [original_ebp]
   97         mov     edx, POINTER [dct_table(eax)]   ; quantptr
   98         mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
   99         lea     edi, [workspace]                        ; FAST_FLOAT * wsptr
  100         mov     ecx, DCTSIZE/4                          ; ctr
  101         alignx  16,7
  102 .columnloop:
  103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
              ; Shortcut test: if every AC coefficient of these four columns is
              ; zero, the IDCT output is just the dequantized DC term.
              ; First a cheap check on one dword each from rows 1 and 2, which
              ; sends most non-trivial blocks straight to the full transform.
  104         mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
  105         or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
  106         jnz     near .columnDCT
  107
              ; Full check: OR together the four coefficients of rows 1..7.
  108         movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  109         movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  110         movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  111         movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  112         movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  113         movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  114         movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  115         por     xmm1,xmm2
  116         por     xmm3,xmm4
  117         por     xmm5,xmm6
  118         por     xmm1,xmm3
  119         por     xmm5,xmm7
  120         por     xmm1,xmm5
  121         packsswb xmm1,xmm1      ; 4 words -> 4 bytes; saturation keeps nonzero values nonzero
  122         movd    eax,xmm1        ; so that all four fit in one 32-bit test
  123         test    eax,eax
  124         jnz     short .columnDCT
  125
  126         ; -- AC terms all zero
  127
  128         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  129
              ; Duplicate each word, then arithmetic-shift the dwords right:
              ; this sign-extends the four 16-bit coefficients to 32 bits.
  130         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
  131         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
  132         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
  133
  134         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  135
  136         movaps  xmm1,xmm0
  137         movaps  xmm2,xmm0
  138         movaps  xmm3,xmm0
  139
              ; Broadcast each column's DC value across a whole register ...
  140         shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
  141         shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
  142         shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
  143         shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
  144
              ; ... and fill both halves of the four (transposed) workspace rows.
  145         movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
  146         movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
  147         movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
  148         movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
  149         movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
  150         movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
  151         movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
  152         movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
  153         jmp     near .nextcolumn
  154         alignx  16,7
  155 %endif
  156 .columnDCT:
  157
  158         ; -- Even part
  159
              ; Load rows 0,2,4,6 (four coefficients each), sign-extend to
              ; 32 bits, convert to float and dequantize.
  160         movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
  161         movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
  162         movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
  163         movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
  164
  165         punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
  166         punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
  167         psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
  168         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
  169         cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
  170         cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
  171
  172         punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
  173         punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
  174         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
  175         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
  176         cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
  177         cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
  178
  179         mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  180         mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  181         mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  182         mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  183
  184         movaps  xmm4,xmm0
  185         movaps  xmm5,xmm1
  186         subps   xmm0,xmm2               ; xmm0=tmp11
  187         subps   xmm1,xmm3
  188         addps   xmm4,xmm2               ; xmm4=tmp10
  189         addps   xmm5,xmm3               ; xmm5=tmp13
  190
  191         mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
  192         subps   xmm1,xmm5               ; xmm1=tmp12
  193
  194         movaps  xmm6,xmm4
  195         movaps  xmm7,xmm0
  196         subps   xmm4,xmm5               ; xmm4=tmp3
  197         subps   xmm0,xmm1               ; xmm0=tmp2
  198         addps   xmm6,xmm5               ; xmm6=tmp0
  199         addps   xmm7,xmm1               ; xmm7=tmp1
  200
              ; All eight XMM registers are needed for the odd part: spill
              ; tmp2/tmp3, keep tmp0/tmp1 live in xmm6/xmm7.
  201         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
  202         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
  203
  204         ; -- Odd part
  205
  206         movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
  207         movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
  208         movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
  209         movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
  210
  211         punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
  212         punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
  213         psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
  214         psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
  215         cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
  216         cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
  217
  218         punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
  219         punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
  220         psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
  221         psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
  222         cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
  223         cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
  224
  225         mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  226         mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  227         mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  228         mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
  229
  230         movaps  xmm4,xmm2
  231         movaps  xmm0,xmm5
  232         addps   xmm2,xmm1               ; xmm2=z11
  233         addps   xmm5,xmm3               ; xmm5=z13
  234         subps   xmm4,xmm1               ; xmm4=z12
  235         subps   xmm0,xmm3               ; xmm0=z10
  236
  237         movaps  xmm1,xmm2
  238         subps   xmm2,xmm5
  239         addps   xmm1,xmm5               ; xmm1=tmp7
  240
  241         mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
  242
  243         movaps  xmm3,xmm0
  244         addps   xmm0,xmm4
  245         mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
  246         mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
  247         mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
  248         addps   xmm3,xmm0               ; xmm3=tmp12
  249         subps   xmm4,xmm0               ; xmm4=tmp10
  250
  251         ; -- Final output stage
  252
  253         subps   xmm3,xmm1               ; xmm3=tmp6
  254         movaps  xmm5,xmm6
  255         movaps  xmm0,xmm7
  256         addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
  257         addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
  258         subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
  259         subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
  260         subps   xmm2,xmm3               ; xmm2=tmp5
  261
              ; The 4x8 result is transposed on the fly so that pass 2 can again
              ; work on four independent lanes. Phase 1 interleaves row pairs.
  262         movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
  263         unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
  264         unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
  265         movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
  266         unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
  267         unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
  268
              ; Swap through the spill slots: reload tmp2/tmp3 and park the
              ; half-transposed rows 6/7 in their place.
  269         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
  270         movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
  271
  272         movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
  273         movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
  274
  275         addps   xmm4,xmm2               ; xmm4=tmp4
  276         movaps  xmm0,xmm7
  277         movaps  xmm3,xmm5
  278         addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
  279         addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
  280         subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
  281         subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
  282
  283         movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
  284         unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
  285         unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
  286         movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
  287         unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
  288         unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
  289
              ; Phase 2 merges 64-bit pairs with the shufps-based macros.
  290         movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
  291         unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
  292         unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
  293         movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
  294         unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
  295         unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
  296
  297         movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
  298         movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
  299
              ; First half (input rows 0-3) of the four transposed workspace rows.
  300         movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
  301         movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
  302         movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
  303         movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
  304
  305         movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
  306         unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
  307         unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
  308         movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
  309         unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
  310         unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
  311
              ; Second half (input rows 4-7) of the same workspace rows.
  312         movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
  313         movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
  314         movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
  315         movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
  316
  317 .nextcolumn:
              ; Advance by four input columns, which is four workspace rows.
  318         add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
  319         add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
  320         add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
  321         dec     ecx                                     ; ctr
  322         jnz     near .columnloop
  323
  324         ; -- Prefetch the next coefficient block
  325
              ; esi has advanced by 8 coefficients in total, so
              ; esi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first byte after this block.
  326         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
  327         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
  328         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
  329         prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
  330
  331         ; ---- Pass 2: process rows from work array, store into output array.
  332
  333         mov     eax, [original_ebp]                     ; eax = frame base (to reach the arguments)
  334         lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
  335         mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
  336         mov     eax, JDIMENSION [output_col(eax)]       ; eax = output_col from here on
  337         mov     ecx, DCTSIZE/4                          ; ctr
  338         alignx  16,7
  339 .rowloop:
  340
  341         ; -- Even part
  342
  343         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
  344         movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
  345         movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
  346         movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
  347
  348         movaps  xmm4,xmm0
  349         movaps  xmm5,xmm1
  350         subps   xmm0,xmm2               ; xmm0=tmp11
  351         subps   xmm1,xmm3
  352         addps   xmm4,xmm2               ; xmm4=tmp10
  353         addps   xmm5,xmm3               ; xmm5=tmp13
  354
  355         mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
  356         subps   xmm1,xmm5               ; xmm1=tmp12
  357
  358         movaps  xmm6,xmm4
  359         movaps  xmm7,xmm0
  360         subps   xmm4,xmm5               ; xmm4=tmp3
  361         subps   xmm0,xmm1               ; xmm0=tmp2
  362         addps   xmm6,xmm5               ; xmm6=tmp0
  363         addps   xmm7,xmm1               ; xmm7=tmp1
  364
  365         movaps  XMMWORD [wk(1)], xmm4   ; tmp3
  366         movaps  XMMWORD [wk(0)], xmm0   ; tmp2
  367
  368         ; -- Odd part
  369
  370         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
  371         movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
  372         movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
  373         movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
  374
  375         movaps  xmm4,xmm2
  376         movaps  xmm0,xmm5
  377         addps   xmm2,xmm1               ; xmm2=z11
  378         addps   xmm5,xmm3               ; xmm5=z13
  379         subps   xmm4,xmm1               ; xmm4=z12
  380         subps   xmm0,xmm3               ; xmm0=z10
  381
  382         movaps  xmm1,xmm2
  383         subps   xmm2,xmm5
  384         addps   xmm1,xmm5               ; xmm1=tmp7
  385
  386         mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
  387
  388         movaps  xmm3,xmm0
  389         addps   xmm0,xmm4
  390         mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
  391         mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
  392         mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
  393         addps   xmm3,xmm0               ; xmm3=tmp12
  394         subps   xmm4,xmm0               ; xmm4=tmp10
  395
  396         ; -- Final output stage
  397
  398         subps   xmm3,xmm1               ; xmm3=tmp6
  399         movaps  xmm5,xmm6
  400         movaps  xmm0,xmm7
  401         addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
  402         addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
  403         subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
  404         subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
  405         subps   xmm2,xmm3               ; xmm2=tmp5
  406
              ; Float -> integer without a conversion instruction: adding the
              ; magic constant (1.5 * 2^26) leaves roundint(x/8) in the low 16
              ; bits of each float's bit pattern (rounding follows the current
              ; SSE rounding mode, presumably round-to-nearest). Those low words
              ; are then kept by masking or moved to the high word by shifting.
  407         movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
  408         pcmpeqd xmm3,xmm3
  409         psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
  410
  411         addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
  412         addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
  413         addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
  414         addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
  415
              ; Interleave two results per register as 16-bit words.
  416         pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
  417         pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
  418         pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
  419         pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
  420         por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
  421         por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
  422
  423         movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
  424         movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
  425
  426         addps   xmm4,xmm2               ; xmm4=tmp4
  427         movaps  xmm7,xmm1
  428         movaps  xmm5,xmm3
  429         addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
  430         addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
  431         subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
  432         subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
  433
              ; Same rounding and word interleave for data2..data5.
  434         movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
  435         pcmpeqd xmm4,xmm4
  436         psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
  437
  438         addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
  439         addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
  440         addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
  441         addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
  442
  443         pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
  444         pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
  445         pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
  446         pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
  447         por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
  448         por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
  449
  450         movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
  451
              ; Narrow words to bytes with signed saturation, then add the
              ; CENTERJSAMPLE bias to every byte.
  452         packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
  453         packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
  454         paddb     xmm6,xmm2
  455         paddb     xmm1,xmm2
  456
              ; Regroup the bytes so that each 64-bit half holds one complete
              ; output row of 8 samples.
  457         movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
  458         punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
  459         punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
  460
  461         movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
  462         punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
  463         punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
  464
              ; Swap the 64-bit halves so rows 1 and 3 can also be stored with movq.
  465         pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
  466         pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
  467
              ; ebx is borrowed as a second row pointer while storing.
  468         pushpic ebx                     ; save GOT address
  469
              ; Store 8 samples into each of output rows 0..3 at output_col (eax).
  470         mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
  471         mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
  472         movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
  473         movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
  474         mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
  475         mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
  476         movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
  477         movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
  478
  479         poppic  ebx                     ; restore GOT address
  480
              ; Advance by four workspace columns, which is four output rows.
  481         add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
  482         add     edi, byte 4*SIZEOF_JSAMPROW
  483         dec     ecx                             ; ctr
  484         jnz     near .rowloop
  485
              ; Epilogue: restore saved registers, then unwind the aligned frame
              ; via the frame base stored at [ebp].
  486         pop     edi
  487         pop     esi
  488 ;       pop     edx             ; need not be preserved
  489 ;       pop     ecx             ; need not be preserved
  490         pop     ebx
  491         mov     esp,ebp         ; esp <- aligned ebp
  492         pop     esp             ; esp <- original ebp
  493         pop     ebp             ; ebp <- caller's ebp
  494         ret
  495
  496 ; For some reason, the OS X linker does not honor the request to align the
  497 ; segment unless we do this.
  498         align   16