simd/x86_64/jdsample-sse2.asm

   1 ;
   2 ; jdsample.asm - upsampling (64-bit SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ; Copyright (C) 2018, Matthias Räncker.
   7 ;
   8 ; Based on the x86 SIMD extension for IJG JPEG library
   9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11 ;
  12 ; This file should be assembled with NASM (Netwide Assembler),
  13 ; can *not* be assembled with Microsoft's MASM or any compatible
  14 ; assembler (including Borland's Turbo Assembler).
  15 ; NASM is available from http://nasm.sourceforge.net/ or
  16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17
  18 %include "jsimdext.inc"
  19
  20 ; --------------------------------------------------------------------------
  21     SECTION     SEG_CONST
  22 ; Word constants used by the two fancy (triangle filter) upsamplers below.
  23     alignz      32
  24     GLOBAL_DATA(jconst_fancy_upsample_sse2)
  25
  26 EXTN(jconst_fancy_upsample_sse2):
  27
  28 PW_ONE   times 8 dw 1                   ; h2v1: rounding term, even outputs
  29 PW_TWO   times 8 dw 2                   ; h2v1: rounding term, odd outputs
  30 PW_THREE times 8 dw 3                   ; weight applied to the nearer sample
  31 PW_SEVEN times 8 dw 7                   ; h2v2: rounding term, odd outputs
  32 PW_EIGHT times 8 dw 8                   ; h2v2: rounding term, even outputs
  33
  34     alignz      32
  35
  36 ; --------------------------------------------------------------------------
  37     SECTION     SEG_TEXT
  38     BITS        64
  39 ;
  40 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  41 ;
  42 ; The upsampling algorithm is linear interpolation between pixel centers,
  43 ; also known as a "triangle filter".  This is a good compromise between
  44 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
  45 ; of the way between input pixel centers.
  46 ; out[2i] = (3*in[i] + in[i-1] + 1) >> 2;  out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2
  47 ; GLOBAL(void)
  48 ; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
  49 ;                                JDIMENSION downsampled_width,
  50 ;                                JSAMPARRAY input_data,
  51 ;                                JSAMPARRAY *output_data_ptr);
  52 ;
  53 ; The first and last samples of a row act as their own missing neighbors.
  54 ; r10 = int max_v_samp_factor
  55 ; r11d = JDIMENSION downsampled_width
  56 ; r12 = JSAMPARRAY input_data
  57 ; r13 = JSAMPARRAY *output_data_ptr
  58 ; Only the low 32 bits of r10 and r11 are read (both C arguments are 32-bit).
  59     align       32
  60     GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
  61
  62 EXTN(jsimd_h2v1_fancy_upsample_sse2):
  63     push        rbp
  64     mov         rax, rsp                ; NOTE(review): presumably read by collect_args; confirm in jsimdext.inc
  65     mov         rbp, rsp
  66     collect_args 4
  67
  68     mov         eax, r11d               ; colctr
  69     test        rax, rax
  70     jz          near .return            ; zero-width rows: nothing to do
  71
  72     movsxd      rcx, r10d               ; rowctr (int arg: upper half of r10 is not trusted)
  73     test        rcx, rcx
  74     jz          near .return            ; no rows: nothing to do
  75
  76     mov         rsi, r12                ; input_data
  77     mov         rdi, r13
  78     mov         rdip, JSAMPARRAY [rdi]  ; output_data
  79 .rowloop:
  80     push        rax                     ; colctr
  81     push        rdi
  82     push        rsi
  83
  84     mov         rsip, JSAMPROW [rsi]    ; inptr
  85     mov         rdip, JSAMPROW [rdi]    ; outptr
  86
  87     test        rax, SIZEOF_XMMWORD-1   ; does the row end on a block boundary?
  88     jz          short .skip             ; yes: no padding sample needed
  89     mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]  ; dl = last real sample
  90     mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
  91 .skip:
  92     pxor        xmm0, xmm0              ; xmm0=(all 0's)
  93     pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
  94     psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; mask selecting the lowest byte only
  95     pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; sample 0 stands in for sample -1
  96
  97     add         rax, byte SIZEOF_XMMWORD-1
  98     and         rax, byte -SIZEOF_XMMWORD  ; colctr rounded up to whole blocks
  99     cmp         rax, byte SIZEOF_XMMWORD
 100     ja          short .columnloop       ; more than one block left
 101
 102 .columnloop_last:
 103     pcmpeqb     xmm6, xmm6              ; xmm6=(all 1's)
 104     pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; mask selecting the highest byte only
 105     pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; byte 15 stands in for sample 16
 106     jmp         short .upsample
 107
 108 .columnloop:
 109     movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; next block
 110     pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; its sample 0 moved to byte 15 (= sample 16)
 111
 112 .upsample:
 113     movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 114     movdqa      xmm2, xmm1
 115     movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
 116     pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
 117     psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
 118
 119     por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
 120     por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
 121
 122     movdqa      xmm7, xmm1
 123     psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
 124 ; xmm7 now carries this block's last sample into the next iteration.
 125     movdqa      xmm4, xmm1
 126     punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
 127     punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
 128     movdqa      xmm5, xmm2
 129     punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
 130     punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
 131     movdqa      xmm6, xmm3
 132     punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
 133     punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
 134
 135     pmullw      xmm1, [rel PW_THREE]    ; 3 * current sample
 136     pmullw      xmm4, [rel PW_THREE]
 137     paddw       xmm2, [rel PW_ONE]      ; left neighbor + rounding (even outputs)
 138     paddw       xmm5, [rel PW_ONE]
 139     paddw       xmm3, [rel PW_TWO]      ; right neighbor + rounding (odd outputs)
 140     paddw       xmm6, [rel PW_TWO]
 141
 142     paddw       xmm2, xmm1
 143     paddw       xmm5, xmm4
 144     psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
 145     psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
 146     paddw       xmm3, xmm1
 147     paddw       xmm6, xmm4
 148     psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
 149     psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
 150
 151     psllw       xmm3, BYTE_BIT          ; odd outputs go to the high byte of each word
 152     psllw       xmm6, BYTE_BIT
 153     por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
 154     por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
 155
 156     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 157     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
 158
 159     sub         rax, byte SIZEOF_XMMWORD
 160     add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
 161     add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
 162     cmp         rax, byte SIZEOF_XMMWORD
 163     ja          near .columnloop        ; at least two blocks remain
 164     test        eax, eax
 165     jnz         near .columnloop_last   ; exactly one block remains
 166
 167     pop         rsi
 168     pop         rdi
 169     pop         rax
 170
 171     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
 172     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
 173     dec         rcx                        ; rowctr
 174     jg          near .rowloop
 175
 176 .return:
 177     uncollect_args 4
 178     pop         rbp
 179     ret
 180
 181 ; --------------------------------------------------------------------------
 182 ;
 183 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 184 ; Again a triangle filter; see comments for h2v1 case, above.
 185 ;
 186 ; GLOBAL(void)
 187 ; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
 188 ;                                JDIMENSION downsampled_width,
 189 ;                                JSAMPARRAY input_data,
 190 ;                                JSAMPARRAY *output_data_ptr);
 191 ;
 192 ; Int = 3*row[0] + row[-1 or +1]; out = (3*Int[i] + Int[i-1] + 8) >> 4 or (3*Int[i] + Int[i+1] + 7) >> 4
 193 ; r10 = int max_v_samp_factor
 194 ; r11d = JDIMENSION downsampled_width
 195 ; r12 = JSAMPARRAY input_data
 196 ; r13 = JSAMPARRAY *output_data_ptr
 197 ; Only the low 32 bits of r10 and r11 are read (both C arguments are 32-bit).
 198 %define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 199 %define WK_NUM  4
 200 ; wk(0)/wk(1): left-neighbor word for upper/lower row; wk(2)/wk(3): right-neighbor word.
 201     align       32
 202     GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
 203
 204 EXTN(jsimd_h2v2_fancy_upsample_sse2):
 205     push        rbp
 206     mov         rax, rsp                     ; rax = rsp after the push (points at saved rbp)
 207     sub         rsp, byte 4                  ; step below the saved rbp before aligning
 208     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
 209     mov         [rsp], rax                   ; keep the unaligned rsp for the epilogue
 210     mov         rbp, rsp                     ; rbp = aligned rbp
 211     lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
 212     collect_args 4
 213     push        rbx
 214
 215     mov         eax, r11d               ; colctr
 216     test        rax, rax
 217     jz          near .return            ; zero-width rows: nothing to do
 218
 219     movsxd      rcx, r10d               ; rowctr (int arg: upper half of r10 is not trusted)
 220     test        rcx, rcx
 221     jz          near .return            ; no rows: nothing to do
 222
 223     mov         rsi, r12                ; input_data
 224     mov         rdi, r13
 225     mov         rdip, JSAMPARRAY [rdi]  ; output_data
 226 .rowloop:
 227     push        rax                     ; colctr
 228     push        rcx
 229     push        rdi
 230     push        rsi
 231
 232     mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
 233     mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
 234     mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
 235     mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
 236     mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 237
 238     test        rax, SIZEOF_XMMWORD-1   ; does the row end on a block boundary?
 239     jz          short .skip             ; yes: no padding samples needed
 240     push        rdx                     ; dl is used as scratch; preserve outptr0
 241     mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
 242     mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
 243     mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
 244     mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
 245     mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
 246     mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
 247     pop         rdx
 248 .skip:
 249     ; -- process the first column block
 250
 251     movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
 252     movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
 253     movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
 254
 255     pxor        xmm3, xmm3              ; xmm3=(all 0's)
 256     movdqa      xmm4, xmm0
 257     punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 258     punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 259     movdqa      xmm5, xmm1
 260     punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 261     punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 262     movdqa      xmm6, xmm2
 263     punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 264     punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 265
 266     pmullw      xmm0, [rel PW_THREE]    ; 3 * row[0]
 267     pmullw      xmm4, [rel PW_THREE]
 268
 269     pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
 270     psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask selecting the lowest word only
 271
 272     paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 273     paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 274     paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 275     paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 276
 277     movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
 278     movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
 279     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 280     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
 281
 282     pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
 283     pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
 284
 285     movdqa      XMMWORD [wk(0)], xmm1   ; word 0 stands in for word -1 (upper row)
 286     movdqa      XMMWORD [wk(1)], xmm2   ; word 0 stands in for word -1 (lower row)
 287
 288     add         rax, byte SIZEOF_XMMWORD-1
 289     and         rax, byte -SIZEOF_XMMWORD  ; colctr rounded up to whole blocks
 290     cmp         rax, byte SIZEOF_XMMWORD
 291     ja          short .columnloop       ; more than one block left
 292
 293 .columnloop_last:
 294     ; -- process the last column block
 295
 296     pcmpeqb     xmm1, xmm1              ; xmm1=(all 1's)
 297     pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask selecting the highest word only
 298     movdqa      xmm2, xmm1
 299
 300     pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 301     pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 302
 303     movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
 304     movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
 305
 306     jmp         near .upsample
 307
 308 .columnloop:
 309     ; -- process the next column block
 310
 311     movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
 312     movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
 313     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
 314
 315     pxor        xmm3, xmm3              ; xmm3=(all 0's)
 316     movdqa      xmm4, xmm0
 317     punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 318     punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 319     movdqa      xmm5, xmm1
 320     punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 321     punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 322     movdqa      xmm6, xmm2
 323     punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 324     punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 325
 326     pmullw      xmm0, [rel PW_THREE]    ; 3 * row[0]
 327     pmullw      xmm4, [rel PW_THREE]
 328
 329     paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 330     paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 331     paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 332     paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 333
 334     movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
 335     movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
 336     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 337     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
 338
 339     pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
 340     pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
 341
 342     movdqa      XMMWORD [wk(2)], xmm1   ; next block's word 0 = word 16 (upper row)
 343     movdqa      XMMWORD [wk(3)], xmm2   ; next block's word 0 = word 16 (lower row)
 344
 345 .upsample:
 346     ; -- process the upper row
 347
 348     movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
 349     movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 350
 351     movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
 352     movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
 353     psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
 354     pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
 355     movdqa      xmm5, xmm7
 356     movdqa      xmm6, xmm3
 357     psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
 358     pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
 359
 360     por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
 361     por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
 362
 363     movdqa      xmm1, xmm7
 364     movdqa      xmm2, xmm3
 365     pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
 366     psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
 367     movdqa      xmm4, xmm3
 368     psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
 369
 370     por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
 371     por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
 372
 373     movdqa      XMMWORD [wk(0)], xmm4     ; word 15 becomes word -1 of the next block
 374
 375     pmullw      xmm7, [rel PW_THREE]    ; 3 * Int0
 376     pmullw      xmm3, [rel PW_THREE]
 377     paddw       xmm1, [rel PW_EIGHT]    ; left neighbor + rounding (even outputs)
 378     paddw       xmm5, [rel PW_EIGHT]
 379     paddw       xmm0, [rel PW_SEVEN]    ; right neighbor + rounding (odd outputs)
 380     paddw       xmm2, [rel PW_SEVEN]
 381
 382     paddw       xmm1, xmm7
 383     paddw       xmm5, xmm3
 384     psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
 385     psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
 386     paddw       xmm0, xmm7
 387     paddw       xmm2, xmm3
 388     psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
 389     psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
 390
 391     psllw       xmm0, BYTE_BIT          ; odd outputs go to the high byte of each word
 392     psllw       xmm2, BYTE_BIT
 393     por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
 394     por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
 395
 396     movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; final samples replace the saved Int0
 397     movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
 398
 399     ; -- process the lower row
 400
 401     movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
 402     movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 403
 404     movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
 405     movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
 406     psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
 407     pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
 408     movdqa      xmm0, xmm6
 409     movdqa      xmm2, xmm4
 410     psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
 411     pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
 412
 413     por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
 414     por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
 415
 416     movdqa      xmm1, xmm6
 417     movdqa      xmm5, xmm4
 418     pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
 419     psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
 420     movdqa      xmm3, xmm4
 421     psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
 422
 423     por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
 424     por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
 425
 426     movdqa      XMMWORD [wk(1)], xmm3     ; word 15 becomes word -1 of the next block
 427
 428     pmullw      xmm6, [rel PW_THREE]    ; 3 * Int1
 429     pmullw      xmm4, [rel PW_THREE]
 430     paddw       xmm1, [rel PW_EIGHT]    ; left neighbor + rounding (even outputs)
 431     paddw       xmm0, [rel PW_EIGHT]
 432     paddw       xmm7, [rel PW_SEVEN]    ; right neighbor + rounding (odd outputs)
 433     paddw       xmm5, [rel PW_SEVEN]
 434
 435     paddw       xmm1, xmm6
 436     paddw       xmm0, xmm4
 437     psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
 438     psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
 439     paddw       xmm7, xmm6
 440     paddw       xmm5, xmm4
 441     psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
 442     psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
 443
 444     psllw       xmm7, BYTE_BIT          ; odd outputs go to the high byte of each word
 445     psllw       xmm5, BYTE_BIT
 446     por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
 447     por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
 448
 449     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1  ; final samples replace the saved Int1
 450     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
 451
 452     sub         rax, byte SIZEOF_XMMWORD
 453     add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
 454     add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
 455     add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
 456     add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
 457     add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
 458     cmp         rax, byte SIZEOF_XMMWORD
 459     ja          near .columnloop        ; at least two blocks remain
 460     test        rax, rax
 461     jnz         near .columnloop_last   ; exactly one block remains
 462
 463     pop         rsi
 464     pop         rdi
 465     pop         rcx
 466     pop         rax
 467
 468     add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
 469     add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
 470     sub         rcx, byte 2                  ; rowctr
 471     jg          near .rowloop
 472
 473 .return:
 474     pop         rbx
 475     uncollect_args 4
 476     mov         rsp, rbp                ; rsp <- aligned rbp
 477     pop         rsp                     ; rsp <- unaligned rsp stored by the prologue
 478     pop         rbp
 479     ret
 480
 481 ; --------------------------------------------------------------------------
 482 ;
 483 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
 484 ; It's still a box filter.
 485 ;
 486 ; GLOBAL(void)
 487 ; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
 488 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 489 ;
 490 ; Every input sample is stored twice in a row; output_width is rounded up to 2 xmmwords.
 491 ; r10 = int max_v_samp_factor
 492 ; r11d = JDIMENSION output_width
 493 ; r12 = JSAMPARRAY input_data
 494 ; r13 = JSAMPARRAY *output_data_ptr
 495 ; Only the low 32 bits of r10 and r11 are read (both C arguments are 32-bit).
 496     align       32
 497     GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
 498
 499 EXTN(jsimd_h2v1_upsample_sse2):
 500     push        rbp
 501     mov         rax, rsp                ; NOTE(review): presumably read by collect_args; confirm in jsimdext.inc
 502     mov         rbp, rsp
 503     collect_args 4
 504
 505     mov         edx, r11d               ; output_width, zero-extended
 506     add         rdx, byte (2*SIZEOF_XMMWORD)-1
 507     and         rdx, byte -(2*SIZEOF_XMMWORD)  ; rounded up to a multiple of 2 xmmwords
 508     jz          near .return            ; zero width: nothing to do
 509
 510     movsxd      rcx, r10d               ; rowctr (int arg: upper half of r10 is not trusted)
 511     test        rcx, rcx
 512     jz          short .return           ; no rows: nothing to do
 513
 514     mov         rsi, r12                ; input_data
 515     mov         rdi, r13
 516     mov         rdip, JSAMPARRAY [rdi]  ; output_data
 517 .rowloop:
 518     push        rdi
 519     push        rsi
 520
 521     mov         rsip, JSAMPROW [rsi]    ; inptr
 522     mov         rdip, JSAMPROW [rdi]    ; outptr
 523     mov         rax, rdx                ; colctr
 524 .columnloop:
 525 ; Unrolled twice: each half expands one input xmmword into two output xmmwords.
 526     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 527
 528     movdqa      xmm1, xmm0
 529     punpcklbw   xmm0, xmm0              ; low half, each byte doubled
 530     punpckhbw   xmm1, xmm1              ; high half, each byte doubled
 531
 532     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 533     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 534
 535     sub         rax, byte 2*SIZEOF_XMMWORD
 536     jz          short .nextrow          ; row finished after the first half
 537
 538     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 539
 540     movdqa      xmm3, xmm2
 541     punpcklbw   xmm2, xmm2              ; low half, each byte doubled
 542     punpckhbw   xmm3, xmm3              ; high half, each byte doubled
 543
 544     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 545     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 546
 547     sub         rax, byte 2*SIZEOF_XMMWORD
 548     jz          short .nextrow          ; row finished after the second half
 549
 550     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
 551     add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
 552     jmp         short .columnloop
 553
 554 .nextrow:
 555     pop         rsi
 556     pop         rdi
 557
 558     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
 559     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
 560     dec         rcx                        ; rowctr
 561     jg          short .rowloop
 562
 563 .return:
 564     uncollect_args 4
 565     pop         rbp
 566     ret
 567
 568 ; --------------------------------------------------------------------------
 569 ;
 570 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
 571 ; It's still a box filter.
 572 ;
 573 ; GLOBAL(void)
 574 ; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
 575 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 576 ;
 577 ; Every input sample is stored twice in a row, and each expanded row goes to two output rows.
 578 ; r10 = int max_v_samp_factor
 579 ; r11d = JDIMENSION output_width
 580 ; r12 = JSAMPARRAY input_data
 581 ; r13 = JSAMPARRAY *output_data_ptr
 582 ; Only the low 32 bits of r10 and r11 are read (both C arguments are 32-bit).
 583     align       32
 584     GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
 585
 586 EXTN(jsimd_h2v2_upsample_sse2):
 587     push        rbp
 588     mov         rax, rsp                ; NOTE(review): presumably read by collect_args; confirm in jsimdext.inc
 589     mov         rbp, rsp
 590     collect_args 4
 591     push        rbx
 592
 593     mov         edx, r11d               ; output_width, zero-extended
 594     add         rdx, byte (2*SIZEOF_XMMWORD)-1
 595     and         rdx, byte -(2*SIZEOF_XMMWORD)  ; rounded up to a multiple of 2 xmmwords
 596     jz          near .return            ; zero width: nothing to do
 597
 598     movsxd      rcx, r10d               ; rowctr (int arg: upper half of r10 is not trusted)
 599     test        rcx, rcx
 600     jz          near .return            ; no rows: nothing to do
 601
 602     mov         rsi, r12                ; input_data
 603     mov         rdi, r13
 604     mov         rdip, JSAMPARRAY [rdi]  ; output_data
 605 .rowloop:
 606     push        rdi
 607     push        rsi
 608
 609     mov         rsip, JSAMPROW [rsi]                   ; inptr
 610     mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
 611     mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
 612     mov         rax, rdx                               ; colctr
 613 .columnloop:
 614 ; Unrolled twice: each half expands one input xmmword into two xmmwords per output row.
 615     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 616
 617     movdqa      xmm1, xmm0
 618     punpcklbw   xmm0, xmm0              ; low half, each byte doubled
 619     punpckhbw   xmm1, xmm1              ; high half, each byte doubled
 620
 621     movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
 622     movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
 623     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 624     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 625
 626     sub         rax, byte 2*SIZEOF_XMMWORD
 627     jz          short .nextrow          ; row finished after the first half
 628
 629     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 630
 631     movdqa      xmm3, xmm2
 632     punpcklbw   xmm2, xmm2              ; low half, each byte doubled
 633     punpckhbw   xmm3, xmm3              ; high half, each byte doubled
 634
 635     movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
 636     movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
 637     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 638     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 639
 640     sub         rax, byte 2*SIZEOF_XMMWORD
 641     jz          short .nextrow          ; row finished after the second half
 642
 643     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
 644     add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
 645     add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
 646     jmp         short .columnloop
 647
 648 .nextrow:
 649     pop         rsi
 650     pop         rdi
 651
 652     add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
 653     add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
 654     sub         rcx, byte 2                  ; rowctr
 655     jg          near .rowloop
 656
 657 .return:
 658     pop         rbx
 659     uncollect_args 4
 660     pop         rbp
 661     ret
 662
 663 ; For some reason, the OS X linker does not honor the request to align the
 664 ; segment unless we do this.
 665     align       32