simd/x86_64/jdsample-sse2.asm

   1 ;
   2 ; jdsample.asm - upsampling (64-bit SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ;
   7 ; Based on the x86 SIMD extension for IJG JPEG library
   8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
   9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  10 ;
  11 ; This file should be assembled with NASM (Netwide Assembler),
  12 ; can *not* be assembled with Microsoft's MASM or any compatible
  13 ; assembler (including Borland's Turbo Assembler).
  14 ; NASM is available from http://nasm.sourceforge.net/ or
  15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  16 ;
  17 ; [TAB8]
  18
  19 %include "jsimdext.inc"
  20
  21 ; --------------------------------------------------------------------------
  22     SECTION     SEG_CONST
  23 ; Word constants used by the two "fancy" (triangle filter) upsamplers.
  24     alignz      32
  25     GLOBAL_DATA(jconst_fancy_upsample_sse2)
  26
  27 EXTN(jconst_fancy_upsample_sse2):
  28 ; Each entry is 8 identical 16-bit words, used as a paddw/pmullw operand.
  29 PW_ONE   times 8 dw 1           ; h2v1: bias added to even outputs before >> 2
  30 PW_TWO   times 8 dw 2           ; h2v1: bias added to odd outputs before >> 2
  31 PW_THREE times 8 dw 3           ; weight of the nearer sample (pmullw operand)
  32 PW_SEVEN times 8 dw 7           ; h2v2: bias added to odd outputs before >> 4
  33 PW_EIGHT times 8 dw 8           ; h2v2: bias added to even outputs before >> 4
  34
  35     alignz      32
  36
  37 ; --------------------------------------------------------------------------
  38     SECTION     SEG_TEXT
  39     BITS        64
  40 ;
  41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  42 ;
  43 ; The upsampling algorithm is linear interpolation between pixel centers,
  44 ; also known as a "triangle filter".  This is a good compromise between
  45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
  46 ; of the way between input pixel centers.
  47 ;
  48 ; GLOBAL(void)
  49 ; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
  50 ;                                JDIMENSION downsampled_width,
  51 ;                                JSAMPARRAY input_data,
  52 ;                                JSAMPARRAY *output_data_ptr);
  53 ; out[2i]   = (3*in[i] + in[i-1] + 1) >> 2   (in[-1] is taken as in[0])
  54 ; out[2i+1] = (3*in[i] + in[i+1] + 2) >> 2   (in[width] is taken as in[width-1])
  55 ; r10d = int max_v_samp_factor (row count; upper 32 bits of r10 are ignored)
  56 ; r11d = JDIMENSION downsampled_width
  57 ; r12 = JSAMPARRAY input_data
  58 ; r13 = JSAMPARRAY *output_data_ptr
  59 ; Rows are read and written with movdqa in whole SIZEOF_XMMWORD blocks.
  60     align       32
  61     GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
  62
  63 EXTN(jsimd_h2v1_fancy_upsample_sse2):
  64     push        rbp
  65     mov         rax, rsp                ; presumably consumed by collect_args - confirm in jsimdext.inc
  66     mov         rbp, rsp
  67     collect_args 4
  68
  69     mov         eax, r11d               ; colctr (32-bit move zero-extends into rax)
  70     test        rax, rax
  71     jz          near .return
  72
  73     movsxd      rcx, r10d               ; rowctr: int arg, upper half of r10 is undefined
  74     test        rcx, rcx
  75     jz          near .return
  76
  77     mov         rsi, r12                ; input_data
  78     mov         rdi, r13
  79     mov         rdi, JSAMPARRAY [rdi]   ; output_data
  80 .rowloop:
  81     push        rax                     ; colctr
  82     push        rdi
  83     push        rsi
  84
  85     mov         rsi, JSAMPROW [rsi]     ; inptr
  86     mov         rdi, JSAMPROW [rdi]     ; outptr
  87 ; If the width is not a whole number of blocks, replicate the last sample once.
  88     test        rax, SIZEOF_XMMWORD-1
  89     jz          short .skip
  90     mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
  91     mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
  92 .skip:
  93     pxor        xmm0, xmm0              ; xmm0=(all 0's)
  94     pcmpeqb     xmm7, xmm7              ; xmm7=(all 1's)
  95     psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; keep only the lowest byte of the mask
  96     pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm7=( 0 -- -- ... -- -- --)
  97 ; Round colctr up to whole blocks; more than one block left -> .columnloop.
  98     add         rax, byte SIZEOF_XMMWORD-1
  99     and         rax, byte -SIZEOF_XMMWORD
 100     cmp         rax, byte SIZEOF_XMMWORD
 101     ja          short .columnloop
 102 ; Last block: its own sample 15 stands in for the missing right neighbor.
 103 .columnloop_last:
 104     pcmpeqb     xmm6, xmm6
 105     pslldq      xmm6, (SIZEOF_XMMWORD-1)
 106     pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm6=(-- -- -- ... -- -- 15)
 107     jmp         short .upsample
 108 ; Inner block: the right neighbor is the first sample of the next block.
 109 .columnloop:
 110     movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 111     pslldq      xmm6, (SIZEOF_XMMWORD-1)  ; xmm6=(-- -- -- ... -- -- 16)
 112
 113 .upsample:
 114     movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 115     movdqa      xmm2, xmm1
 116     movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
 117     pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
 118     psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
 119
 120     por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
 121     por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
 122 ; Carry sample 15 forward as the left neighbor of the next block.
 123     movdqa      xmm7, xmm1
 124     psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
 125 ; Widen bytes to words so the weighted sums cannot overflow.
 126     movdqa      xmm4, xmm1
 127     punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
 128     punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
 129     movdqa      xmm5, xmm2
 130     punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
 131     punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
 132     movdqa      xmm6, xmm3
 133     punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
 134     punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
 135
 136     pmullw      xmm1, [rel PW_THREE]
 137     pmullw      xmm4, [rel PW_THREE]
 138     paddw       xmm2, [rel PW_ONE]
 139     paddw       xmm5, [rel PW_ONE]
 140     paddw       xmm3, [rel PW_TWO]
 141     paddw       xmm6, [rel PW_TWO]
 142
 143     paddw       xmm2, xmm1
 144     paddw       xmm5, xmm4
 145     psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
 146     psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
 147     paddw       xmm3, xmm1
 148     paddw       xmm6, xmm4
 149     psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
 150     psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
 151 ; Interleave: odd outputs go to the high byte of each word, even to the low.
 152     psllw       xmm3, BYTE_BIT
 153     psllw       xmm6, BYTE_BIT
 154     por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
 155     por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
 156
 157     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 158     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
 159
 160     sub         rax, byte SIZEOF_XMMWORD
 161     add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
 162     add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
 163     cmp         rax, byte SIZEOF_XMMWORD
 164     ja          near .columnloop
 165     test        eax, eax
 166     jnz         near .columnloop_last
 167
 168     pop         rsi
 169     pop         rdi
 170     pop         rax
 171
 172     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
 173     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
 174     dec         rcx                        ; rowctr
 175     jg          near .rowloop
 176
 177 .return:
 178     uncollect_args 4
 179     pop         rbp
 180     ret
 181
 182 ; --------------------------------------------------------------------------
 183 ;
 184 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 185 ; Again a triangle filter; see comments for h2v1 case, above.
 186 ;
 187 ; GLOBAL(void)
 188 ; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
 189 ;                                JDIMENSION downsampled_width,
 190 ;                                JSAMPARRAY input_data,
 191 ;                                JSAMPARRAY *output_data_ptr);
 192 ; Each .rowloop pass reads input rows -1, 0, +1 and writes two output rows.
 193 ; Vertical:   Int0 = 3*row[0] + row[-1];  Int1 = 3*row[0] + row[+1]
 194 ; Horizontal: out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
 195 ;             out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
 196 ; r10d = int max_v_samp_factor, r11d = JDIMENSION downsampled_width,
 197 ; r12 = JSAMPARRAY input_data, r13 = JSAMPARRAY *output_data_ptr
 198 ; wk(0)/wk(1) hold the left-edge word, wk(2)/wk(3) the right-edge word.
 199 %define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 200 %define WK_NUM  4
 201
 202     align       32
 203     GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
 204
 205 EXTN(jsimd_h2v2_fancy_upsample_sse2):
 206     push        rbp
 207     mov         rax, rsp                     ; rax = rsp (points at the saved rbp)
 208     sub         rsp, byte 4
 209     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
 210     mov         [rsp], rax                   ; keep the unaligned rsp for the epilogue
 211     mov         rbp, rsp                     ; rbp = aligned rbp
 212     lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
 213     collect_args 4
 214     push        rbx
 215
 216     mov         eax, r11d               ; colctr (32-bit move zero-extends into rax)
 217     test        rax, rax
 218     jz          near .return
 219
 220     movsxd      rcx, r10d               ; rowctr: int arg, upper half of r10 is undefined
 221     test        rcx, rcx
 222     jz          near .return
 223
 224     mov         rsi, r12                ; input_data
 225     mov         rdi, r13
 226     mov         rdi, JSAMPARRAY [rdi]   ; output_data
 227 .rowloop:
 228     push        rax                     ; colctr
 229     push        rcx
 230     push        rdi
 231     push        rsi
 232
 233     mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
 234     mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
 235     mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
 236     mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
 237     mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 238 ; If the width is not a whole number of blocks, replicate the last sample once.
 239     test        rax, SIZEOF_XMMWORD-1
 240     jz          short .skip
 241     push        rdx                     ; dl is used as scratch below
 242     mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
 243     mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
 244     mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
 245     mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
 246     mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
 247     mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
 248     pop         rdx
 249 .skip:
 250     ; -- process the first column block
 251
 252     movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
 253     movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
 254     movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
 255
 256     pxor        xmm3, xmm3              ; xmm3=(all 0's)
 257     movdqa      xmm4, xmm0
 258     punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 259     punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 260     movdqa      xmm5, xmm1
 261     punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 262     punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 263     movdqa      xmm6, xmm2
 264     punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 265     punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 266
 267     pmullw      xmm0, [rel PW_THREE]
 268     pmullw      xmm4, [rel PW_THREE]
 269
 270     pcmpeqb     xmm7, xmm7
 271     psrldq      xmm7, (SIZEOF_XMMWORD-2)  ; mask selecting the lowest word only
 272
 273     paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 274     paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 275     paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 276     paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 277
 278     movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
 279     movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
 280     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 281     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
 282
 283     pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
 284     pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
 285
 286     movdqa      XMMWORD [wk(0)], xmm1   ; left edge: word 0 stands in for word -1
 287     movdqa      XMMWORD [wk(1)], xmm2
 288 ; Round colctr up to whole blocks; more than one block left -> .columnloop.
 289     add         rax, byte SIZEOF_XMMWORD-1
 290     and         rax, byte -SIZEOF_XMMWORD
 291     cmp         rax, byte SIZEOF_XMMWORD
 292     ja          short .columnloop
 293
 294 .columnloop_last:
 295     ; -- process the last column block
 296
 297     pcmpeqb     xmm1, xmm1
 298     pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; mask selecting the highest word only
 299     movdqa      xmm2, xmm1
 300
 301     pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 302     pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 303
 304     movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
 305     movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
 306
 307     jmp         near .upsample
 308
 309 .columnloop:
 310     ; -- process the next column block
 311
 312     movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
 313     movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
 314     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
 315
 316     pxor        xmm3, xmm3              ; xmm3=(all 0's)
 317     movdqa      xmm4, xmm0
 318     punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 319     punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 320     movdqa      xmm5, xmm1
 321     punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 322     punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 323     movdqa      xmm6, xmm2
 324     punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 325     punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 326
 327     pmullw      xmm0, [rel PW_THREE]
 328     pmullw      xmm4, [rel PW_THREE]
 329
 330     paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 331     paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 332     paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 333     paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 334
 335     movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
 336     movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
 337     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 338     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
 339
 340     pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
 341     pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
 342
 343     movdqa      XMMWORD [wk(2)], xmm1   ; right edge: word 0 of the next block
 344     movdqa      XMMWORD [wk(3)], xmm2
 345
 346 .upsample:
 347     ; -- process the upper row
 348
 349     movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
 350     movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 351
 352     movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
 353     movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
 354     psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
 355     pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
 356     movdqa      xmm5, xmm7
 357     movdqa      xmm6, xmm3
 358     psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
 359     pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
 360
 361     por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
 362     por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
 363
 364     movdqa      xmm1, xmm7
 365     movdqa      xmm2, xmm3
 366     pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
 367     psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
 368     movdqa      xmm4, xmm3
 369     psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
 370
 371     por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
 372     por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
 373
 374     movdqa      XMMWORD [wk(0)], xmm4     ; word 15 becomes the next block's word -1
 375
 376     pmullw      xmm7, [rel PW_THREE]
 377     pmullw      xmm3, [rel PW_THREE]
 378     paddw       xmm1, [rel PW_EIGHT]
 379     paddw       xmm5, [rel PW_EIGHT]
 380     paddw       xmm0, [rel PW_SEVEN]
 381     paddw       xmm2, [rel PW_SEVEN]
 382
 383     paddw       xmm1, xmm7
 384     paddw       xmm5, xmm3
 385     psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
 386     psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
 387     paddw       xmm0, xmm7
 388     paddw       xmm2, xmm3
 389     psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
 390     psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
 391
 392     psllw       xmm0, BYTE_BIT
 393     psllw       xmm2, BYTE_BIT
 394     por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
 395     por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
 396
 397     movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
 398     movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
 399
 400     ; -- process the lower row
 401
 402     movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
 403     movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 404
 405     movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
 406     movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
 407     psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
 408     pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
 409     movdqa      xmm0, xmm6
 410     movdqa      xmm2, xmm4
 411     psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
 412     pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
 413
 414     por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
 415     por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
 416
 417     movdqa      xmm1, xmm6
 418     movdqa      xmm5, xmm4
 419     pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
 420     psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
 421     movdqa      xmm3, xmm4
 422     psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
 423
 424     por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
 425     por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
 426
 427     movdqa      XMMWORD [wk(1)], xmm3     ; word 15 becomes the next block's word -1
 428
 429     pmullw      xmm6, [rel PW_THREE]
 430     pmullw      xmm4, [rel PW_THREE]
 431     paddw       xmm1, [rel PW_EIGHT]
 432     paddw       xmm0, [rel PW_EIGHT]
 433     paddw       xmm7, [rel PW_SEVEN]
 434     paddw       xmm5, [rel PW_SEVEN]
 435
 436     paddw       xmm1, xmm6
 437     paddw       xmm0, xmm4
 438     psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
 439     psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
 440     paddw       xmm7, xmm6
 441     paddw       xmm5, xmm4
 442     psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
 443     psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
 444
 445     psllw       xmm7, BYTE_BIT
 446     psllw       xmm5, BYTE_BIT
 447     por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
 448     por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
 449
 450     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
 451     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
 452
 453     sub         rax, byte SIZEOF_XMMWORD
 454     add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
 455     add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
 456     add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
 457     add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
 458     add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
 459     cmp         rax, byte SIZEOF_XMMWORD
 460     ja          near .columnloop
 461     test        rax, rax
 462     jnz         near .columnloop_last
 463
 464     pop         rsi
 465     pop         rdi
 466     pop         rcx
 467     pop         rax
 468
 469     add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
 470     add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
 471     sub         rcx, byte 2                  ; rowctr
 472     jg          near .rowloop
 473
 474 .return:
 475     pop         rbx
 476     uncollect_args 4
 477     mov         rsp, rbp                ; rsp <- aligned rbp
 478     pop         rsp                     ; rsp <- unaligned rsp saved by the prologue
 479     pop         rbp
 480     ret
 481
 482 ; --------------------------------------------------------------------------
 483 ;
 484 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
 485 ; It's still a box filter: every input sample is stored twice in a row.
 486 ;
 487 ; GLOBAL(void)
 488 ; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
 489 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 490 ;
 491 ; output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD before use.
 492 ; r10d = int max_v_samp_factor (row count; upper 32 bits of r10 are ignored)
 493 ; r11d = JDIMENSION output_width
 494 ; r12 = JSAMPARRAY input_data
 495 ; r13 = JSAMPARRAY *output_data_ptr
 496
 497     align       32
 498     GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
 499
 500 EXTN(jsimd_h2v1_upsample_sse2):
 501     push        rbp
 502     mov         rax, rsp                ; presumably consumed by collect_args - confirm in jsimdext.inc
 503     mov         rbp, rsp
 504     collect_args 4
 505
 506     mov         edx, r11d               ; 32-bit move zero-extends the width
 507     add         rdx, byte (2*SIZEOF_XMMWORD)-1
 508     and         rdx, byte -(2*SIZEOF_XMMWORD)  ; rdx = padded output width
 509     jz          near .return
 510
 511     movsxd      rcx, r10d               ; rowctr: int arg, upper half of r10 is undefined
 512     test        rcx, rcx
 513     jz          short .return
 514
 515     mov         rsi, r12                ; input_data
 516     mov         rdi, r13
 517     mov         rdi, JSAMPARRAY [rdi]   ; output_data
 518 .rowloop:
 519     push        rdi
 520     push        rsi
 521
 522     mov         rsi, JSAMPROW [rsi]     ; inptr
 523     mov         rdi, JSAMPROW [rdi]     ; outptr
 524     mov         rax, rdx                ; colctr
 525 .columnloop:
 526 ; Unrolled twice: each half turns one input block into two output blocks.
 527     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 528
 529     movdqa      xmm1, xmm0
 530     punpcklbw   xmm0, xmm0              ; duplicate each of the low 8 samples
 531     punpckhbw   xmm1, xmm1              ; duplicate each of the high 8 samples
 532
 533     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 534     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 535
 536     sub         rax, byte 2*SIZEOF_XMMWORD
 537     jz          short .nextrow
 538
 539     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 540
 541     movdqa      xmm3, xmm2
 542     punpcklbw   xmm2, xmm2
 543     punpckhbw   xmm3, xmm3
 544
 545     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 546     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 547
 548     sub         rax, byte 2*SIZEOF_XMMWORD
 549     jz          short .nextrow
 550
 551     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
 552     add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
 553     jmp         short .columnloop
 554
 555 .nextrow:
 556     pop         rsi
 557     pop         rdi
 558
 559     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
 560     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
 561     dec         rcx                        ; rowctr
 562     jg          short .rowloop
 563
 564 .return:
 565     uncollect_args 4
 566     pop         rbp
 567     ret
 568
 569 ; --------------------------------------------------------------------------
 570 ;
 571 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
 572 ; It's still a box filter: every input sample is stored twice in two rows.
 573 ;
 574 ; GLOBAL(void)
 575 ; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
 576 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 577 ;
 578 ; output_width is rounded up to a multiple of 2*SIZEOF_XMMWORD before use.
 579 ; r10d = int max_v_samp_factor (row count; upper 32 bits of r10 are ignored)
 580 ; r11d = JDIMENSION output_width
 581 ; r12 = JSAMPARRAY input_data
 582 ; r13 = JSAMPARRAY *output_data_ptr
 583
 584     align       32
 585     GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
 586
 587 EXTN(jsimd_h2v2_upsample_sse2):
 588     push        rbp
 589     mov         rax, rsp                ; presumably consumed by collect_args - confirm in jsimdext.inc
 590     mov         rbp, rsp
 591     collect_args 4
 592     push        rbx
 593
 594     mov         edx, r11d               ; 32-bit move zero-extends the width
 595     add         rdx, byte (2*SIZEOF_XMMWORD)-1
 596     and         rdx, byte -(2*SIZEOF_XMMWORD)  ; rdx = padded output width
 597     jz          near .return
 598
 599     movsxd      rcx, r10d               ; rowctr: int arg, upper half of r10 is undefined
 600     test        rcx, rcx
 601     jz          near .return
 602
 603     mov         rsi, r12                ; input_data
 604     mov         rdi, r13
 605     mov         rdi, JSAMPARRAY [rdi]   ; output_data
 606 .rowloop:
 607     push        rdi
 608     push        rsi
 609
 610     mov         rsi, JSAMPROW [rsi]                    ; inptr
 611     mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
 612     mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 613     mov         rax, rdx                               ; colctr
 614 .columnloop:
 615 ; Unrolled twice: each half writes the same doubled block to both output rows.
 616     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 617
 618     movdqa      xmm1, xmm0
 619     punpcklbw   xmm0, xmm0              ; duplicate each of the low 8 samples
 620     punpckhbw   xmm1, xmm1              ; duplicate each of the high 8 samples
 621
 622     movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
 623     movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
 624     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 625     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 626
 627     sub         rax, byte 2*SIZEOF_XMMWORD
 628     jz          short .nextrow
 629
 630     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 631
 632     movdqa      xmm3, xmm2
 633     punpcklbw   xmm2, xmm2
 634     punpckhbw   xmm3, xmm3
 635
 636     movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
 637     movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
 638     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 639     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 640
 641     sub         rax, byte 2*SIZEOF_XMMWORD
 642     jz          short .nextrow
 643
 644     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
 645     add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
 646     add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
 647     jmp         short .columnloop
 648
 649 .nextrow:
 650     pop         rsi
 651     pop         rdi
 652
 653     add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
 654     add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
 655     sub         rcx, byte 2                  ; rowctr (two output rows per pass)
 656     jg          near .rowloop
 657
 658 .return:
 659     pop         rbx
 660     uncollect_args 4
 661     pop         rbp
 662     ret
 663
 664 ; Pad the end of the text section to a 32-byte boundary.  For some reason,
 665 ; the OS X linker does not honor the segment alignment request otherwise.
 666     align       32