simd/x86_64/jdsample-sse2.asm

   1 ;
   2 ; jdsample.asm - upsampling (64-bit SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ; Copyright (C) 2018, Matthias Räncker.
   7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
   8 ;
   9 ; Based on the x86 SIMD extension for IJG JPEG library
  10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  12 ;
  13 ; This file should be assembled with NASM (Netwide Assembler); it
  14 ; can *not* be assembled with Microsoft's MASM or any compatible
  15 ; assembler (including Borland's Turbo Assembler).
  16 ; NASM is available from http://nasm.sourceforge.net/ or
  17 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  18
  19 %include "jsimdext.inc"
  20
  21 ; --------------------------------------------------------------------------
  22     SECTION     SEG_CONST
  23
     ; Constant table shared by the two "fancy" (triangle filter) upsamplers
     ; in this file.  Every entry is eight 16-bit words holding the same value,
     ; and is used directly as the memory operand of pmullw/paddw.
  24     alignz      32
  25     GLOBAL_DATA(jconst_fancy_upsample_sse2)
  26
  27 EXTN(jconst_fancy_upsample_sse2):
  28
  29 PW_ONE   times 8 dw 1                  ; h2v1: bias added to the even outputs before >> 2
  30 PW_TWO   times 8 dw 2                  ; h2v1: bias added to the odd outputs before >> 2
  31 PW_THREE times 8 dw 3                  ; weight of the nearer sample (h2v1 and h2v2)
  32 PW_SEVEN times 8 dw 7                  ; h2v2: bias added to the odd outputs before >> 4
  33 PW_EIGHT times 8 dw 8                  ; h2v2: bias added to the even outputs before >> 4
  34
  35     alignz      32
  36
  37 ; --------------------------------------------------------------------------
     ; Everything below this point is 64-bit code.
  38     SECTION     SEG_TEXT
  39     BITS        64
  40 ;
  41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
  42 ;
  43 ; The upsampling algorithm is linear interpolation between pixel centers,
  44 ; also known as a "triangle filter".  This is a good compromise between
  45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
  46 ; of the way between input pixel centers.
  47 ;
     ; For every input sample s[i] of a row the code below produces
     ;   out[2*i]   = (3 * s[i] + s[i-1] + 1) >> 2
     ;   out[2*i+1] = (3 * s[i] + s[i+1] + 2) >> 2
     ; The first sample of a row is used as its own left neighbor and the last
     ; sample as its own right neighbor.
     ;
     ; Rows are processed one XMM word of input at a time with aligned loads and
     ; stores (movdqa).  When the width is not a multiple of SIZEOF_XMMWORD, the
     ; byte just past the last input sample is overwritten with a copy of that
     ; sample, so the input row is written to as well as read.
     ;
  48 ; GLOBAL(void)
  49 ; jsimd_h2v1_fancy_upsample_sse2(int max_v_samp_factor,
  50 ;                                JDIMENSION downsampled_width,
  51 ;                                JSAMPARRAY input_data,
  52 ;                                JSAMPARRAY *output_data_ptr);
  53 ;
  54
     ; Only the low 32 bits of the two integer arguments are read: the calling
     ; convention does not define the upper half of a register that carries a
     ; 32-bit argument.
  55 ; r10d = int max_v_samp_factor
  56 ; r11d = JDIMENSION downsampled_width
  57 ; r12 = JSAMPARRAY input_data
  58 ; r13 = JSAMPARRAY *output_data_ptr
  59
  60     align       32
  61     GLOBAL_FUNCTION(jsimd_h2v1_fancy_upsample_sse2)
  62
  63 EXTN(jsimd_h2v1_fancy_upsample_sse2):
  64     push        rbp
  65     mov         rbp, rsp
  66     collect_args 4
  67
  68     mov         eax, r11d               ; colctr
  69     test        rax, rax
  70     jz          near .return            ; nothing to do for a zero-width row
  71
  72     movsxd      rcx, r10d               ; rowctr (sign-extended from the 32-bit int)
  73     test        rcx, rcx
  74     jz          near .return
  75
  76     mov         rsi, r12                ; input_data
  77     mov         rdi, r13
  78     mov         rdip, JSAMPARRAY [rdi]  ; output_data
  79 .rowloop:
         ; rax/rdi/rsi are reused as per-row cursors, so keep the row-level
         ; values on the stack until the row is finished.
  80     push        rax                     ; colctr
  81     push        rdi
  82     push        rsi
  83
  84     mov         rsip, JSAMPROW [rsi]    ; inptr
  85     mov         rdip, JSAMPROW [rdi]    ; outptr
  86
         ; If the width is not a whole number of XMM words, replicate the last
         ; sample one position to the right so that it becomes its own right
         ; neighbor.
  87     test        rax, SIZEOF_XMMWORD-1
  88     jz          short .skip
  89     mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
  90     mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
  91 .skip:
  92     pxor        xmm0, xmm0              ; xmm0=(all 0's)
         ; xmm7 = sample 0 in byte lane 0, zero elsewhere: the "sample -1"
         ; used for the first block of the row.
  93     pcmpeqb     xmm7, xmm7
  94     psrldq      xmm7, (SIZEOF_XMMWORD-1)
  95     pand        xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]
  96
         ; Round colctr up to a multiple of SIZEOF_XMMWORD.  More than one
         ; block left means the right neighbor comes from the next block.
  97     add         rax, byte SIZEOF_XMMWORD-1
  98     and         rax, byte -SIZEOF_XMMWORD
  99     cmp         rax, byte SIZEOF_XMMWORD
 100     ja          short .columnloop
 101
 102 .columnloop_last:
         ; Last block: xmm6 = sample 15 of this block kept in lane 15, so the
         ; final sample is paired with itself.
 103     pcmpeqb     xmm6, xmm6
 104     pslldq      xmm6, (SIZEOF_XMMWORD-1)
 105     pand        xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 106     jmp         short .upsample
 107
 108 .columnloop:
         ; Inner block: xmm6 = first sample of the next block moved to lane 15.
 109     movdqa      xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 110     pslldq      xmm6, (SIZEOF_XMMWORD-1)
 111
 112 .upsample:
 113     movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 114     movdqa      xmm2, xmm1
 115     movdqa      xmm3, xmm1                ; xmm1=( 0  1  2 ... 13 14 15)
 116     pslldq      xmm2, 1                   ; xmm2=(--  0  1 ... 12 13 14)
 117     psrldq      xmm3, 1                   ; xmm3=( 1  2  3 ... 14 15 --)
 118
 119     por         xmm2, xmm7                ; xmm2=(-1  0  1 ... 12 13 14)
 120     por         xmm3, xmm6                ; xmm3=( 1  2  3 ... 14 15 16)
 121
         ; Carry sample 15 of this block over as "sample -1" of the next one.
 122     movdqa      xmm7, xmm1
 123     psrldq      xmm7, (SIZEOF_XMMWORD-1)  ; xmm7=(15 -- -- ... -- -- --)
 124
         ; Widen bytes to 16-bit words (xmm0 is zero) so the sums cannot overflow.
 125     movdqa      xmm4, xmm1
 126     punpcklbw   xmm1, xmm0                ; xmm1=( 0  1  2  3  4  5  6  7)
 127     punpckhbw   xmm4, xmm0                ; xmm4=( 8  9 10 11 12 13 14 15)
 128     movdqa      xmm5, xmm2
 129     punpcklbw   xmm2, xmm0                ; xmm2=(-1  0  1  2  3  4  5  6)
 130     punpckhbw   xmm5, xmm0                ; xmm5=( 7  8  9 10 11 12 13 14)
 131     movdqa      xmm6, xmm3
 132     punpcklbw   xmm3, xmm0                ; xmm3=( 1  2  3  4  5  6  7  8)
 133     punpckhbw   xmm6, xmm0                ; xmm6=( 9 10 11 12 13 14 15 16)
 134
         ; center * 3, left neighbor + 1, right neighbor + 2
 135     pmullw      xmm1, [rel PW_THREE]
 136     pmullw      xmm4, [rel PW_THREE]
 137     paddw       xmm2, [rel PW_ONE]
 138     paddw       xmm5, [rel PW_ONE]
 139     paddw       xmm3, [rel PW_TWO]
 140     paddw       xmm6, [rel PW_TWO]
 141
 142     paddw       xmm2, xmm1
 143     paddw       xmm5, xmm4
 144     psrlw       xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
 145     psrlw       xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
 146     paddw       xmm3, xmm1
 147     paddw       xmm6, xmm4
 148     psrlw       xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
 149     psrlw       xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)
 150
         ; Interleave: even outputs stay in the low byte of each word, odd
         ; outputs are moved to the high byte and OR-ed in.
 151     psllw       xmm3, BYTE_BIT
 152     psllw       xmm6, BYTE_BIT
 153     por         xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
 154     por         xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)
 155
 156     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 157     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5
 158
         ; One input block consumed, two output blocks produced.
 159     sub         rax, byte SIZEOF_XMMWORD
 160     add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr
 161     add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr
 162     cmp         rax, byte SIZEOF_XMMWORD
 163     ja          near .columnloop
 164     test        eax, eax
 165     jnz         near .columnloop_last       ; exactly one block left
 166
 167     pop         rsi
 168     pop         rdi
 169     pop         rax
 170
 171     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
 172     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
 173     dec         rcx                        ; rowctr
 174     jg          near .rowloop
 175
 176 .return:
 177     uncollect_args 4
 178     pop         rbp
 179     ret
 180
 181 ; --------------------------------------------------------------------------
 182 ;
 183 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
 184 ; Again a triangle filter; see comments for h2v1 case, above.
 185 ;
     ; Each loop iteration reads three input rows (the current row plus the rows
     ; above and below it, i.e. input_data[-1] is accessed for the first row) and
     ; writes two output rows.  The filter is applied in two passes:
     ;   vertical:    Int0[i] = 3 * row0[i] + above[i]
     ;                Int1[i] = 3 * row0[i] + below[i]        (16-bit words)
     ;   horizontal:  out[2*i]   = (3 * Int[i] + Int[i-1] + 8) >> 4
     ;                out[2*i+1] = (3 * Int[i] + Int[i+1] + 7) >> 4
     ; The 16-bit intermediate values are parked in the output rows themselves
     ; before being replaced by the final bytes.  At the row ends a column is
     ; used as its own missing neighbor.
     ;
     ; All row accesses use aligned loads/stores (movdqa).  When the width is not
     ; a multiple of SIZEOF_XMMWORD, one byte past the last sample of each of the
     ; three input rows is overwritten with a copy of that sample.
     ;
 186 ; GLOBAL(void)
 187 ; jsimd_h2v2_fancy_upsample_sse2(int max_v_samp_factor,
 188 ;                                JDIMENSION downsampled_width,
 189 ;                                JSAMPARRAY input_data,
 190 ;                                JSAMPARRAY *output_data_ptr);
 191 ;
 192
     ; Only the low 32 bits of the two integer arguments are read: the calling
     ; convention does not define the upper half of a register that carries a
     ; 32-bit argument.
 193 ; r10d = int max_v_samp_factor
 194 ; r11d = JDIMENSION downsampled_width
 195 ; r12 = JSAMPARRAY input_data
 196 ; r13 = JSAMPARRAY *output_data_ptr
 197
     ; Scratch slots in the aligned stack area just below r15:
     ;   wk(0)/wk(1) = left-neighbor column for the upper/lower output row
     ;   wk(2)/wk(3) = right-neighbor column for the upper/lower output row
 198 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
 199 %define WK_NUM  4
 200
 201     align       32
 202     GLOBAL_FUNCTION(jsimd_h2v2_fancy_upsample_sse2)
 203
 204 EXTN(jsimd_h2v2_fancy_upsample_sse2):
 205     push        rbp
 206     mov         rbp, rsp
 207     push        r15                          ; saved at [rbp-8], restored in .return
 208     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
 209     ; Allocate stack space for wk array.  r15 is used to access it.
 210     mov         r15, rsp
 211     sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
 212     collect_args 4
 213     push        rbx
 214
 215     mov         eax, r11d               ; colctr
 216     test        rax, rax
 217     jz          near .return
 218
 219     movsxd      rcx, r10d               ; rowctr (sign-extended from the 32-bit int)
 220     test        rcx, rcx
 221     jz          near .return
 222
 223     mov         rsi, r12                ; input_data
 224     mov         rdi, r13
 225     mov         rdip, JSAMPARRAY [rdi]  ; output_data
 226 .rowloop:
         ; rax, rcx, rdi and rsi are all reused inside the row, so park the
         ; row-level values on the stack.
 227     push        rax                     ; colctr
 228     push        rcx
 229     push        rdi
 230     push        rsi
 231
 232     mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
 233     mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
 234     mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
 235     mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
 236     mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 237
         ; Width not a whole number of XMM words: replicate the last sample of
         ; each of the three input rows one position to the right.  dl is the
         ; scratch byte, hence the save/restore of rdx (outptr0).
 238     test        rax, SIZEOF_XMMWORD-1
 239     jz          short .skip
 240     push        rdx
 241     mov         dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
 242     mov         JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
 243     mov         dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
 244     mov         JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
 245     mov         dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
 246     mov         JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
 247     pop         rdx
 248 .skip:
 249     ; -- process the first column block
 250
 251     movdqa      xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]  ; xmm0=row[ 0][0]
 252     movdqa      xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]  ; xmm1=row[-1][0]
 253     movdqa      xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; xmm2=row[+1][0]
 254
 255     pxor        xmm3, xmm3              ; xmm3=(all 0's)
 256     movdqa      xmm4, xmm0
 257     punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 258     punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 259     movdqa      xmm5, xmm1
 260     punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 261     punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 262     movdqa      xmm6, xmm2
 263     punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 264     punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 265
 266     pmullw      xmm0, [rel PW_THREE]
 267     pmullw      xmm4, [rel PW_THREE]
 268
         ; xmm7 = mask selecting the lowest 16-bit word only
 269     pcmpeqb     xmm7, xmm7
 270     psrldq      xmm7, (SIZEOF_XMMWORD-2)
 271
 272     paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 273     paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 274     paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 275     paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 276
 277     movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1  ; temporarily save
 278     movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5  ; the intermediate data
 279     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
 280     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6
 281
         ; Column 0 is its own left neighbor on the first block.
 282     pand        xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
 283     pand        xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)
 284
 285     movdqa      XMMWORD [wk(0)], xmm1
 286     movdqa      XMMWORD [wk(1)], xmm2
 287
         ; Round colctr up to a multiple of SIZEOF_XMMWORD.  More than one
         ; block left means the right neighbor comes from the next block.
 288     add         rax, byte SIZEOF_XMMWORD-1
 289     and         rax, byte -SIZEOF_XMMWORD
 290     cmp         rax, byte SIZEOF_XMMWORD
 291     ja          short .columnloop
 292
 293 .columnloop_last:
 294     ; -- process the last column block
 295
         ; Column 15 of this block (kept in the top word) is its own right
         ; neighbor.
 296     pcmpeqb     xmm1, xmm1
 297     pslldq      xmm1, (SIZEOF_XMMWORD-2)
 298     movdqa      xmm2, xmm1
 299
 300     pand        xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 301     pand        xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 302
 303     movdqa      XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
 304     movdqa      XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)
 305
 306     jmp         near .upsample
 307
 308 .columnloop:
 309     ; -- process the next column block
 310
         ; Run the vertical pass one block ahead so that the current block's
         ; right neighbor (column 0 of the next block) is available.
 311     movdqa      xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]  ; xmm0=row[ 0][1]
 312     movdqa      xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]  ; xmm1=row[-1][1]
 313     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]  ; xmm2=row[+1][1]
 314
 315     pxor        xmm3, xmm3              ; xmm3=(all 0's)
 316     movdqa      xmm4, xmm0
 317     punpcklbw   xmm0, xmm3              ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
 318     punpckhbw   xmm4, xmm3              ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
 319     movdqa      xmm5, xmm1
 320     punpcklbw   xmm1, xmm3              ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
 321     punpckhbw   xmm5, xmm3              ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
 322     movdqa      xmm6, xmm2
 323     punpcklbw   xmm2, xmm3              ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
 324     punpckhbw   xmm6, xmm3              ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
 325
 326     pmullw      xmm0, [rel PW_THREE]
 327     pmullw      xmm4, [rel PW_THREE]
 328
 329     paddw       xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
 330     paddw       xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
 331     paddw       xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
 332     paddw       xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
 333
 334     movdqa      XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1  ; temporarily save
 335     movdqa      XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5  ; the intermediate data
 336     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 337     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6
 338
 339     pslldq      xmm1, (SIZEOF_XMMWORD-2)  ; xmm1=(-- -- -- -- -- -- --  0)
 340     pslldq      xmm2, (SIZEOF_XMMWORD-2)  ; xmm2=(-- -- -- -- -- -- --  0)
 341
 342     movdqa      XMMWORD [wk(2)], xmm1
 343     movdqa      XMMWORD [wk(3)], xmm2
 344
 345 .upsample:
 346     ; -- process the upper row
 347
         ; Reload the intermediate words of the current block from outptr0.
 348     movdqa      xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
 349     movdqa      xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 350
 351     movdqa      xmm0, xmm7                ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
 352     movdqa      xmm4, xmm3                ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
 353     psrldq      xmm0, 2                   ; xmm0=( 1  2  3  4  5  6  7 --)
 354     pslldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(-- -- -- -- -- -- --  8)
 355     movdqa      xmm5, xmm7
 356     movdqa      xmm6, xmm3
 357     psrldq      xmm5, (SIZEOF_XMMWORD-2)  ; xmm5=( 7 -- -- -- -- -- -- --)
 358     pslldq      xmm6, 2                   ; xmm6=(--  8  9 10 11 12 13 14)
 359
 360     por         xmm0, xmm4                ; xmm0=( 1  2  3  4  5  6  7  8)
 361     por         xmm5, xmm6                ; xmm5=( 7  8  9 10 11 12 13 14)
 362
 363     movdqa      xmm1, xmm7
 364     movdqa      xmm2, xmm3
 365     pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
 366     psrldq      xmm2, 2                   ; xmm2=( 9 10 11 12 13 14 15 --)
 367     movdqa      xmm4, xmm3
 368     psrldq      xmm4, (SIZEOF_XMMWORD-2)  ; xmm4=(15 -- -- -- -- -- -- --)
 369
 370     por         xmm1, XMMWORD [wk(0)]     ; xmm1=(-1  0  1  2  3  4  5  6)
 371     por         xmm2, XMMWORD [wk(2)]     ; xmm2=( 9 10 11 12 13 14 15 16)
 372
         ; Column 15 of this block is the left neighbor for the next block.
 373     movdqa      XMMWORD [wk(0)], xmm4
 374
         ; center * 3, left neighbors + 8, right neighbors + 7
 375     pmullw      xmm7, [rel PW_THREE]
 376     pmullw      xmm3, [rel PW_THREE]
 377     paddw       xmm1, [rel PW_EIGHT]
 378     paddw       xmm5, [rel PW_EIGHT]
 379     paddw       xmm0, [rel PW_SEVEN]
 380     paddw       xmm2, [rel PW_SEVEN]
 381
 382     paddw       xmm1, xmm7
 383     paddw       xmm5, xmm3
 384     psrlw       xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
 385     psrlw       xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
 386     paddw       xmm0, xmm7
 387     paddw       xmm2, xmm3
 388     psrlw       xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
 389     psrlw       xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
 390
         ; Interleave: odd outputs go to the high byte of each word.
 391     psllw       xmm0, BYTE_BIT
 392     psllw       xmm2, BYTE_BIT
 393     por         xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
 394     por         xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)
 395
         ; The final bytes replace the intermediate words in outptr0.
 396     movdqa      XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
 397     movdqa      XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5
 398
 399     ; -- process the lower row
 400
         ; Same horizontal pass on the intermediates parked in outptr1, using
         ; wk(1)/wk(3) as the left/right neighbor columns.
 401     movdqa      xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
 402     movdqa      xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]
 403
 404     movdqa      xmm7, xmm6                ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
 405     movdqa      xmm3, xmm4                ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
 406     psrldq      xmm7, 2                   ; xmm7=( 1  2  3  4  5  6  7 --)
 407     pslldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(-- -- -- -- -- -- --  8)
 408     movdqa      xmm0, xmm6
 409     movdqa      xmm2, xmm4
 410     psrldq      xmm0, (SIZEOF_XMMWORD-2)  ; xmm0=( 7 -- -- -- -- -- -- --)
 411     pslldq      xmm2, 2                   ; xmm2=(--  8  9 10 11 12 13 14)
 412
 413     por         xmm7, xmm3                ; xmm7=( 1  2  3  4  5  6  7  8)
 414     por         xmm0, xmm2                ; xmm0=( 7  8  9 10 11 12 13 14)
 415
 416     movdqa      xmm1, xmm6
 417     movdqa      xmm5, xmm4
 418     pslldq      xmm1, 2                   ; xmm1=(--  0  1  2  3  4  5  6)
 419     psrldq      xmm5, 2                   ; xmm5=( 9 10 11 12 13 14 15 --)
 420     movdqa      xmm3, xmm4
 421     psrldq      xmm3, (SIZEOF_XMMWORD-2)  ; xmm3=(15 -- -- -- -- -- -- --)
 422
 423     por         xmm1, XMMWORD [wk(1)]     ; xmm1=(-1  0  1  2  3  4  5  6)
 424     por         xmm5, XMMWORD [wk(3)]     ; xmm5=( 9 10 11 12 13 14 15 16)
 425
 426     movdqa      XMMWORD [wk(1)], xmm3
 427
 428     pmullw      xmm6, [rel PW_THREE]
 429     pmullw      xmm4, [rel PW_THREE]
 430     paddw       xmm1, [rel PW_EIGHT]
 431     paddw       xmm0, [rel PW_EIGHT]
 432     paddw       xmm7, [rel PW_SEVEN]
 433     paddw       xmm5, [rel PW_SEVEN]
 434
 435     paddw       xmm1, xmm6
 436     paddw       xmm0, xmm4
 437     psrlw       xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
 438     psrlw       xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
 439     paddw       xmm7, xmm6
 440     paddw       xmm5, xmm4
 441     psrlw       xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
 442     psrlw       xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
 443
 444     psllw       xmm7, BYTE_BIT
 445     psllw       xmm5, BYTE_BIT
 446     por         xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
 447     por         xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)
 448
 449     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
 450     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0
 451
         ; One input block consumed per row, two output blocks produced per row.
 452     sub         rax, byte SIZEOF_XMMWORD
 453     add         rcx, byte 1*SIZEOF_XMMWORD  ; inptr1(above)
 454     add         rbx, byte 1*SIZEOF_XMMWORD  ; inptr0
 455     add         rsi, byte 1*SIZEOF_XMMWORD  ; inptr1(below)
 456     add         rdx, byte 2*SIZEOF_XMMWORD  ; outptr0
 457     add         rdi, byte 2*SIZEOF_XMMWORD  ; outptr1
 458     cmp         rax, byte SIZEOF_XMMWORD
 459     ja          near .columnloop
 460     test        rax, rax
 461     jnz         near .columnloop_last       ; exactly one block left
 462
 463     pop         rsi
 464     pop         rdi
 465     pop         rcx
 466     pop         rax
 467
         ; One input row yields two output rows, so rowctr drops by 2.
 468     add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
 469     add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
 470     sub         rcx, byte 2                  ; rowctr
 471     jg          near .rowloop
 472
 473 .return:
 474     pop         rbx
 475     uncollect_args 4
 476     lea         rsp, [rbp-8]            ; discard the aligned wk area; rsp -> saved r15
 477     pop         r15
 478     pop         rbp
 479     ret
 480
 481 ; --------------------------------------------------------------------------
 482 ;
 483 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
 484 ; It's still a box filter.
 485 ;
     ; Every input sample is simply written twice: out[2*i] = out[2*i+1] = in[i].
     ; The column count is output_width rounded up to a multiple of
     ; 2*SIZEOF_XMMWORD, so whole XMM words are always read and written with
     ; aligned accesses (movdqa), possibly beyond output_width.
     ;
 486 ; GLOBAL(void)
 487 ; jsimd_h2v1_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
 488 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 489 ;
 490
     ; Only the low 32 bits of the two integer arguments are read: the calling
     ; convention does not define the upper half of a register that carries a
     ; 32-bit argument.
 491 ; r10d = int max_v_samp_factor
 492 ; r11d = JDIMENSION output_width
 493 ; r12 = JSAMPARRAY input_data
 494 ; r13 = JSAMPARRAY *output_data_ptr
 495
 496     align       32
 497     GLOBAL_FUNCTION(jsimd_h2v1_upsample_sse2)
 498
 499 EXTN(jsimd_h2v1_upsample_sse2):
 500     push        rbp
 501     mov         rbp, rsp
 502     collect_args 4
 503
         ; rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD;
         ; zero only when output_width is zero.
 504     mov         edx, r11d
 505     add         rdx, byte (2*SIZEOF_XMMWORD)-1
 506     and         rdx, byte -(2*SIZEOF_XMMWORD)
 507     jz          near .return
 508
 509     movsxd      rcx, r10d               ; rowctr (sign-extended from the 32-bit int)
 510     test        rcx, rcx
 511     jz          short .return
 512
 513     mov         rsi, r12                ; input_data
 514     mov         rdi, r13
 515     mov         rdip, JSAMPARRAY [rdi]  ; output_data
 516 .rowloop:
         ; rsi/rdi become per-row sample cursors; keep the row-array cursors.
 517     push        rdi
 518     push        rsi
 519
 520     mov         rsip, JSAMPROW [rsi]    ; inptr
 521     mov         rdip, JSAMPROW [rdi]    ; outptr
 522     mov         rax, rdx                ; colctr
 523 .columnloop:
         ; The loop body is unrolled twice; each half turns one XMM word of
         ; input into two XMM words of output and then checks for the row end.
 524
 525     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 526
         ; Unpacking a register with itself duplicates every byte.
 527     movdqa      xmm1, xmm0
 528     punpcklbw   xmm0, xmm0
 529     punpckhbw   xmm1, xmm1
 530
 531     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 532     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 533
 534     sub         rax, byte 2*SIZEOF_XMMWORD
 535     jz          short .nextrow
 536
 537     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 538
 539     movdqa      xmm3, xmm2
 540     punpcklbw   xmm2, xmm2
 541     punpckhbw   xmm3, xmm3
 542
 543     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 544     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 545
 546     sub         rax, byte 2*SIZEOF_XMMWORD
 547     jz          short .nextrow
 548
         ; The cursors are advanced only after both halves have run.
 549     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
 550     add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr
 551     jmp         short .columnloop
 552
 553 .nextrow:
 554     pop         rsi
 555     pop         rdi
 556
 557     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
 558     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
 559     dec         rcx                        ; rowctr
 560     jg          short .rowloop
 561
 562 .return:
 563     uncollect_args 4
 564     pop         rbp
 565     ret
 566
 567 ; --------------------------------------------------------------------------
 568 ;
 569 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
 570 ; It's still a box filter.
 571 ;
     ; Every input sample is written twice horizontally and the resulting row is
     ; stored to two consecutive output rows.  The column count is output_width
     ; rounded up to a multiple of 2*SIZEOF_XMMWORD, so whole XMM words are
     ; always read and written with aligned accesses (movdqa), possibly beyond
     ; output_width.
     ;
 572 ; GLOBAL(void)
 573 ; jsimd_h2v2_upsample_sse2(int max_v_samp_factor, JDIMENSION output_width,
 574 ;                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr);
 575 ;
 576
     ; Only the low 32 bits of the two integer arguments are read: the calling
     ; convention does not define the upper half of a register that carries a
     ; 32-bit argument.
 577 ; r10d = int max_v_samp_factor
 578 ; r11d = JDIMENSION output_width
 579 ; r12 = JSAMPARRAY input_data
 580 ; r13 = JSAMPARRAY *output_data_ptr
 581
 582     align       32
 583     GLOBAL_FUNCTION(jsimd_h2v2_upsample_sse2)
 584
 585 EXTN(jsimd_h2v2_upsample_sse2):
 586     push        rbp
 587     mov         rbp, rsp
 588     collect_args 4
 589     push        rbx                     ; rbx holds outptr0 below
 590
         ; rdx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD;
         ; zero only when output_width is zero.
 591     mov         edx, r11d
 592     add         rdx, byte (2*SIZEOF_XMMWORD)-1
 593     and         rdx, byte -(2*SIZEOF_XMMWORD)
 594     jz          near .return
 595
 596     movsxd      rcx, r10d               ; rowctr (sign-extended from the 32-bit int)
 597     test        rcx, rcx
 598     jz          near .return
 599
 600     mov         rsi, r12                ; input_data
 601     mov         rdi, r13
 602     mov         rdip, JSAMPARRAY [rdi]  ; output_data
 603 .rowloop:
         ; rsi/rdi become per-row sample cursors; keep the row-array cursors.
 604     push        rdi
 605     push        rsi
 606
 607     mov         rsip, JSAMPROW [rsi]                   ; inptr
 608     mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
 609     mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
 610     mov         rax, rdx                               ; colctr
 611 .columnloop:
         ; The loop body is unrolled twice; each half turns one XMM word of
         ; input into two XMM words of output, stored to both output rows.
 612
 613     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 614
         ; Unpacking a register with itself duplicates every byte.
 615     movdqa      xmm1, xmm0
 616     punpcklbw   xmm0, xmm0
 617     punpckhbw   xmm1, xmm1
 618
 619     movdqa      XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
 620     movdqa      XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
 621     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 622     movdqa      XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1
 623
 624     sub         rax, byte 2*SIZEOF_XMMWORD
 625     jz          short .nextrow
 626
 627     movdqa      xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 628
 629     movdqa      xmm3, xmm2
 630     punpcklbw   xmm2, xmm2
 631     punpckhbw   xmm3, xmm3
 632
 633     movdqa      XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
 634     movdqa      XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
 635     movdqa      XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
 636     movdqa      XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3
 637
 638     sub         rax, byte 2*SIZEOF_XMMWORD
 639     jz          short .nextrow
 640
         ; The cursors are advanced only after both halves have run.
 641     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
 642     add         rbx, byte 4*SIZEOF_XMMWORD  ; outptr0
 643     add         rdi, byte 4*SIZEOF_XMMWORD  ; outptr1
 644     jmp         short .columnloop
 645
 646 .nextrow:
 647     pop         rsi
 648     pop         rdi
 649
         ; One input row yields two output rows, so rowctr drops by 2.
 650     add         rsi, byte 1*SIZEOF_JSAMPROW  ; input_data
 651     add         rdi, byte 2*SIZEOF_JSAMPROW  ; output_data
 652     sub         rcx, byte 2                  ; rowctr
 653     jg          near .rowloop
 654
 655 .return:
 656     pop         rbx
 657     uncollect_args 4
 658     pop         rbp
 659     ret
 660
 661 ; For some reason, the OS X linker does not honor the request to align the
 662 ; segment unless we do this.
     ; (This pads the end of the section to a 32-byte boundary.)
 663     align       32