simd/x86_64/jcsample-sse2.asm

   1 ;
   2 ; jcsample.asm - downsampling (64-bit SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ; Copyright (C) 2018, Matthias Räncker.
   7 ;
   8 ; Based on the x86 SIMD extension for IJG JPEG library
   9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11 ;
  12 ; This file should be assembled with NASM (Netwide Assembler),
  13 ; can *not* be assembled with Microsoft's MASM or any compatible
  14 ; assembler (including Borland's Turbo Assembler).
  15 ; NASM is available from http://nasm.sourceforge.net/ or
  16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17
  18 %include "jsimdext.inc"
  19
  20 ; --------------------------------------------------------------------------
  21     SECTION     SEG_TEXT                ; SEG_TEXT is defined outside this file (presumably the code section; see jsimdext.inc)
  22     BITS        64                      ; generate 64-bit mode code from here on
  23 ;
  24 ; Downsample pixel values of a single component.
  25 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
  26 ; without smoothing: out[i] = (in[2i] + in[2i+1] + bias) >> 1, bias = 0,1,0,1,...
  27 ;
  28 ; GLOBAL(void)
  29 ; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
  30 ;                            JDIMENSION v_samp_factor,
  31 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
  32 ;                            JSAMPARRAY output_data);
  33 ; Side effect: max_v_samp_factor input rows are right-padded in place.  Rows are
  34 ; accessed with movdqa, so they must be 16-byte aligned.  After collect_args:
  35 ; r10d = JDIMENSION image_width
  36 ; r11d = int max_v_samp_factor (only the low 32 bits of r11 are defined)
  37 ; r12d = JDIMENSION v_samp_factor
  38 ; r13d = JDIMENSION width_in_blocks
  39 ; r14 = JSAMPARRAY input_data
  40 ; r15 = JSAMPARRAY output_data
  41
  42     align       32
  43     GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
  44
  45 EXTN(jsimd_h2v1_downsample_sse2):
  46     push        rbp
  47     mov         rbp, rsp
  48     collect_args 6
  49
  50     mov         ecx, r13d
  51     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
  52     jz          near .return            ; width_in_blocks == 0: nothing to do
  53
  54     mov         edx, r10d               ; rdx = image_width (zero-extended)
  55
  56     ; -- expand_right_edge
  57
  58     push        rcx                     ; save output_cols
  59     shl         rcx, 1                  ; output_cols * 2
  60     sub         rcx, rdx                ; rcx = samples to pad per input row
  61     jle         short .expand_end       ; rows are already wide enough
  62
  63     mov         eax, r11d               ; int argument: read only the low 32 bits,
  64     test        eax, eax                ;   the upper half of r11 is unspecified
  65     jle         short .expand_end       ; max_v_samp_factor <= 0: no rows to pad
  66
  67     cld                                 ; rep stosb must fill toward higher addresses
  68     mov         rsi, r14                ; input_data
  69 .expandloop:
  70     push        rax                     ; row counter (al is reused as fill value)
  71     push        rcx                     ; pad count (rep stosb consumes rcx)
  72
  73     mov         rdip, JSAMPROW [rsi]    ; row pointer (rdip is defined outside this file)
  74     add         rdi, rdx                ; rdi = &row[image_width]
  75     mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
  76
  77     rep stosb                           ; replicate it rcx times
  78
  79     pop         rcx
  80     pop         rax
  81
  82     add         rsi, byte SIZEOF_JSAMPROW  ; next input row pointer
  83     dec         rax
  84     jg          short .expandloop
  85
  86 .expand_end:
  87     pop         rcx                     ; output_cols
  88
  89     ; -- h2v1_downsample
  90
  91     mov         eax, r12d               ; rowctr
  92     test        eax, eax
  93     jle         near .return            ; zero (or sign bit set): no rows to process
  94     ; NOTE(review): xmm6/xmm7 are callee-saved on Win64; presumably saved by collect_args -- confirm.
  95     mov         rdx, 0x00010000         ; bias pattern
  96     movd        xmm7, edx
  97     pcmpeqw     xmm6, xmm6              ; all ones
  98     pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
  99     psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
 100
 101     mov         rsi, r14                ; input_data
 102     mov         rdi, r15                ; output_data
 103 .rowloop:
 104     push        rcx                     ; save output_cols
 105     push        rdi                     ; save output_data cursor
 106     push        rsi                     ; save input_data cursor
 107
 108     mov         rsip, JSAMPROW [rsi]    ; inptr
 109     mov         rdip, JSAMPROW [rdi]    ; outptr
 110
 111     cmp         rcx, byte SIZEOF_XMMWORD
 112     jae         short .columnloop       ; at least one full XMMWORD of output left
 113     ; Tail: fewer than SIZEOF_XMMWORD output samples remain in this row.
 114 .columnloop_r8:
 115     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 116     pxor        xmm1, xmm1              ; second input XMMWORD is taken as zero
 117     mov         rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx == 0
 118     jmp         short .downsample
 119     ; Main case: two XMMWORDs of input produce one XMMWORD of output.
 120 .columnloop:
 121     movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 122     movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 123
 124 .downsample:
 125     movdqa      xmm2, xmm0
 126     movdqa      xmm3, xmm1
 127
 128     pand        xmm0, xmm6              ; even-indexed samples as words
 129     psrlw       xmm2, BYTE_BIT          ; odd-indexed samples as words
 130     pand        xmm1, xmm6
 131     psrlw       xmm3, BYTE_BIT
 132
 133     paddw       xmm0, xmm2              ; in[2i] + in[2i+1]
 134     paddw       xmm1, xmm3
 135     paddw       xmm0, xmm7              ; + alternating bias
 136     paddw       xmm1, xmm7
 137     psrlw       xmm0, 1                 ; / 2
 138     psrlw       xmm1, 1
 139
 140     packuswb    xmm0, xmm1              ; words -> 16 output bytes
 141     ; The store is always a full XMMWORD, also in the tail case (upper half is zero).
 142     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 143
 144     sub         rcx, byte SIZEOF_XMMWORD    ; outcol
 145     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
 146     add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
 147     cmp         rcx, byte SIZEOF_XMMWORD
 148     jae         short .columnloop
 149     test        rcx, rcx
 150     jnz         short .columnloop_r8    ; partial XMMWORD left: handle the tail
 151
 152     pop         rsi
 153     pop         rdi
 154     pop         rcx
 155
 156     add         rsi, byte SIZEOF_JSAMPROW  ; input_data
 157     add         rdi, byte SIZEOF_JSAMPROW  ; output_data
 158     dec         rax                        ; rowctr
 159     jg          near .rowloop
 160
 161 .return:
 162     uncollect_args 6
 163     pop         rbp
 164     ret
 165
 166 ; --------------------------------------------------------------------------
 167 ;
 168 ; Downsample pixel values of a single component.
 169 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
 170 ; without smoothing: out = (sum of a 2x2 input block + bias) >> 2, bias = 1,2,1,2,...
 171 ;
 172 ; GLOBAL(void)
 173 ; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
 174 ;                            JDIMENSION v_samp_factor,
 175 ;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
 176 ;                            JSAMPARRAY output_data);
 177 ; Side effect: max_v_samp_factor input rows are right-padded in place.  Rows are
 178 ; accessed with movdqa, so they must be 16-byte aligned.  After collect_args:
 179 ; r10d = JDIMENSION image_width
 180 ; r11d = int max_v_samp_factor (only the low 32 bits of r11 are defined)
 181 ; r12d = JDIMENSION v_samp_factor
 182 ; r13d = JDIMENSION width_in_blocks
 183 ; r14 = JSAMPARRAY input_data
 184 ; r15 = JSAMPARRAY output_data
 185
 186     align       32
 187     GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
 188
 189 EXTN(jsimd_h2v2_downsample_sse2):
 190     push        rbp
 191     mov         rbp, rsp
 192     collect_args 6
 193
 194     mov         ecx, r13d
 195     shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
 196     jz          near .return            ; width_in_blocks == 0: nothing to do
 197
 198     mov         edx, r10d               ; rdx = image_width (zero-extended)
 199
 200     ; -- expand_right_edge
 201
 202     push        rcx                     ; save output_cols
 203     shl         rcx, 1                  ; output_cols * 2
 204     sub         rcx, rdx                ; rcx = samples to pad per input row
 205     jle         short .expand_end       ; rows are already wide enough
 206
 207     mov         eax, r11d               ; int argument: read only the low 32 bits,
 208     test        eax, eax                ;   the upper half of r11 is unspecified
 209     jle         short .expand_end       ; max_v_samp_factor <= 0: no rows to pad
 210
 211     cld                                 ; rep stosb must fill toward higher addresses
 212     mov         rsi, r14                ; input_data
 213 .expandloop:
 214     push        rax                     ; row counter (al is reused as fill value)
 215     push        rcx                     ; pad count (rep stosb consumes rcx)
 216
 217     mov         rdip, JSAMPROW [rsi]    ; row pointer (rdip is defined outside this file)
 218     add         rdi, rdx                ; rdi = &row[image_width]
 219     mov         al, JSAMPLE [rdi-1]     ; al = last real sample of the row
 220
 221     rep stosb                           ; replicate it rcx times
 222
 223     pop         rcx
 224     pop         rax
 225
 226     add         rsi, byte SIZEOF_JSAMPROW  ; next input row pointer
 227     dec         rax
 228     jg          short .expandloop
 229
 230 .expand_end:
 231     pop         rcx                     ; output_cols
 232
 233     ; -- h2v2_downsample
 234
 235     mov         eax, r12d               ; rowctr
 236     test        rax, rax
 237     jle         near .return            ; rax is zero-extended, so this fires only on 0
 238     ; NOTE(review): xmm6/xmm7 are callee-saved on Win64; presumably saved by collect_args -- confirm.
 239     mov         rdx, 0x00020001         ; bias pattern
 240     movd        xmm7, edx
 241     pcmpeqw     xmm6, xmm6              ; all ones
 242     pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
 243     psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
 244
 245     mov         rsi, r14                ; input_data
 246     mov         rdi, r15                ; output_data
 247 .rowloop:
 248     push        rcx                     ; save output_cols
 249     push        rdi                     ; save output_data cursor
 250     push        rsi                     ; save input_data cursor
 251
 252     mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
 253     mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
 254     mov         rdip, JSAMPROW [rdi]                    ; outptr
 255
 256     cmp         rcx, byte SIZEOF_XMMWORD
 257     jae         short .columnloop       ; at least one full XMMWORD of output left
 258     ; Tail: fewer than SIZEOF_XMMWORD output samples remain in this row pair.
 259 .columnloop_r8:
 260     movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
 261     movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 262     pxor        xmm2, xmm2              ; second XMMWORD of each row is taken
 263     pxor        xmm3, xmm3              ;   as zero
 264     mov         rcx, SIZEOF_XMMWORD     ; so the sub below leaves rcx == 0
 265     jmp         short .downsample
 266     ; Main case: two XMMWORDs from each of the two rows give one XMMWORD of output.
 267 .columnloop:
 268     movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
 269     movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
 270     movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
 271     movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
 272
 273 .downsample:
 274     movdqa      xmm4, xmm0
 275     movdqa      xmm5, xmm1
 276     pand        xmm0, xmm6              ; even-indexed samples as words
 277     psrlw       xmm4, BYTE_BIT          ; odd-indexed samples as words
 278     pand        xmm1, xmm6
 279     psrlw       xmm5, BYTE_BIT
 280     paddw       xmm0, xmm4              ; horizontal pair sums, row 0
 281     paddw       xmm1, xmm5              ; horizontal pair sums, row 1
 282
 283     movdqa      xmm4, xmm2
 284     movdqa      xmm5, xmm3
 285     pand        xmm2, xmm6
 286     psrlw       xmm4, BYTE_BIT
 287     pand        xmm3, xmm6
 288     psrlw       xmm5, BYTE_BIT
 289     paddw       xmm2, xmm4              ; same for the second XMMWORD of row 0
 290     paddw       xmm3, xmm5              ; same for the second XMMWORD of row 1
 291
 292     paddw       xmm0, xmm1              ; add the two rows: 2x2 block sums
 293     paddw       xmm2, xmm3
 294     paddw       xmm0, xmm7              ; + alternating bias
 295     paddw       xmm2, xmm7
 296     psrlw       xmm0, 2                 ; / 4
 297     psrlw       xmm2, 2
 298
 299     packuswb    xmm0, xmm2              ; words -> 16 output bytes
 300     ; The store is always a full XMMWORD, also in the tail case (upper half is zero).
 301     movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
 302
 303     sub         rcx, byte SIZEOF_XMMWORD    ; outcol
 304     add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
 305     add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
 306     add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
 307     cmp         rcx, byte SIZEOF_XMMWORD
 308     jae         near .columnloop
 309     test        rcx, rcx
 310     jnz         near .columnloop_r8     ; partial XMMWORD left: handle the tail
 311
 312     pop         rsi
 313     pop         rdi
 314     pop         rcx
 315
 316     add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data
 317     add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
 318     dec         rax                          ; rowctr
 319     jg          near .rowloop
 320
 321 .return:
 322     uncollect_args 6
 323     pop         rbp
 324     ret
 325
 326 ; For some reason, the OS X linker does not honor the request to align the
 327 ; segment unless we do this.
 328     align       32                      ; pad the end of this section to a 32-byte boundary