2 ; jcsample.asm - downsampling (64-bit AVX2)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright (C) 2009, 2016, D. R. Commander.
6 ; Copyright (C) 2015, Intel Corporation.
8 ; Based on the x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
12 ; This file should be assembled with NASM (Netwide Assembler),
13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
20 %include "jsimdext.inc"
22 ; --------------------------------------------------------------------------
26 ; Downsample pixel values of a single component.
27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical, without smoothing.
31 ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
32 ; JDIMENSION v_samp_factor,
33 ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
34 ; JSAMPARRAY output_data);
37 ; r10d = JDIMENSION image_width
38 ; r11 = int max_v_samp_factor
39 ; r12d = JDIMENSION v_samp_factor
40 ; r13d = JDIMENSION width_in_blocks
41 ; r14 = JSAMPARRAY input_data
42 ; r15 = JSAMPARRAY output_data
; jsimd_h2v1_downsample_avx2: 2:1 horizontal, 1:1 vertical downsampling.
; Arguments are expected in r10d..r15 as listed in the register table that
; precedes this routine.
;
; NOTE(review): the text of this routine looks incomplete in this copy.
; The branch targets .columnloop, .downsample and .columnloop_r24 are used
; below but no definitions of them are visible, and neither a prologue nor
; a ret is visible.  Confirm against a known-good copy before assembling.
45 GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
47 EXTN(jsimd_h2v1_downsample_avx2):
; NOTE(review): the instruction that loads rcx before this shift is not
; visible; presumably rcx holds width_in_blocks (r13d) here -- confirm.
54 shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
59 ; -- expand_right_edge
; Twice the output column count, per the comment on the shift below.
62 shl rcx, 1 ; output_cols * 2
; rsi walks the array of input row pointers.
71 mov rsi, r14 ; input_data
; rdi = pointer to the current input row.
76 mov rdi, JSAMPROW [rsi]
; al = the sample located immediately before rdi.
; NOTE(review): nothing visible here offsets rdi into the row or stores al;
; the instructions doing so are presumably missing -- confirm.
78 mov al, JSAMPLE [rdi-1]
; Step to the next input row pointer.
85 add rsi, byte SIZEOF_JSAMPROW
; eax = v_samp_factor, used as the row counter.
94 mov eax, r12d ; rowctr
; NOTE(review): no instruction moving this pattern from rdx into xmm7 is
; visible; the shuffle below operates on whatever xmm7 already contains.
98 mov rdx, 0x00010000 ; bias pattern
; Immediate 0x00 replicates dword 0 of xmm7 into all four dwords.
100 vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
; Immediate 0 copies the low 128-bit lane into both lanes of ymm7.
101 vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7}
; Comparing a register with itself sets every bit; the word-wise logical
; right shift then leaves only the low bits of each word set (mask).
102 vpcmpeqw ymm6, ymm6, ymm6
103 vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
; rsi / rdi walk the input and output row-pointer arrays.
105 mov rsi, r14 ; input_data
106 mov rdi, r15 ; output_data
; Dereference the row-pointer arrays to get the current sample rows.
112 mov rsi, JSAMPROW [rsi] ; inptr
113 mov rdi, JSAMPROW [rdi] ; outptr
; Unsigned compare: with at least SIZEOF_YMMWORD columns left, take the
; full-width path.
115 cmp rcx, byte SIZEOF_YMMWORD
116 jae short .columnloop
; Tail handling.  Three load variants follow; each forces rcx to
; SIZEOF_YMMWORD (so the later subtraction brings it to zero) and then
; jumps to .downsample.
; NOTE(review): the compares and labels selecting between these variants
; are not visible; presumably they correspond to rcx = 24, 16 and 8 in
; that order -- confirm.
119 ; rcx can possibly be 8, 16, 24
; Variant 1: one full YMMWORD plus one XMMWORD.  A VEX-encoded 128-bit load
; zeroes the upper lane of ymm1.
122 vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
123 vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
124 mov rcx, SIZEOF_YMMWORD
125 jmp short .downsample
; Variant 2: one full YMMWORD; the second input register is zeroed.
130 vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
131 vpxor ymm1, ymm1, ymm1
132 mov rcx, SIZEOF_YMMWORD
133 jmp short .downsample
; Variant 3: one XMMWORD only (upper lane of ymm0 is zeroed by the VEX
; load); the second input register is zeroed.
136 vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
137 vpxor ymm1, ymm1, ymm1
138 mov rcx, SIZEOF_YMMWORD
139 jmp short .downsample
; Full-width path: load two consecutive YMMWORDs of input samples.
142 vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
143 vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
; Split every 16-bit word into its high part (shifted down into ymm2/ymm3)
; and its low part (masked in place with ymm6).
146 vpsrlw ymm2, ymm0, BYTE_BIT
147 vpand ymm0, ymm0, ymm6
148 vpsrlw ymm3, ymm1, BYTE_BIT
149 vpand ymm1, ymm1, ymm6
; Word-wise sum of the two parts, then add the bias pattern held in ymm7.
151 vpaddw ymm0, ymm0, ymm2
152 vpaddw ymm1, ymm1, ymm3
153 vpaddw ymm0, ymm0, ymm7
154 vpaddw ymm1, ymm1, ymm7
; NOTE(review): no halving shift is visible between the bias add and the
; pack below; presumably one belongs here -- confirm.
; vpackuswb narrows words to unsigned-saturated bytes within each 128-bit
; lane, giving qwords {ymm0.lo, ymm1.lo, ymm0.hi, ymm1.hi}; vpermq 0xd8
; (qword order 0,2,1,3) restores linear order.
158 vpackuswb ymm0, ymm0, ymm1
159 vpermq ymm0, ymm0, 0xd8
; Store one YMMWORD of output samples.
161 vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
; One YMMWORD of output consumes two YMMWORDs of input.
163 sub rcx, byte SIZEOF_YMMWORD ; outcol
164 add rsi, byte 2*SIZEOF_YMMWORD ; inptr
165 add rdi, byte 1*SIZEOF_YMMWORD ; outptr
; Keep looping while a full YMMWORD of output columns remains (unsigned).
166 cmp rcx, byte SIZEOF_YMMWORD
167 jae short .columnloop
; NOTE(review): as written, this jnz tests the flags left by the cmp above;
; a zero test of rcx is presumably intended just before it -- confirm.
169 jnz near .columnloop_r24
; Advance to the next entries of the input and output row-pointer arrays.
175 add rsi, byte SIZEOF_JSAMPROW ; input_data
176 add rdi, byte SIZEOF_JSAMPROW ; output_data
186 ; --------------------------------------------------------------------------
188 ; Downsample pixel values of a single component.
189 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical, without smoothing.
193 ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
194 ; JDIMENSION v_samp_factor,
195 ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
196 ; JSAMPARRAY output_data);
199 ; r10d = JDIMENSION image_width
200 ; r11 = int max_v_samp_factor
201 ; r12d = JDIMENSION v_samp_factor
202 ; r13d = JDIMENSION width_in_blocks
203 ; r14 = JSAMPARRAY input_data
204 ; r15 = JSAMPARRAY output_data
; jsimd_h2v2_downsample_avx2: 2:1 horizontal, 2:1 vertical downsampling.
; Arguments are expected in r10d..r15 as listed in the register table that
; precedes this routine.
;
; NOTE(review): the text of this routine looks incomplete in this copy.
; The branch targets .expand_end, .columnloop, .downsample and
; .columnloop_r24 are used below but no definitions of them are visible,
; and neither a prologue nor a ret is visible.  Confirm against a
; known-good copy before assembling.
207 GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
209 EXTN(jsimd_h2v2_downsample_avx2):
; NOTE(review): the instruction that loads rcx before this shift is not
; visible; presumably rcx holds width_in_blocks (r13d) here -- confirm.
216 shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
221 ; -- expand_right_edge
; Twice the output column count, per the comment on the shift below.
224 shl rcx, 1 ; output_cols * 2
; NOTE(review): two back-to-back jle with nothing between them; as written
; the second is redundant, so the instructions that set the flags for each
; test are presumably missing -- confirm.
226 jle short .expand_end
230 jle short .expand_end
; rsi walks the array of input row pointers.
233 mov rsi, r14 ; input_data
; rdi = pointer to the current input row.
238 mov rdi, JSAMPROW [rsi]
; al = the sample located immediately before rdi.
; NOTE(review): nothing visible here offsets rdi into the row or stores al;
; the instructions doing so are presumably missing -- confirm.
240 mov al, JSAMPLE [rdi-1]
; Step to the next input row pointer.
247 add rsi, byte SIZEOF_JSAMPROW
; NOTE(review): no matching push of rcx is visible in this copy.
252 pop rcx ; output_cols
; eax = v_samp_factor, used as the row counter.
256 mov eax, r12d ; rowctr
; NOTE(review): no instruction moving this pattern from rdx into xmm7 is
; visible; the shuffle below operates on whatever xmm7 already contains.
260 mov rdx, 0x00020001 ; bias pattern
; Comparing a register with itself sets every bit of ymm6.
262 vpcmpeqw ymm6, ymm6, ymm6
; Immediate 0x00 replicates dword 0 of xmm7 into all four dwords; the
; vperm2i128 with immediate 0 then copies that lane into both lanes of
; ymm7.
263 vpshufd xmm7, xmm7, 0x00 ; ymm7={1, 2, 1, 2, 1, 2, 1, 2}
264 vperm2i128 ymm7, ymm7, ymm7, 0
; Word-wise logical right shift of the all-ones register leaves only the
; low bits of each word set (mask).
265 vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
; rsi / rdi walk the input and output row-pointer arrays.
267 mov rsi, r14 ; input_data
268 mov rdi, r15 ; output_data
; Fetch two consecutive input rows and one output row.  rdx is reused here
; as inptr0 (its bias-pattern value is no longer needed); it must be loaded
; before rsi is overwritten with inptr1.
274 mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
275 mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
276 mov rdi, JSAMPROW [rdi] ; outptr
; Unsigned compare: with at least SIZEOF_YMMWORD columns left, take the
; full-width path.
278 cmp rcx, byte SIZEOF_YMMWORD
279 jae short .columnloop
; Tail handling.  Three load variants follow; each forces rcx to
; SIZEOF_YMMWORD (so the later subtraction brings it to zero) and then
; jumps to .downsample.
; NOTE(review): the compares and labels selecting between these variants
; are not visible in this copy -- confirm which remainder each one serves.
; Variant 1: one full YMMWORD plus one XMMWORD from each row.  A
; VEX-encoded 128-bit load zeroes the upper lane of ymm2 / ymm3.
284 vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
285 vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
286 vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
287 vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
288 mov rcx, SIZEOF_YMMWORD
289 jmp short .downsample
; Variant 2: one full YMMWORD from each row; second-half registers zeroed.
294 vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
295 vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
296 vpxor ymm2, ymm2, ymm2
297 vpxor ymm3, ymm3, ymm3
298 mov rcx, SIZEOF_YMMWORD
299 jmp short .downsample
; Variant 3: one XMMWORD from each row (upper lanes of ymm0 / ymm1 are
; zeroed by the VEX loads); second-half registers zeroed.
302 vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
303 vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
304 vpxor ymm2, ymm2, ymm2
305 vpxor ymm3, ymm3, ymm3
306 mov rcx, SIZEOF_YMMWORD
307 jmp short .downsample
; Full-width path: two consecutive YMMWORDs from each of the two rows
; (ymm0/ymm2 from inptr0, ymm1/ymm3 from inptr1).
310 vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
311 vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
312 vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
313 vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]
; First YMMWORD of each row: for every 16-bit word, add its masked low part
; (ymm4/ymm5) to its shifted-down high part.
316 vpand ymm4, ymm0, ymm6
317 vpsrlw ymm0, ymm0, BYTE_BIT
318 vpand ymm5, ymm1, ymm6
319 vpsrlw ymm1, ymm1, BYTE_BIT
320 vpaddw ymm0, ymm0, ymm4
321 vpaddw ymm1, ymm1, ymm5
; Second YMMWORD of each row: same treatment; ymm4/ymm5 are reused as
; scratch.
323 vpand ymm4, ymm2, ymm6
324 vpsrlw ymm2, ymm2, BYTE_BIT
325 vpand ymm5, ymm3, ymm6
326 vpsrlw ymm3, ymm3, BYTE_BIT
327 vpaddw ymm2, ymm2, ymm4
328 vpaddw ymm3, ymm3, ymm5
; Combine the two rows, then add the bias pattern held in ymm7.
330 vpaddw ymm0, ymm0, ymm1
331 vpaddw ymm2, ymm2, ymm3
332 vpaddw ymm0, ymm0, ymm7
333 vpaddw ymm2, ymm2, ymm7
; NOTE(review): no scaling shift is visible between the bias add and the
; pack below; presumably one belongs here -- confirm.
; vpackuswb narrows words to unsigned-saturated bytes within each 128-bit
; lane, giving qwords {ymm0.lo, ymm2.lo, ymm0.hi, ymm2.hi}; vpermq 0xd8
; (qword order 0,2,1,3) restores linear order.
337 vpackuswb ymm0, ymm0, ymm2
338 vpermq ymm0, ymm0, 0xd8
; Store one YMMWORD of output samples.
340 vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0
; One YMMWORD of output consumes two YMMWORDs from each input row.
342 sub rcx, byte SIZEOF_YMMWORD ; outcol
343 add rdx, byte 2*SIZEOF_YMMWORD ; inptr0
344 add rsi, byte 2*SIZEOF_YMMWORD ; inptr1
345 add rdi, byte 1*SIZEOF_YMMWORD ; outptr
; NOTE(review): no loop-back branch is visible after this cmp, and the jnz
; below tests the flags this cmp leaves; a jae to .columnloop and a zero
; test of rcx are presumably missing -- confirm.
346 cmp rcx, byte SIZEOF_YMMWORD
349 jnz near .columnloop_r24
; Two input rows are consumed per output row.
355 add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
356 add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
366 ; For some reason, the OS X linker does not honor the request to align the
367 ; segment unless we do this.