2 ; jcsample.asm - downsampling (64-bit AVX2)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright (C) 2009, 2016, D. R. Commander.
6 ; Copyright (C) 2015, Intel Corporation.
7 ; Copyright (C) 2018, Matthias Räncker.
9 ; Based on the x86 SIMD extension for IJG JPEG library
10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
13 ; This file should be assembled with NASM (Netwide Assembler); it
14 ; can *not* be assembled with Microsoft's MASM or any compatible
15 ; assembler (including Borland's Turbo Assembler).
16 ; NASM is available from http://nasm.sourceforge.net/ or
17 ; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jsimdext.inc"
21 ; --------------------------------------------------------------------------
25 ; Downsample pixel values of a single component.
26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical.
30 ; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
31 ; JDIMENSION v_samp_factor,
32 ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
33 ; JSAMPARRAY output_data);
36 ; r10d = JDIMENSION image_width
37 ; r11 = int max_v_samp_factor
38 ; r12d = JDIMENSION v_samp_factor
39 ; r13d = JDIMENSION width_in_blocks
40 ; r14 = JSAMPARRAY input_data
41 ; r15 = JSAMPARRAY output_data
; jsimd_h2v1_downsample_avx2: per row, sums each horizontally adjacent pair
; of input samples (plus a bias) as 16-bit words, packs the results back to
; bytes and stores one YMMWORD of output per two YMMWORDs of input.
;
; NOTE(review): this listing is an excerpt.  The embedded source line numbers
; skip, so the prologue/epilogue, every local label, and several instructions
; (including the ones that set flags for some conditional jumps) are not
; visible here.  Remarks worded "presumably" describe code that is not shown
; and should be confirmed against the complete file.
44 GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)
46 EXTN(jsimd_h2v1_downsample_avx2):
; The load of rcx is not shown; presumably it holds width_in_blocks (r13d).
53 shl rcx, 3 ; rcx *= 8: imul rcx,DCTSIZE (rcx = output_cols)
58 ; -- expand_right_edge
; NOTE(review): only the skeleton of the edge expansion is visible (row
; pointer fetch, read of one sample, advance to the next row); the column
; count computation, the fill itself and the loop control are omitted.
61 shl rcx, 1 ; output_cols * 2 (two input columns per output column)
70 mov rsi, r14 ; rsi = input_data (array of row pointers)
75 mov rdip, JSAMPROW [rsi] ; rdi = current input row pointer
; Reads the sample one byte below rdi.  Presumably rdi was first advanced to
; the end of the real image data, making this the last real sample that is
; replicated into the padding -- the adjustment is not shown.
77 mov al, JSAMPLE [rdi-1]
84 add rsi, byte SIZEOF_JSAMPROW ; step to the next row pointer
; -- constant setup for the downsampling loops
93 mov eax, r12d ; rowctr (32-bit move zero-extends into rax)
; Viewed as two 16-bit words the pattern is {0, 1} (low word first).
97 mov rdx, 0x00010000 ; bias pattern
; The transfer of the pattern into xmm7 is not shown (presumably from edx).
; vpshufd with selector 0x00 replicates dword 0 into all four dwords.
99 vpshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
100 vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7} (low lane copied to both lanes)
101 vpcmpeqw ymm6, ymm6, ymm6 ; ymm6 = all ones
; Shifting each all-ones word right by BYTE_BIT leaves only its low byte set,
; giving a mask that keeps the low (even-indexed) byte of every word.
102 vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
104 mov rsi, r14 ; input_data
105 mov rdi, r15 ; output_data
; Dereference the row-pointer arrays.  rsi/rdi are overwritten here, so the
; array positions are presumably saved by omitted push instructions.
111 mov rsip, JSAMPROW [rsi] ; inptr
112 mov rdip, JSAMPROW [rdi] ; outptr
; At least one full output vector left?  Then take the full-width loop
; (unsigned comparison).
114 cmp rcx, byte SIZEOF_YMMWORD
115 jae short .columnloop
118 ; rcx can possibly be 8, 16, 24
; The three short blocks below load a partial final vector.  Their labels are
; not shown; the first is presumably .columnloop_r24 (target of the jnz near
; the end of the column loop) and the others the 16- and 8-column cases.
; Each sets rcx to SIZEOF_YMMWORD so that the subtraction after the store
; leaves rcx at zero and the column loop terminates.
;
; One full YMMWORD plus one XMMWORD of input.  The VEX-encoded xmm load also
; clears the upper half of ymm1.
121 vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
122 vmovdqu xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]
123 mov rcx, SIZEOF_YMMWORD
124 jmp short .downsample
; One full YMMWORD of input; the second input vector is zero.
129 vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
130 vpxor ymm1, ymm1, ymm1
131 mov rcx, SIZEOF_YMMWORD
132 jmp short .downsample
; One XMMWORD of input (upper half of ymm0 cleared by the VEX-encoded load);
; the second input vector is zero.
135 vmovdqu xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]
136 vpxor ymm1, ymm1, ymm1
137 mov rcx, SIZEOF_YMMWORD
138 jmp short .downsample
; Full-width case: two YMMWORDs of input produce one YMMWORD of output.
; (Presumably this is the .columnloop entry; the label is not shown.)
141 vmovdqu ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
142 vmovdqu ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]
; Split every 16-bit word into its two bytes and add them, so each word ends
; up holding the sum of one horizontal sample pair.
; (Presumably the .downsample label sits here; it is not shown.)
145 vpsrlw ymm2, ymm0, BYTE_BIT ; ymm2 = high byte of each word (odd samples)
146 vpand ymm0, ymm0, ymm6 ; ymm0 = low byte of each word (even samples)
147 vpsrlw ymm3, ymm1, BYTE_BIT ; same split for the second input vector
148 vpand ymm1, ymm1, ymm6
150 vpaddw ymm0, ymm0, ymm2 ; pair sums, first input vector
151 vpaddw ymm1, ymm1, ymm3 ; pair sums, second input vector
152 vpaddw ymm0, ymm0, ymm7 ; add the alternating bias
153 vpaddw ymm1, ymm1, ymm7
; NOTE(review): no shift that halves the biased sums is visible before the
; pack below; presumably it is on the omitted source lines -- confirm.
;
; vpackuswb narrows words to bytes with unsigned saturation, but works within
; each 128-bit lane, so the quadwords come out ordered
; {ymm0 low, ymm1 low, ymm0 high, ymm1 high}.  vpermq with 0xd8 selects
; quadwords 0,2,1,3, restoring sequential order.
157 vpackuswb ymm0, ymm0, ymm1
158 vpermq ymm0, ymm0, 0xd8
160 vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 ; store one vector of output
162 sub rcx, byte SIZEOF_YMMWORD ; outcol
163 add rsi, byte 2*SIZEOF_YMMWORD ; inptr
164 add rdi, byte 1*SIZEOF_YMMWORD ; outptr
; Loop again while a full output vector remains.
165 cmp rcx, byte SIZEOF_YMMWORD
166 jae short .columnloop
; Nonzero remainder -> partial-vector handling.  The instruction that sets
; the flags for this jump (presumably a zero test of rcx) is not shown.
168 jnz near .columnloop_r24
; Advance to the next row pointer in each array.  rsi/rdi were used as sample
; pointers above, so they are presumably restored by omitted pops first.
174 add rsi, byte SIZEOF_JSAMPROW ; input_data
175 add rdi, byte SIZEOF_JSAMPROW ; output_data
185 ; --------------------------------------------------------------------------
187 ; Downsample pixel values of a single component.
188 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical.
192 ; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
193 ; JDIMENSION v_samp_factor,
194 ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
195 ; JSAMPARRAY output_data);
198 ; r10d = JDIMENSION image_width
199 ; r11 = int max_v_samp_factor
200 ; r12d = JDIMENSION v_samp_factor
201 ; r13d = JDIMENSION width_in_blocks
202 ; r14 = JSAMPARRAY input_data
203 ; r15 = JSAMPARRAY output_data
; jsimd_h2v2_downsample_avx2: per output row, reads two input rows, sums each
; 2x2 group of samples (plus a bias) as 16-bit words, packs the results back
; to bytes and stores one YMMWORD of output per two YMMWORDs of each input row.
;
; NOTE(review): this listing is an excerpt.  The embedded source line numbers
; skip, so the prologue/epilogue, every local label, and several instructions
; (including the ones that set flags for the conditional jumps) are not
; visible here.  Remarks worded "presumably" describe code that is not shown
; and should be confirmed against the complete file.
206 GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)
208 EXTN(jsimd_h2v2_downsample_avx2):
; The load of rcx is not shown; presumably it holds width_in_blocks (r13d).
215 shl rcx, 3 ; rcx *= 8: imul rcx,DCTSIZE (rcx = output_cols)
220 ; -- expand_right_edge
; NOTE(review): only the skeleton of the edge expansion is visible; the
; column count computation, the fill itself and the loop control are omitted.
223 shl rcx, 1 ; output_cols * 2 (two input columns per output column)
; Two early exits past the edge expansion, each taken on a signed
; "less than or equal" result.  The instructions producing those flags are
; not shown.
225 jle short .expand_end
229 jle short .expand_end
232 mov rsi, r14 ; rsi = input_data (array of row pointers)
237 mov rdip, JSAMPROW [rsi] ; rdi = current input row pointer
; Reads the sample one byte below rdi.  Presumably rdi was first advanced to
; the end of the real image data, making this the last real sample that is
; replicated into the padding -- the adjustment is not shown.
239 mov al, JSAMPLE [rdi-1]
246 add rsi, byte SIZEOF_JSAMPROW ; step to the next row pointer
; Restores rcx from the stack; the matching push is not shown.
251 pop rcx ; output_cols
; -- constant setup for the downsampling loops
255 mov eax, r12d ; rowctr (32-bit move zero-extends into rax)
; Viewed as two 16-bit words the pattern is {1, 2} (low word first).
259 mov rdx, 0x00020001 ; bias pattern
261 vpcmpeqw ymm6, ymm6, ymm6 ; ymm6 = all ones
; The transfer of the pattern into xmm7 is not shown (presumably from edx).
; vpshufd with selector 0x00 replicates dword 0 into all four dwords.
262 vpshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
263 vperm2i128 ymm7, ymm7, ymm7, 0 ; ymm7={xmm7, xmm7} (low lane copied to both lanes)
; Shifting each all-ones word right by BYTE_BIT leaves only its low byte set,
; giving a mask that keeps the low (even-indexed) byte of every word.
264 vpsrlw ymm6, ymm6, BYTE_BIT ; ymm6={0xFF 0x00 0xFF 0x00 ..}
266 mov rsi, r14 ; input_data
267 mov rdi, r15 ; output_data
; Fetch two consecutive input row pointers and one output row pointer.  rsi
; is overwritten last because both input loads use it as their base.  rdx is
; reused here; it held the bias pattern above.
273 mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
274 mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
275 mov rdip, JSAMPROW [rdi] ; outptr
; At least one full output vector left?  Then take the full-width loop
; (unsigned comparison).
277 cmp rcx, byte SIZEOF_YMMWORD
278 jae short .columnloop
; The three short blocks below load a partial final vector from both rows.
; Their labels are not shown; the first is presumably .columnloop_r24 (target
; of the jnz near the end of the column loop).  Each sets rcx to
; SIZEOF_YMMWORD so that the subtraction after the store leaves rcx at zero.
;
; One full YMMWORD plus one XMMWORD per row.  The VEX-encoded xmm loads also
; clear the upper halves of ymm2/ymm3.
283 vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
284 vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
285 vmovdqu xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]
286 vmovdqu xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]
287 mov rcx, SIZEOF_YMMWORD
288 jmp short .downsample
; One full YMMWORD per row; the second input vectors are zero.
293 vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
294 vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
295 vpxor ymm2, ymm2, ymm2
296 vpxor ymm3, ymm3, ymm3
297 mov rcx, SIZEOF_YMMWORD
298 jmp short .downsample
; One XMMWORD per row (upper halves of ymm0/ymm1 cleared by the VEX-encoded
; loads); the second input vectors are zero.
301 vmovdqu xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
302 vmovdqu xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
303 vpxor ymm2, ymm2, ymm2
304 vpxor ymm3, ymm3, ymm3
305 mov rcx, SIZEOF_YMMWORD
306 jmp short .downsample
; Full-width case: two YMMWORDs from each input row produce one YMMWORD of
; output.  (Presumably this is the .columnloop entry; the label is not shown.)
309 vmovdqu ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD] ; row 0, first vector
310 vmovdqu ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD] ; row 1, first vector
311 vmovdqu ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD] ; row 0, second vector
312 vmovdqu ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD] ; row 1, second vector
; First vector of each row: split every word into its two bytes and add them,
; giving horizontal pair sums per row.
; (Presumably the .downsample label sits here; it is not shown.)
315 vpand ymm4, ymm0, ymm6 ; ymm4 = low bytes (even samples), row 0
316 vpsrlw ymm0, ymm0, BYTE_BIT ; ymm0 = high bytes (odd samples), row 0
317 vpand ymm5, ymm1, ymm6 ; ymm5 = low bytes (even samples), row 1
318 vpsrlw ymm1, ymm1, BYTE_BIT ; ymm1 = high bytes (odd samples), row 1
319 vpaddw ymm0, ymm0, ymm4 ; row 0 pair sums
320 vpaddw ymm1, ymm1, ymm5 ; row 1 pair sums
; Same split-and-add for the second vector of each row.
322 vpand ymm4, ymm2, ymm6
323 vpsrlw ymm2, ymm2, BYTE_BIT
324 vpand ymm5, ymm3, ymm6
325 vpsrlw ymm3, ymm3, BYTE_BIT
326 vpaddw ymm2, ymm2, ymm4 ; row 0 pair sums
327 vpaddw ymm3, ymm3, ymm5 ; row 1 pair sums
329 vpaddw ymm0, ymm0, ymm1 ; 2x2 sums, first vector (row 0 + row 1)
330 vpaddw ymm2, ymm2, ymm3 ; 2x2 sums, second vector
331 vpaddw ymm0, ymm0, ymm7 ; add the alternating bias
332 vpaddw ymm2, ymm2, ymm7
; NOTE(review): no shift that divides the biased sums is visible before the
; pack below; presumably it is on the omitted source lines -- confirm.
;
; vpackuswb narrows words to bytes with unsigned saturation, but works within
; each 128-bit lane, so the quadwords come out ordered
; {ymm0 low, ymm2 low, ymm0 high, ymm2 high}.  vpermq with 0xd8 selects
; quadwords 0,2,1,3, restoring sequential order.
336 vpackuswb ymm0, ymm0, ymm2
337 vpermq ymm0, ymm0, 0xd8
339 vmovdqu YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0 ; store one vector of output
341 sub rcx, byte SIZEOF_YMMWORD ; outcol
342 add rdx, byte 2*SIZEOF_YMMWORD ; inptr0
343 add rsi, byte 2*SIZEOF_YMMWORD ; inptr1
344 add rdi, byte 1*SIZEOF_YMMWORD ; outptr
; The loop-back branch that consumes this comparison is not shown.
345 cmp rcx, byte SIZEOF_YMMWORD
; Nonzero remainder -> partial-vector handling.  The instruction that sets
; the flags for this jump (presumably a zero test of rcx) is not shown.
348 jnz near .columnloop_r24
; Advance two input row pointers per output row pointer.  rsi/rdi were used
; as sample pointers above, so they are presumably restored by omitted pops
; first.
354 add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
355 add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
365 ; For some reason, the OS X linker does not honor the request to align the
366 ; segment unless we do this.