2 ; jcsample.asm - downsampling (64-bit SSE2)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright (C) 2009, 2016, D. R. Commander.
6 ; Copyright (C) 2018, Matthias Räncker.
8 ; Based on the x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
12 ; This file should be assembled with NASM (Netwide Assembler); it
13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
18 %include "jsimdext.inc"
20 ; --------------------------------------------------------------------------
24 ; Downsample pixel values of a single component.
25 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
29 ; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
30 ; JDIMENSION v_samp_factor,
31 ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
32 ; JSAMPARRAY output_data);
35 ; r10d = JDIMENSION image_width
36 ; r11 = int max_v_samp_factor
37 ; r12d = JDIMENSION v_samp_factor
38 ; r13d = JDIMENSION width_in_blocks
39 ; r14 = JSAMPARRAY input_data
40 ; r15 = JSAMPARRAY output_data
; Exported entry point for the 2:1 horizontal, 1:1 vertical downsampler.
; NOTE(review): this listing is not contiguous -- the instructions and local
; label definitions that sit between the lines shown here are not visible, so
; the comments below describe only what each visible line does and mark
; anything that depends on an unseen line as an assumption.
43 GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)
45 EXTN(jsimd_h2v1_downsample_sse2):
; Scale rcx by 8 with a shift; the note on the line equates this with a
; multiply by DCTSIZE and names the result output_cols.
; Presumably rcx was loaded from width_in_blocks (r13d) just before -- confirm.
51 shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
56 ; -- expand_right_edge
; Double rcx. Presumably this is the number of input columns the 2:1
; downsampler will consume -- confirm against the unseen compare that follows.
59 shl rcx, 1 ; output_cols * 2
; rsi = base of the input row-pointer array.
68 mov rsi, r14 ; input_data
; rdi = pointer to the current input row (rdip/rsip are not defined in this
; excerpt; presumably pointer-width register aliases from jsimdext.inc).
73 mov rdip, JSAMPROW [rsi]
; al = the sample stored one byte before rdi. Presumably rdi was first moved
; to the end of the valid row data so this is the last real sample, which is
; then replicated into the padding -- the fill instruction is not visible.
75 mov al, JSAMPLE [rdi-1]
; Step to the next entry of the input row-pointer array.
82 add rsi, byte SIZEOF_JSAMPROW
; eax = v_samp_factor (r12d), used as the row counter for the main loop.
91 mov eax, r12d ; rowctr
; Per-word rounding bias source value; the instruction that moves it into
; xmm7 is not visible here.
95 mov rdx, 0x00010000 ; bias pattern
; Broadcast the low dword of xmm7 into all four dword lanes.
98 pshufd xmm7, xmm7, 0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
; Logical right shift of every 16-bit word by BYTE_BIT bits. The stated result
; presumes xmm6 held all ones beforehand (set by an unseen instruction).
99 psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload both row-pointer array bases for the downsampling pass.
101 mov rsi, r14 ; input_data
102 mov rdi, r15 ; output_data
; Dereference the current row-pointer entries. From here on rsi/rdi address
; sample data rather than the pointer arrays.
108 mov rsip, JSAMPROW [rsi] ; inptr
109 mov rdip, JSAMPROW [rdi] ; outptr
; Unsigned test: at least one full XMMWORD of output columns left?
; If so take the full-width path, otherwise fall through to the short path.
111 cmp rcx, byte SIZEOF_XMMWORD
112 jae short .columnloop
; Short path: only one XMMWORD of input is loaded (aligned load), and rcx is
; forced to SIZEOF_XMMWORD so the shared tail subtracts it back to zero.
; Presumably this is the .columnloop_r8 target and xmm1 is cleared by an
; unseen instruction -- confirm.
115 movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
117 mov rcx, SIZEOF_XMMWORD
118 jmp short .downsample
; Full-width path: load two consecutive XMMWORDs of input. movdqa requires
; the addresses to be 16-byte aligned.
121 movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
122 movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
; Store one XMMWORD of results to the output row (aligned store). The
; arithmetic that produces xmm0 is not visible in this excerpt.
142 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
; One XMMWORD of output columns done: input advances by two XMMWORDs for
; every one XMMWORD of output, matching the 2:1 horizontal ratio.
144 sub rcx, byte SIZEOF_XMMWORD ; outcol
145 add rsi, byte 2*SIZEOF_XMMWORD ; inptr
146 add rdi, byte 1*SIZEOF_XMMWORD ; outptr
; Keep looping while a full XMMWORD of output columns remains.
147 cmp rcx, byte SIZEOF_XMMWORD
148 jae short .columnloop
; NOTE(review): the instruction that sets the flags for this jnz is not
; visible; presumably it tests rcx so a non-zero remainder takes the short
; path once more.
150 jnz short .columnloop_r8
; Advance one entry in each row-pointer array. This assumes rsi/rdi were
; restored to the array positions by unseen instructions, since the column
; loop used them as sample pointers -- confirm.
156 add rsi, byte SIZEOF_JSAMPROW ; input_data
157 add rdi, byte SIZEOF_JSAMPROW ; output_data
166 ; --------------------------------------------------------------------------
168 ; Downsample pixel values of a single component.
169 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
173 ; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
174 ; JDIMENSION v_samp_factor,
175 ; JDIMENSION width_in_blocks, JSAMPARRAY input_data,
176 ; JSAMPARRAY output_data);
179 ; r10d = JDIMENSION image_width
180 ; r11 = int max_v_samp_factor
181 ; r12d = JDIMENSION v_samp_factor
182 ; r13d = JDIMENSION width_in_blocks
183 ; r14 = JSAMPARRAY input_data
184 ; r15 = JSAMPARRAY output_data
; Exported entry point for the 2:1 horizontal, 2:1 vertical downsampler.
; NOTE(review): this listing is not contiguous -- the instructions and local
; label definitions that sit between the lines shown here are not visible, so
; the comments below describe only what each visible line does and mark
; anything that depends on an unseen line as an assumption.
187 GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)
189 EXTN(jsimd_h2v2_downsample_sse2):
; Scale rcx by 8 with a shift; the note on the line equates this with a
; multiply by DCTSIZE and names the result output_cols.
; Presumably rcx was loaded from width_in_blocks (r13d) just before -- confirm.
195 shl rcx, 3 ; imul rcx,DCTSIZE (rcx = output_cols)
200 ; -- expand_right_edge
; Double rcx. Presumably this is the number of input columns the 2:1
; downsampler will consume -- confirm against the unseen compare that follows.
203 shl rcx, 1 ; output_cols * 2
; Two separate early exits from the edge expansion, each taken on a signed
; less-or-equal result. The instructions that set the flags for them are not
; visible; presumably one checks the padding width and the other the row
; count -- confirm.
205 jle short .expand_end
209 jle short .expand_end
; rsi = base of the input row-pointer array.
212 mov rsi, r14 ; input_data
; rdi = pointer to the current input row (rdip/rsip/rdxp are not defined in
; this excerpt; presumably pointer-width register aliases from jsimdext.inc).
217 mov rdip, JSAMPROW [rsi]
; al = the sample stored one byte before rdi. Presumably rdi was first moved
; to the end of the valid row data so this is the last real sample, which is
; then replicated into the padding -- the fill instruction is not visible.
219 mov al, JSAMPLE [rdi-1]
; Step to the next entry of the input row-pointer array.
226 add rsi, byte SIZEOF_JSAMPROW
; Restore rcx from the stack (the matching push is not visible); per the note
; on the line this brings back the undoubled output_cols value.
231 pop rcx ; output_cols
; eax = v_samp_factor (r12d), used as the row counter for the main loop.
235 mov eax, r12d ; rowctr
; Per-word rounding bias source value; the instruction that moves it into
; xmm7 is not visible here.
239 mov rdx, 0x00020001 ; bias pattern
; Broadcast the low dword of xmm7 into all four dword lanes.
242 pshufd xmm7, xmm7, 0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
; Logical right shift of every 16-bit word by BYTE_BIT bits. The stated result
; presumes xmm6 held all ones beforehand (set by an unseen instruction).
243 psrlw xmm6, BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload both row-pointer array bases for the downsampling pass.
245 mov rsi, r14 ; input_data
246 mov rdi, r15 ; output_data
; Fetch the two input rows that feed one output row. Order matters: inptr0 is
; read through rsi before rsi itself is overwritten with inptr1.
252 mov rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
253 mov rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
254 mov rdip, JSAMPROW [rdi] ; outptr
; Unsigned test: at least one full XMMWORD of output columns left?
; If so take the full-width path, otherwise fall through to the short path.
256 cmp rcx, byte SIZEOF_XMMWORD
257 jae short .columnloop
; Short path: load only one XMMWORD from each input row (aligned loads), and
; force rcx to SIZEOF_XMMWORD so the shared tail subtracts it back to zero.
; Presumably this is the .columnloop_r8 target and xmm2/xmm3 are cleared by
; unseen instructions -- confirm.
260 movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
261 movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
264 mov rcx, SIZEOF_XMMWORD
265 jmp short .downsample
; Full-width path: load two consecutive XMMWORDs from each of the two input
; rows. movdqa requires the addresses to be 16-byte aligned.
268 movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
269 movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
270 movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
271 movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
; Store one XMMWORD of results to the output row (aligned store). The
; arithmetic that produces xmm0 is not visible in this excerpt.
301 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
; One XMMWORD of output columns done: both input rows advance by two XMMWORDs
; for every one XMMWORD of output, matching the 2:1 horizontal ratio.
303 sub rcx, byte SIZEOF_XMMWORD ; outcol
304 add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
305 add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
306 add rdi, byte 1*SIZEOF_XMMWORD ; outptr
; Compare the remaining count against a full XMMWORD; the branch that consumes
; this compare is not visible in this excerpt.
307 cmp rcx, byte SIZEOF_XMMWORD
; NOTE(review): the instruction that sets the flags for this jnz is not
; visible; presumably it tests rcx so a non-zero remainder takes the short
; path once more.
310 jnz near .columnloop_r8
; Advance two input row pointers for each output row pointer, matching the
; 2:1 vertical ratio. This assumes rsi/rdi were restored to the array
; positions by unseen instructions, since the column loop used them as sample
; pointers -- confirm.
316 add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
317 add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
326 ; For some reason, the OS X linker does not honor the request to align the
327 ; segment unless we do this.