2 ; jcsample.asm - downsampling (SSE2)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ; This file should be assembled with NASM (Netwide Assembler); it
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jsimdext.inc"
21 ; --------------------------------------------------------------------------
25 ; Downsample pixel values of a single component.
26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
32 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; Stack-argument accessors used by jsimd_h2v1_downsample_sse2.  Each macro
; expands to base+offset for use inside a memory operand; the code below
; always passes ebp as the base register.
35 %define img_width(b) (b)+8 ; JDIMENSION image_width
36 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
37 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
38 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
39 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
40 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
; Export the entry point.  EXTN() is not defined in the visible text of this
; file (presumably it comes from jsimdext.inc -- confirm there).
43 global EXTN(jsimd_h2v1_downsample_sse2)
; --------------------------------------------------------------------------
; jsimd_h2v1_downsample_sse2
;
; Arguments (all read from the stack relative to ebp):
;   [ebp+8]   image_width
;   [ebp+12]  max_v_samp_factor
;   [ebp+16]  v_samp_factor
;   [ebp+20]  width_blocks
;   [ebp+24]  input_data   (array of row pointers)
;   [ebp+28]  output_data  (array of row pointers)
;
; Register roles in the visible code:
;   ecx  - output_cols, then output_cols*2, then the remaining-column counter
;   edx  - image_width, later the bias pattern constant
;   eax  - max_v_samp_factor, later the row counter (al also holds a sample)
;   esi  - input_data row-pointer array, then the current input row (inptr)
;   edi  - input row pointer during edge expansion, then outptr
;   xmm7 - bias words, xmm6 - byte mask (per the existing comments)
;
; NOTE(review): this listing is not contiguous.  The frame setup that makes
; ebp valid, the epilogue, and the labels .columnloop, .downsample and
; .columnloop_r8 that the branches below refer to are not visible here, and
; neither are some flag-setting and SIMD instructions.  The comments below
; describe only the instructions that are shown.
; --------------------------------------------------------------------------
45 EXTN(jsimd_h2v1_downsample_sse2):
; ecx and edx are used as scratch without being saved; the matching pops at
; the bottom of the routine are disabled in the same way.
49 ; push ecx ; need not be preserved
50 ; push edx ; need not be preserved
; ecx = width_blocks * 8 = number of output columns.
54 mov ecx, JDIMENSION [width_blks(ebp)]
55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
; edx = image_width.
58 mov edx, JDIMENSION [img_width(ebp)]
60 ; -- expand_right_edge
; ecx = output_cols * 2.  The column loop further down consumes two XMMWORDs
; of input for every XMMWORD of output, matching the 2:1 horizontal ratio.
63 shl ecx,1 ; output_cols * 2
; eax = max_v_samp_factor.
67 mov eax, INT [max_v_samp(ebp)]
; esi = input_data (array of row pointers).
72 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
; edi = the row pointer stored in the current input_data entry.
78 mov edi, JSAMPROW [esi]
; al = the sample located one byte before edi.
; NOTE(review): presumably this is the last real sample of the row, fetched
; so that it can be replicated to pad the right edge; the instructions that
; position edi and store the padding are not visible here -- confirm.
80 mov al, JSAMPLE [edi-1]
; Step esi to the next row-pointer entry.
87 add esi, byte SIZEOF_JSAMPROW
; eax = v_samp_factor, used as the row counter.
96 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
; edx = 0x00010000: read as two 16-bit words this is {0, 1} (low word first).
100 mov edx, 0x00010000 ; bias pattern
; pshufd with selector 0x00 copies dword 0 of xmm7 into all four dwords.
; NOTE(review): the instruction that first places the bias pattern in xmm7
; is not visible here.
103 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
; Logical right shift of every 16-bit word of xmm6 by BYTE_BIT bits.
; NOTE(review): the stated result (low byte of each word set) requires xmm6
; to hold all ones beforehand; that initialisation is not visible here.
104 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload the row-pointer arrays for the downsampling pass.
106 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
107 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
; Dereference the current entries: esi/edi now address sample data.
114 mov esi, JSAMPROW [esi] ; inptr
115 mov edi, JSAMPROW [edi] ; outptr
; If at least one full XMMWORD of columns remains (unsigned compare), take
; the full-width path at .columnloop.
117 cmp ecx, byte SIZEOF_XMMWORD
118 jae short .columnloop
; Short path: load a single XMMWORD of input, set the counter to exactly one
; XMMWORD and join the common code at .downsample.  movdqa is an aligned
; access, so esi must be XMMWORD-aligned.
122 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
124 mov ecx, SIZEOF_XMMWORD
125 jmp short .downsample
; Full-width path: load two consecutive XMMWORDs of input (aligned loads).
129 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
130 movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
; Store one XMMWORD of output (aligned store; edi must be XMMWORD-aligned).
; NOTE(review): the arithmetic that turns the loaded input into the value
; stored from xmm0 is not visible here.
150 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
; Account for one XMMWORD of columns, then advance the input pointer by two
; XMMWORDs and the output pointer by one.
152 sub ecx, byte SIZEOF_XMMWORD ; outcol
153 add esi, byte 2*SIZEOF_XMMWORD ; inptr
154 add edi, byte 1*SIZEOF_XMMWORD ; outptr
; Repeat the full-width path while at least one XMMWORD of columns remains.
155 cmp ecx, byte SIZEOF_XMMWORD
156 jae short .columnloop
; Branch to .columnloop_r8 when ZF is clear.
; NOTE(review): the instruction that sets the flags for this branch is not
; visible here -- confirm what is tested (presumably whether any columns
; are left over).
158 jnz short .columnloop_r8
; Advance both row-pointer arrays by one entry for the next row.
; NOTE(review): esi/edi were overwritten with sample pointers above; the
; instructions that restore them to the row-pointer arrays are not visible.
164 add esi, byte SIZEOF_JSAMPROW ; input_data
165 add edi, byte SIZEOF_JSAMPROW ; output_data
; Disabled restores matching the disabled pushes at the top of the routine.
172 ; pop edx ; need not be preserved
173 ; pop ecx ; need not be preserved
178 ; --------------------------------------------------------------------------
180 ; Downsample pixel values of a single component.
181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
186 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
187 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; Stack-argument accessors used by jsimd_h2v2_downsample_sse2.  Each macro
; expands to base+offset for use inside a memory operand; the code below
; always passes ebp as the base register.
190 %define img_width(b) (b)+8 ; JDIMENSION image_width
191 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
192 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
193 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
194 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
195 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
; Export the entry point.  EXTN() is not defined in the visible text of this
; file (presumably it comes from jsimdext.inc -- confirm there).
198 global EXTN(jsimd_h2v2_downsample_sse2)
; --------------------------------------------------------------------------
; jsimd_h2v2_downsample_sse2
;
; Arguments (all read from the stack relative to ebp):
;   [ebp+8]   image_width
;   [ebp+12]  max_v_samp_factor
;   [ebp+16]  v_samp_factor
;   [ebp+20]  width_blocks
;   [ebp+24]  input_data   (array of row pointers)
;   [ebp+28]  output_data  (array of row pointers)
;
; Register roles in the visible code:
;   ecx  - output_cols, then output_cols*2, then the remaining-column counter
;   edx  - image_width, then the bias pattern constant, then inptr0
;   eax  - max_v_samp_factor, later the row counter (al also holds a sample)
;   esi  - input_data row-pointer array, then inptr1
;   edi  - input row pointer during edge expansion, then outptr
;   xmm7 - bias words, xmm6 - byte mask (per the existing comments)
;
; NOTE(review): this listing is not contiguous.  The frame setup that makes
; ebp valid, the epilogue, and the labels .expand_end, .columnloop,
; .downsample and .columnloop_r8 that the branches below refer to are not
; visible here, and neither are some flag-setting and SIMD instructions.
; The comments below describe only the instructions that are shown.
; --------------------------------------------------------------------------
200 EXTN(jsimd_h2v2_downsample_sse2):
; ecx and edx are used as scratch without being saved; the matching pops at
; the bottom of the routine are disabled in the same way.
204 ; push ecx ; need not be preserved
205 ; push edx ; need not be preserved
; ecx = width_blocks * 8 = number of output columns.
209 mov ecx, JDIMENSION [width_blks(ebp)]
210 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
; edx = image_width.
213 mov edx, JDIMENSION [img_width(ebp)]
215 ; -- expand_right_edge
; ecx = output_cols * 2.  The column loop further down consumes two XMMWORDs
; per input row for every XMMWORD of output (2:1 horizontal ratio).
218 shl ecx,1 ; output_cols * 2
; Skip the edge expansion when the tested value is <= 0 (signed).
; NOTE(review): the flag-setting instruction for this branch is not visible
; here -- confirm what quantity is being tested.
220 jle short .expand_end
; eax = max_v_samp_factor.
222 mov eax, INT [max_v_samp(ebp)]
; Skip the edge expansion when the tested value is <= 0 (signed).
; NOTE(review): mov does not modify flags, so the test feeding this branch
; must be an instruction that is not visible here -- confirm.
224 jle short .expand_end
; esi = input_data (array of row pointers).
227 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
; edi = the row pointer stored in the current input_data entry.
233 mov edi, JSAMPROW [esi]
; al = the sample located one byte before edi.
; NOTE(review): presumably this is the last real sample of the row, fetched
; so that it can be replicated to pad the right edge; the instructions that
; position edi and store the padding are not visible here -- confirm.
235 mov al, JSAMPLE [edi-1]
; Step esi to the next row-pointer entry.
242 add esi, byte SIZEOF_JSAMPROW
; Restore ecx from the stack; the matching push is not visible here.
247 pop ecx ; output_cols
; eax = v_samp_factor, used as the row counter.
251 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
; edx = 0x00020001: read as two 16-bit words this is {1, 2} (low word first).
255 mov edx, 0x00020001 ; bias pattern
; pshufd with selector 0x00 copies dword 0 of xmm7 into all four dwords.
; NOTE(review): the instruction that first places the bias pattern in xmm7
; is not visible here.
258 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
; Logical right shift of every 16-bit word of xmm6 by BYTE_BIT bits.
; NOTE(review): the stated result (low byte of each word set) requires xmm6
; to hold all ones beforehand; that initialisation is not visible here.
259 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
; Reload the row-pointer arrays for the downsampling pass.
261 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
262 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
; Fetch two consecutive input rows and one output row.  The second load
; overwrites esi, so the row-pointer array address is no longer in esi.
269 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
270 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
271 mov edi, JSAMPROW [edi] ; outptr
; If at least one full XMMWORD of columns remains (unsigned compare), take
; the full-width path at .columnloop.
273 cmp ecx, byte SIZEOF_XMMWORD
274 jae short .columnloop
; Short path: load a single XMMWORD from each of the two rows, set the
; counter to exactly one XMMWORD and join the common code at .downsample.
; movdqa is an aligned access, so edx and esi must be XMMWORD-aligned.
278 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
279 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
282 mov ecx, SIZEOF_XMMWORD
283 jmp short .downsample
; Full-width path: load two consecutive XMMWORDs from each row
; (xmm0/xmm2 from inptr0, xmm1/xmm3 from inptr1; aligned loads).
287 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
288 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
289 movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
290 movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
; Store one XMMWORD of output (aligned store; edi must be XMMWORD-aligned).
; NOTE(review): the arithmetic that turns the loaded input into the value
; stored from xmm0 is not visible here.
320 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
; Account for one XMMWORD of columns, then advance both input pointers by
; two XMMWORDs and the output pointer by one.
322 sub ecx, byte SIZEOF_XMMWORD ; outcol
323 add edx, byte 2*SIZEOF_XMMWORD ; inptr0
324 add esi, byte 2*SIZEOF_XMMWORD ; inptr1
325 add edi, byte 1*SIZEOF_XMMWORD ; outptr
; Compare the remaining column count against one XMMWORD.
326 cmp ecx, byte SIZEOF_XMMWORD
; Branch to .columnloop_r8 when ZF is clear.
; NOTE(review): the instructions between the compare above and this branch
; are not visible here -- confirm which flags this branch actually consumes
; (presumably a test of whether any columns are left over).
329 jnz near .columnloop_r8
; Advance to the next row group: input_data moves by two entries for every
; one entry of output_data (two input rows per output row).
; NOTE(review): esi/edi were overwritten with sample pointers above; the
; instructions that restore them to the row-pointer arrays are not visible.
335 add esi, byte 2*SIZEOF_JSAMPROW ; input_data
336 add edi, byte 1*SIZEOF_JSAMPROW ; output_data
; Disabled restores matching the disabled pushes at the top of the routine.
343 ; pop edx ; need not be preserved
344 ; pop ecx ; need not be preserved
349 ; For some reason, the OS X linker does not honor the request to align the
350 ; segment unless we do this.