2 ; jcsammmx.asm - downsampling (MMX)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
7 ; x86 SIMD extension for IJG JPEG library
8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ; This file should be assembled with NASM (Netwide Assembler); it
12 ; can *not* be assembled with Microsoft's MASM or any compatible
13 ; assembler (including Borland's Turbo Assembler).
14 ; NASM is available from http://nasm.sourceforge.net/ or
15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
19 %include "jsimdext.inc"
21 ; --------------------------------------------------------------------------
25 ; Downsample pixel values of a single component.
26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical downsampling.
30 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
32 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; Stack-argument accessors for jsimd_h2v1_downsample_mmx.  Each macro expands
; to the base register "b" plus a fixed displacement; the code below always
; passes ebp as the base.  Displacements start at +8 and step by 4 bytes per
; argument, in the order the arguments appear in the prototype comment.
; NOTE(review): the frame setup that makes ebp the frame base is not visible
; in this excerpt -- confirm against the full file.
35 %define img_width(b) (b)+8 ; JDIMENSION image_width
36 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
37 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
38 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
39 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
40 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
; Export the routine under its EXTN()-decorated name.
43 global EXTN(jsimd_h2v1_downsample_mmx)
; NOTE(review): the embedded original line numbers skip values throughout this
; routine, so some instructions and labels are absent from this view (no
; prologue, no return, and no definition of the .columnloop label are visible).
; The comments below describe only the instructions that are shown.
45 EXTN(jsimd_h2v1_downsample_mmx):
; ecx and edx are used as scratch below without being saved.
49 ; push ecx ; need not be preserved
50 ; push edx ; need not be preserved
; ecx = width_blocks << 3, i.e. width_blocks * 8.
54 mov ecx, JDIMENSION [width_blks(ebp)]
55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
; edx = image_width.
58 mov edx, JDIMENSION [img_width(ebp)]
60 ; -- expand_right_edge
; ecx is doubled here: two input columns per output column.
63 shl ecx,1 ; output_cols * 2
; eax = max_v_samp_factor.
67 mov eax, INT [max_v_samp(ebp)]
; esi walks the input_data array of row pointers; edi = the current row.
72 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
78 mov edi, JSAMPROW [esi]
; al = the sample byte located one byte before edi.
; NOTE(review): any adjustment of edi between the load above and this read is
; not visible in this excerpt, nor is the code that consumes al.
80 mov al, JSAMPLE [edi-1]
; Step esi to the next row-pointer slot.
87 add esi, byte SIZEOF_JSAMPROW
; eax = v_samp_factor, used as the row counter.
96 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
; edx = 0x00010000: as two 16-bit words this is {0, 1} (low word first).
100 mov edx, 0x00010000 ; bias pattern
; punpckldq with identical operands copies the low dword of mm7 into both
; dword halves.
; NOTE(review): the instruction that loads mm7 from edx is not visible here.
103 punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
; Logical right shift of every 16-bit word in mm6 by BYTE_BIT bits.
; NOTE(review): the stated result requires mm6 to be all-ones beforehand; the
; instruction establishing that is not visible here.
104 psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
; Reload the row-pointer arrays for the downsampling pass.
106 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
107 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
; Dereference to the current input and output rows.
; NOTE(review): esi/edi are overwritten here although they are advanced as
; array cursors further down; the save/restore around this is not visible.
114 mov esi, JSAMPROW [esi] ; inptr
115 mov edi, JSAMPROW [edi] ; outptr
; Load two consecutive MMWORDs of input samples.
119 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD]
120 movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
; Store one MMWORD of output.
; NOTE(review): the arithmetic that combines mm0/mm1 (and uses mm6/mm7) before
; this store is not visible in this excerpt.
138 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
; Each iteration consumes two input MMWORDs and produces one output MMWORD.
140 add esi, byte 2*SIZEOF_MMWORD ; inptr
141 add edi, byte 1*SIZEOF_MMWORD ; outptr
; Count down the remaining output columns; loop while the count is non-zero.
142 sub ecx, byte SIZEOF_MMWORD ; outcol
143 jnz short .columnloop
; Advance both row-pointer arrays by one row.
149 add esi, byte SIZEOF_JSAMPROW ; input_data
150 add edi, byte SIZEOF_JSAMPROW ; output_data
; Clear the MMX state so that later x87 floating-point code is not affected.
154 emms ; empty MMX state
159 ; pop edx ; need not be preserved
160 ; pop ecx ; need not be preserved
165 ; --------------------------------------------------------------------------
167 ; Downsample pixel values of a single component.
168 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical downsampling.
172 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
173 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
174 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
; Stack-argument accessors for jsimd_h2v2_downsample_mmx.  Each macro expands
; to the base register "b" plus a fixed displacement; the code below always
; passes ebp as the base.  Displacements start at +8 and step by 4 bytes per
; argument, in the order the arguments appear in the prototype comment.
; NOTE(review): the frame setup that makes ebp the frame base is not visible
; in this excerpt -- confirm against the full file.
177 %define img_width(b) (b)+8 ; JDIMENSION image_width
178 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
179 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
180 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
181 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
182 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
; Export the routine under its EXTN()-decorated name.
185 global EXTN(jsimd_h2v2_downsample_mmx)
; NOTE(review): the embedded original line numbers skip values throughout this
; routine, so some instructions and labels are absent from this view (no
; prologue, no return, no loop labels, and no definition of .expand_end are
; visible).  The comments below describe only the instructions that are shown.
187 EXTN(jsimd_h2v2_downsample_mmx):
; ecx and edx are used as scratch below without being saved.
191 ; push ecx ; need not be preserved
192 ; push edx ; need not be preserved
; ecx = width_blocks << 3, i.e. width_blocks * 8.
196 mov ecx, JDIMENSION [width_blks(ebp)]
197 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
; edx = image_width.
200 mov edx, JDIMENSION [img_width(ebp)]
202 ; -- expand_right_edge
; ecx is doubled here: two input columns per output column.
205 shl ecx,1 ; output_cols * 2
; Skip the edge expansion when the tested quantity is <= 0 (signed).
; NOTE(review): the flag-setting instruction this branch depends on is not
; visible in this excerpt.
207 jle short .expand_end
; eax = max_v_samp_factor.
209 mov eax, INT [max_v_samp(ebp)]
; NOTE(review): mov does not modify flags; the test that feeds this branch is
; not visible in this excerpt.
211 jle short .expand_end
; esi walks the input_data array of row pointers; edi = the current row.
214 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
220 mov edi, JSAMPROW [esi]
; al = the sample byte located one byte before edi.
; NOTE(review): any adjustment of edi between the load above and this read is
; not visible in this excerpt, nor is the code that consumes al.
222 mov al, JSAMPLE [edi-1]
; Step esi to the next row-pointer slot.
229 add esi, byte SIZEOF_JSAMPROW
; Restore ecx from the stack.
; NOTE(review): the matching push is not visible in this excerpt.
234 pop ecx ; output_cols
; eax = v_samp_factor, used as the row counter.
238 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
; edx = 0x00020001: as two 16-bit words this is {1, 2} (low word first).
242 mov edx, 0x00020001 ; bias pattern
; punpckldq with identical operands copies the low dword of mm7 into both
; dword halves.
; NOTE(review): the instruction that loads mm7 from edx is not visible here.
245 punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
; Logical right shift of every 16-bit word in mm6 by BYTE_BIT bits.
; NOTE(review): the stated result requires mm6 to be all-ones beforehand; the
; instruction establishing that is not visible here.
246 psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
; Reload the row-pointer arrays for the downsampling pass.
248 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
249 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
; Two consecutive input rows feed one output row: edx = first row,
; esi = second row, edi = output row.
; NOTE(review): esi/edi are overwritten here although they are advanced as
; array cursors further down; the save/restore around this is not visible.
256 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
257 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
258 mov edi, JSAMPROW [edi] ; outptr
; Load two consecutive MMWORDs from each of the two input rows
; (mm0/mm2 from inptr0, mm1/mm3 from inptr1).
262 movq mm0, MMWORD [edx+0*SIZEOF_MMWORD]
263 movq mm1, MMWORD [esi+0*SIZEOF_MMWORD]
264 movq mm2, MMWORD [edx+1*SIZEOF_MMWORD]
265 movq mm3, MMWORD [esi+1*SIZEOF_MMWORD]
; Store one MMWORD of output.
; NOTE(review): the arithmetic that combines mm0..mm3 (and uses mm6/mm7)
; before this store is not visible in this excerpt.
294 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0
; Each iteration consumes two MMWORDs from each input row and produces one
; output MMWORD.
296 add edx, byte 2*SIZEOF_MMWORD ; inptr0
297 add esi, byte 2*SIZEOF_MMWORD ; inptr1
298 add edi, byte 1*SIZEOF_MMWORD ; outptr
; Count down the remaining output columns.
; NOTE(review): the loop branch that tests this result is not visible here.
299 sub ecx, byte SIZEOF_MMWORD ; outcol
; Advance the input array by two rows and the output array by one row.
306 add esi, byte 2*SIZEOF_JSAMPROW ; input_data
307 add edi, byte 1*SIZEOF_JSAMPROW ; output_data
; Clear the MMX state so that later x87 floating-point code is not affected.
311 emms ; empty MMX state
316 ; pop edx ; need not be preserved
317 ; pop ecx ; need not be preserved
322 ; For some reason, the OS X linker does not honor the request to align the
323 ; segment unless we do this.