2 ; jidctfst.asm - fast integer IDCT (MMX)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
6 ; Based on the x86 SIMD extension for IJG JPEG library
7 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
8 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
10 ; This file should be assembled with NASM (Netwide Assembler); it
11 ; can *not* be assembled with Microsoft's MASM or any compatible
12 ; assembler (including Borland's Turbo Assembler).
13 ; NASM is available from http://nasm.sourceforge.net/ or
14 ; http://sourceforge.net/project/showfiles.php?group_id=6208
16 ; This file contains a fast, not so accurate integer implementation of
17 ; the inverse DCT (Discrete Cosine Transform). The following code is
18 ; based directly on the IJG's original jidctfst.c; see jidctfst.c for details.
23 %include "jsimdext.inc"
26 ; --------------------------------------------------------------------------
; Fixed-point setup. CONST_BITS is the number of fractional bits carried by
; the FIX(x) multiplier constants defined below.
28 %define CONST_BITS 8 ; 14 is also OK.
; Assemble-time guard: raises an error when the two scale settings differ.
; NOTE(review): neither symbol is defined in the lines visible here, and no
; %endif closing this %if is visible either -- confirm against the full file.
31 %if IFAST_SCALE_BITS != PASS1_BITS
32 %error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
; Multipliers written as literals: each is the annotated real value times
; 2^8, rounded to the nearest integer (i.e. the CONST_BITS == 8 case).
36 F_1_082 equ 277 ; FIX(1.082392200)
37 F_1_414 equ 362 ; FIX(1.414213562)
38 F_1_847 equ 473 ; FIX(1.847759065)
39 F_2_613 equ 669 ; FIX(2.613125930)
40 F_1_613 equ (F_2_613 - 256) ; FIX(2.613125930) - FIX(1)
42 ; NASM cannot do compile-time arithmetic on floating-point constants.
; DESCALE(x,n): shift x right by n bits with round-to-nearest (half of 2^n
; is added before the shift).
43 %define DESCALE(x,n) (((x)+(1<<((n)-1)))>>(n))
; The same multipliers for an arbitrary CONST_BITS: the integer literals are
; the real values scaled by 2^30, reduced here to CONST_BITS fractional bits.
; With CONST_BITS == 8 they evaluate to the literal values listed above.
; NOTE(review): these lines define the same five symbols a second time;
; presumably the two sets are alternatives selected by a conditional on
; CONST_BITS that is not visible in this view -- confirm.
44 F_1_082 equ DESCALE(1162209775,30-CONST_BITS) ; FIX(1.082392200)
45 F_1_414 equ DESCALE(1518500249,30-CONST_BITS) ; FIX(1.414213562)
46 F_1_847 equ DESCALE(1984016188,30-CONST_BITS) ; FIX(1.847759065)
47 F_2_613 equ DESCALE(2805822602,30-CONST_BITS) ; FIX(2.613125930)
48 F_1_613 equ (F_2_613 - (1 << CONST_BITS)) ; FIX(2.613125930) - FIX(1)
51 ; --------------------------------------------------------------------------
54 ; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
55 ; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
; pmulhw keeps the upper 16 bits of each signed 16x16-bit product. The code
; shifts the data operand left by PRE_MULTIPLY_SCALE_BITS and the table
; entries below are shifted left by CONST_SHIFT; together with the
; CONST_BITS fractional bits that totals 16, so the retained word is the
; operand multiplied by the real-valued constant.
57 %define PRE_MULTIPLY_SCALE_BITS 2
58 %define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
; Constant table used by jsimd_idct_ifast_mmx through GOTOFF(ebx, ...).
; Every PW_* entry repeats one word four times (one copy per 16-bit lane of
; an MMX register); PB_CENTERJSAMP repeats one byte eight times.
; NOTE(review): no section or alignment directive for this data is visible
; in this view -- confirm against the full file.
61 global EXTN(jconst_idct_ifast_mmx)
63 EXTN(jconst_idct_ifast_mmx):
65 PW_F1414 times 4 dw F_1_414 << CONST_SHIFT
66 PW_F1847 times 4 dw F_1_847 << CONST_SHIFT
; Stored as minus 1.613 rather than 2.613: with CONST_BITS == 8 the value
; F_2_613 << CONST_SHIFT would be 669*64 = 42816, outside the signed 16-bit
; range that pmulhw operates on. The remaining "- 1 * z10" term is handled
; by the code (see the "To avoid overflow" notes in the function body).
67 PW_MF1613 times 4 dw -F_1_613 << CONST_SHIFT
68 PW_F1082 times 4 dw F_1_082 << CONST_SHIFT
; CENTERJSAMPLE is defined outside this view.
69 PB_CENTERJSAMP times 8 db CENTERJSAMPLE
73 ; --------------------------------------------------------------------------
77 ; Perform dequantization and inverse DCT on one block of coefficients.
80 ; jsimd_idct_ifast_mmx (void *dct_table, JCOEFPTR coef_block,
81 ; JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack relative to a base register b; the
; macros below place them at b+8, b+12, b+16 and b+20 (4 bytes apart).
; NOTE(review): an offset of 8 for the first argument presumably means a
; return address and one saved register sit below it; the instruction that
; would push that register is not visible in this view -- confirm.
84 %define dct_table(b) (b)+8 ; void *dct_table
85 %define coef_block(b) (b)+12 ; JCOEFPTR coef_block
86 %define output_buf(b) (b)+16 ; JSAMPARRAY output_buf
87 %define output_col(b) (b)+20 ; JDIMENSION output_col
; Local frame layout, addressed from the aligned ebp:
;   [ebp+0]  slot named original_ebp (read back in pass 2 and by the
;            epilogue's pop esp)
;   wk(i)    MMX-sized scratch words located directly below ebp
;   workspace  DCTSIZE2 JCOEF entries located directly below wk(0)
; WK_NUM is not defined in the lines visible here; the code uses wk(0) and
; wk(1).
89 %define original_ebp ebp+0
90 %define wk(i) ebp-(WK_NUM-(i))*SIZEOF_MMWORD ; mmword wk[WK_NUM]
92 %define workspace wk(0)-DCTSIZE2*SIZEOF_JCOEF
93 ; JCOEF workspace[DCTSIZE2]
96 global EXTN(jsimd_idct_ifast_mmx)
98 EXTN(jsimd_idct_ifast_mmx):
; Prologue: remember the incoming stack pointer in eax (it is the base used
; for the argument macros), round esp down to an MMWORD boundary and use the
; result as the frame pointer.
; NOTE(review): the visible lines neither store eax into the original_ebp
; slot, nor move esp below the workspace, nor save any callee-saved
; register; those steps are presumably in lines missing from this view.
100 mov eax,esp ; eax = original ebp
102 and esp, byte (-SIZEOF_MMWORD) ; align to 64 bits
104 mov ebp,esp ; ebp = aligned ebp
107 ; push ecx ; need not be preserved
108 ; push edx ; need not be preserved
; ebx becomes the base for every GOTOFF(ebx, ...) constant access below.
112 get_GOT ebx ; get GOT address
114 ; ---- Pass 1: process columns from input, store into work array.
; Register roles for this pass (set up just below):
;   edx = quantptr  (dequantization multipliers, from the dct_table argument)
;   esi = inptr     (coefficients, from the coef_block argument)
;   edi = wsptr     (workspace located below wk(0))
;   ecx = ctr       (DCTSIZE/4)
;   ebx = GOT base used by the GOTOFF() constant loads
; Each MMX register holds four 16-bit values, so the code below handles four
; columns at a time; the pointer increments at the end of the pass step by
; four columns of input and four rows of workspace.
; NOTE(review): no loop label, counter decrement or branch using ecx is
; visible in this view, and several register-to-register copies that the
; inline comments rely on (for example whatever seeds mm4/mm5 before the
; first paddw pair, or mm1/mm3 in the all-zero path) are not visible either
; -- confirm against the full file before relying on this listing.
; The reload below is commented out: eax still holds the value captured at
; function entry, which is the base the argument macros are applied to.
116 ; mov eax, [original_ebp]
117 mov edx, POINTER [dct_table(eax)] ; quantptr
118 mov esi, JCOEFPTR [coef_block(eax)] ; inptr
119 lea edi, [workspace] ; JCOEF *wsptr
120 mov ecx, DCTSIZE/4 ; ctr
; Zero-AC test (can be compiled out with NO_ZERO_COLUMN_TEST_IFAST_MMX):
; first a cheap check on one dword from each of rows 1 and 2, then an OR
; over rows 1..7 of the four current columns (odd rows accumulate in mm0,
; even rows in mm1).
; NOTE(review): the branches that consume these results and the closing
; %endif are not visible in this view. DWBLOCK/MMBLOCK are defined outside
; this view; their first argument appears to select the row.
123 %ifndef NO_ZERO_COLUMN_TEST_IFAST_MMX
124 mov eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
125 or eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
128 movq mm0, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
129 movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
130 por mm0, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
131 por mm1, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
132 por mm0, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
133 por mm1, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
134 por mm0, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
141 ; -- AC terms all zero
; Shortcut path: dequantize row 0 only (coefficient * multiplier, low word
; of the product), then replicate each of its four values across a whole
; MMX register and write that register to both halves of one workspace row.
143 movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
144 pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
146 movq mm2,mm0 ; mm0=in0=(00 01 02 03)
147 punpcklwd mm0,mm0 ; mm0=(00 00 01 01)
148 punpckhwd mm2,mm2 ; mm2=(02 02 03 03)
151 punpckldq mm0,mm0 ; mm0=(00 00 00 00)
152 punpckhdq mm1,mm1 ; mm1=(01 01 01 01)
154 punpckldq mm2,mm2 ; mm2=(02 02 02 02)
155 punpckhdq mm3,mm3 ; mm3=(03 03 03 03)
157 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
158 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm0
159 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm1
160 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm1
161 movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm2
162 movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
163 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm3
164 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm3
; General path, even-numbered rows: load rows 0, 2, 4, 6 and dequantize each
; by the matching row of the multiplier table.
172 movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
173 movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
174 pmullw mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
175 pmullw mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
176 movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
177 movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
178 pmullw mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
179 pmullw mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]
183 psubw mm0,mm2 ; mm0=tmp11
185 paddw mm4,mm2 ; mm4=tmp10
186 paddw mm5,mm3 ; mm5=tmp13
; Multiply by 1.414: pre-shift the operand, then take the high word of the
; product with the pre-shifted constant.
188 psllw mm1,PRE_MULTIPLY_SCALE_BITS
189 pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
190 psubw mm1,mm5 ; mm1=tmp12
194 psubw mm4,mm5 ; mm4=tmp3
195 psubw mm0,mm1 ; mm0=tmp2
196 paddw mm6,mm5 ; mm6=tmp0
197 paddw mm7,mm1 ; mm7=tmp1
; Spill tmp3/tmp2 to the scratch words so their registers can be reused for
; the odd-numbered rows.
199 movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
200 movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
; Odd-numbered rows: load rows 1, 3, 5, 7 and dequantize them.
204 movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
205 movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
206 pmullw mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
207 pmullw mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
208 movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
209 movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
210 pmullw mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
211 pmullw mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]
215 psubw mm2,mm1 ; mm2=z12
216 psubw mm5,mm3 ; mm5=z10
217 paddw mm4,mm1 ; mm4=z11
218 paddw mm0,mm3 ; mm0=z13
; Keep an unscaled copy of z10: it supplies the "- z10" term of tmp12
; described in the overflow note below.
220 movq mm1,mm5 ; mm1=z10(unscaled)
221 psllw mm2,PRE_MULTIPLY_SCALE_BITS
222 psllw mm5,PRE_MULTIPLY_SCALE_BITS
226 paddw mm3,mm0 ; mm3=tmp7
228 psllw mm4,PRE_MULTIPLY_SCALE_BITS
229 pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
231 ; To avoid overflow...
234 ; tmp12 = -2.613125930 * z10 + z5;
236 ; (This implementation)
237 ; tmp12 = (-1.613125930 - 1) * z10 + z5;
238 ; = -1.613125930 * z10 - z10 + z5;
242 pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
243 pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
244 pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
246 psubw mm2,mm5 ; mm2=tmp10
247 paddw mm0,mm5 ; mm0=tmp12
249 ; -- Final output stage
251 psubw mm0,mm3 ; mm0=tmp6
254 paddw mm6,mm3 ; mm6=data0=(00 01 02 03)
255 paddw mm7,mm0 ; mm7=data1=(10 11 12 13)
256 psubw mm1,mm3 ; mm1=data7=(70 71 72 73)
257 psubw mm5,mm0 ; mm5=data6=(60 61 62 63)
258 psubw mm4,mm0 ; mm4=tmp5
; The results are transposed with word/dword interleaves before being
; stored, so each four-column input slice lands as four workspace rows.
260 movq mm3,mm6 ; transpose coefficients(phase 1)
261 punpcklwd mm6,mm7 ; mm6=(00 10 01 11)
262 punpckhwd mm3,mm7 ; mm3=(02 12 03 13)
263 movq mm0,mm5 ; transpose coefficients(phase 1)
264 punpcklwd mm5,mm1 ; mm5=(60 70 61 71)
265 punpckhwd mm0,mm1 ; mm0=(62 72 63 73)
; Swap through the scratch words: fetch the spilled tmp2/tmp3 and park the
; partially transposed rows 6/7 in their place.
267 movq mm7, MMWORD [wk(0)] ; mm7=tmp2
268 movq mm1, MMWORD [wk(1)] ; mm1=tmp3
270 movq MMWORD [wk(0)], mm5 ; wk(0)=(60 70 61 71)
271 movq MMWORD [wk(1)], mm0 ; wk(1)=(62 72 63 73)
273 paddw mm2,mm4 ; mm2=tmp4
276 paddw mm7,mm4 ; mm7=data2=(20 21 22 23)
277 paddw mm1,mm2 ; mm1=data4=(40 41 42 43)
278 psubw mm5,mm4 ; mm5=data5=(50 51 52 53)
279 psubw mm0,mm2 ; mm0=data3=(30 31 32 33)
281 movq mm4,mm7 ; transpose coefficients(phase 1)
282 punpcklwd mm7,mm0 ; mm7=(20 30 21 31)
283 punpckhwd mm4,mm0 ; mm4=(22 32 23 33)
284 movq mm2,mm1 ; transpose coefficients(phase 1)
285 punpcklwd mm1,mm5 ; mm1=(40 50 41 51)
286 punpckhwd mm2,mm5 ; mm2=(42 52 43 53)
288 movq mm0,mm6 ; transpose coefficients(phase 2)
289 punpckldq mm6,mm7 ; mm6=(00 10 20 30)
290 punpckhdq mm0,mm7 ; mm0=(01 11 21 31)
291 movq mm5,mm3 ; transpose coefficients(phase 2)
292 punpckldq mm3,mm4 ; mm3=(02 12 22 32)
293 punpckhdq mm5,mm4 ; mm5=(03 13 23 33)
295 movq mm7, MMWORD [wk(0)] ; mm7=(60 70 61 71)
296 movq mm4, MMWORD [wk(1)] ; mm4=(62 72 63 73)
; First half (n=0) of workspace rows 0..3.
298 movq MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm6
299 movq MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm0
300 movq MMWORD [MMBLOCK(2,0,edi,SIZEOF_JCOEF)], mm3
301 movq MMWORD [MMBLOCK(3,0,edi,SIZEOF_JCOEF)], mm5
303 movq mm6,mm1 ; transpose coefficients(phase 2)
304 punpckldq mm1,mm7 ; mm1=(40 50 60 70)
305 punpckhdq mm6,mm7 ; mm6=(41 51 61 71)
306 movq mm0,mm2 ; transpose coefficients(phase 2)
307 punpckldq mm2,mm4 ; mm2=(42 52 62 72)
308 punpckhdq mm0,mm4 ; mm0=(43 53 63 73)
; Second half (n=1) of workspace rows 0..3.
310 movq MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
311 movq MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm6
312 movq MMWORD [MMBLOCK(2,1,edi,SIZEOF_JCOEF)], mm2
313 movq MMWORD [MMBLOCK(3,1,edi,SIZEOF_JCOEF)], mm0
; Advance to the next four input columns / next four workspace rows.
316 add esi, byte 4*SIZEOF_JCOEF ; coef_block
317 add edx, byte 4*SIZEOF_IFAST_MULT_TYPE ; quantptr
318 add edi, byte 4*DCTSIZE*SIZEOF_JCOEF ; wsptr
322 ; ---- Pass 2: process rows from work array, store into output array.
; Register roles for this pass (set up just below):
;   esi = wsptr       (workspace written by pass 1)
;   edi = output_buf  (array of row pointers)
;   eax = output_col  (offset applied to every row pointer when storing)
;   ecx = ctr         (DCTSIZE/4)
; The arithmetic mirrors pass 1 without the dequantization multiplies; the
; results are additionally shifted right by PASS1_BITS+3, packed to bytes
; and transposed before four 8-byte output rows are written.
; NOTE(review): as in pass 1, no loop label or branch using ecx is visible,
; and some register-to-register copies presupposed by the inline comments
; are not visible in this view -- confirm against the full file.
; The argument base is re-read from the original_ebp slot because eax was
; reused as scratch during pass 1.
324 mov eax, [original_ebp]
325 lea esi, [workspace] ; JCOEF *wsptr
326 mov edi, JSAMPARRAY [output_buf(eax)] ; (JSAMPROW *)
327 mov eax, JDIMENSION [output_col(eax)]
328 mov ecx, DCTSIZE/4 ; ctr
; Even-numbered rows of the workspace (0, 2, 4, 6).
334 movq mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
335 movq mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
336 movq mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
337 movq mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
341 psubw mm0,mm2 ; mm0=tmp11
343 paddw mm4,mm2 ; mm4=tmp10
344 paddw mm5,mm3 ; mm5=tmp13
346 psllw mm1,PRE_MULTIPLY_SCALE_BITS
347 pmulhw mm1,[GOTOFF(ebx,PW_F1414)]
348 psubw mm1,mm5 ; mm1=tmp12
352 psubw mm4,mm5 ; mm4=tmp3
353 psubw mm0,mm1 ; mm0=tmp2
354 paddw mm6,mm5 ; mm6=tmp0
355 paddw mm7,mm1 ; mm7=tmp1
; Spill tmp3/tmp2 so their registers can be reused for the odd rows.
357 movq MMWORD [wk(1)], mm4 ; wk(1)=tmp3
358 movq MMWORD [wk(0)], mm0 ; wk(0)=tmp2
; Odd-numbered rows of the workspace (1, 3, 5, 7).
362 movq mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
363 movq mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
364 movq mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
365 movq mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
369 psubw mm2,mm1 ; mm2=z12
370 psubw mm5,mm3 ; mm5=z10
371 paddw mm4,mm1 ; mm4=z11
372 paddw mm0,mm3 ; mm0=z13
374 movq mm1,mm5 ; mm1=z10(unscaled)
375 psllw mm2,PRE_MULTIPLY_SCALE_BITS
376 psllw mm5,PRE_MULTIPLY_SCALE_BITS
380 paddw mm3,mm0 ; mm3=tmp7
382 psllw mm4,PRE_MULTIPLY_SCALE_BITS
383 pmulhw mm4,[GOTOFF(ebx,PW_F1414)] ; mm4=tmp11
385 ; To avoid overflow...
388 ; tmp12 = -2.613125930 * z10 + z5;
390 ; (This implementation)
391 ; tmp12 = (-1.613125930 - 1) * z10 + z5;
392 ; = -1.613125930 * z10 - z10 + z5;
396 pmulhw mm5,[GOTOFF(ebx,PW_F1847)] ; mm5=z5
397 pmulhw mm0,[GOTOFF(ebx,PW_MF1613)]
398 pmulhw mm2,[GOTOFF(ebx,PW_F1082)]
400 psubw mm2,mm5 ; mm2=tmp10
401 paddw mm0,mm5 ; mm0=tmp12
403 ; -- Final output stage
405 psubw mm0,mm3 ; mm0=tmp6
; Each result word is arithmetically shifted right by PASS1_BITS+3 and then
; packed to signed bytes with saturation (packsswb).
408 paddw mm6,mm3 ; mm6=data0=(00 10 20 30)
409 paddw mm7,mm0 ; mm7=data1=(01 11 21 31)
410 psraw mm6,(PASS1_BITS+3) ; descale
411 psraw mm7,(PASS1_BITS+3) ; descale
412 psubw mm1,mm3 ; mm1=data7=(07 17 27 37)
413 psubw mm5,mm0 ; mm5=data6=(06 16 26 36)
414 psraw mm1,(PASS1_BITS+3) ; descale
415 psraw mm5,(PASS1_BITS+3) ; descale
416 psubw mm4,mm0 ; mm4=tmp5
418 packsswb mm6,mm5 ; mm6=(00 10 20 30 06 16 26 36)
419 packsswb mm7,mm1 ; mm7=(01 11 21 31 07 17 27 37)
421 movq mm3, MMWORD [wk(0)] ; mm3=tmp2
422 movq mm0, MMWORD [wk(1)] ; mm0=tmp3
424 paddw mm2,mm4 ; mm2=tmp4
427 paddw mm3,mm4 ; mm3=data2=(02 12 22 32)
428 paddw mm0,mm2 ; mm0=data4=(04 14 24 34)
429 psraw mm3,(PASS1_BITS+3) ; descale
430 psraw mm0,(PASS1_BITS+3) ; descale
431 psubw mm5,mm4 ; mm5=data5=(05 15 25 35)
432 psubw mm1,mm2 ; mm1=data3=(03 13 23 33)
433 psraw mm5,(PASS1_BITS+3) ; descale
434 psraw mm1,(PASS1_BITS+3) ; descale
; NOTE(review): mm4 is loaded with the centering constant here but is
; overwritten by a register copy further down without any visible use;
; presumably the instructions that add it to the packed bytes are among the
; lines missing from this view -- confirm.
436 movq mm4,[GOTOFF(ebx,PB_CENTERJSAMP)] ; mm4=[PB_CENTERJSAMP]
438 packsswb mm3,mm0 ; mm3=(02 12 22 32 04 14 24 34)
439 packsswb mm1,mm5 ; mm1=(03 13 23 33 05 15 25 35)
; Byte/word/dword interleaves turn the eight packed column groups into four
; complete 8-byte output rows (mm6, mm7, mm5, mm1).
446 movq mm2,mm6 ; transpose coefficients(phase 1)
447 punpcklbw mm6,mm7 ; mm6=(00 01 10 11 20 21 30 31)
448 punpckhbw mm2,mm7 ; mm2=(06 07 16 17 26 27 36 37)
449 movq mm0,mm3 ; transpose coefficients(phase 1)
450 punpcklbw mm3,mm1 ; mm3=(02 03 12 13 22 23 32 33)
451 punpckhbw mm0,mm1 ; mm0=(04 05 14 15 24 25 34 35)
453 movq mm5,mm6 ; transpose coefficients(phase 2)
454 punpcklwd mm6,mm3 ; mm6=(00 01 02 03 10 11 12 13)
455 punpckhwd mm5,mm3 ; mm5=(20 21 22 23 30 31 32 33)
456 movq mm4,mm0 ; transpose coefficients(phase 2)
457 punpcklwd mm0,mm2 ; mm0=(04 05 06 07 14 15 16 17)
458 punpckhwd mm4,mm2 ; mm4=(24 25 26 27 34 35 36 37)
460 movq mm7,mm6 ; transpose coefficients(phase 3)
461 punpckldq mm6,mm0 ; mm6=(00 01 02 03 04 05 06 07)
462 punpckhdq mm7,mm0 ; mm7=(10 11 12 13 14 15 16 17)
463 movq mm1,mm5 ; transpose coefficients(phase 3)
464 punpckldq mm5,mm4 ; mm5=(20 21 22 23 24 25 26 27)
465 punpckhdq mm1,mm4 ; mm1=(30 31 32 33 34 35 36 37)
; ebx is borrowed as a second row pointer for the stores, so the GOT base it
; held is saved around them (pushpic/poppic are defined outside this view).
467 pushpic ebx ; save GOT address
; Fetch four row pointers from output_buf and write one 8-byte row to each
; at offset output_col.
469 mov edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
470 mov ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
471 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
472 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7
473 mov edx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
474 mov ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
475 movq MMWORD [edx+eax*SIZEOF_JSAMPLE], mm5
476 movq MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm1
478 poppic ebx ; restore GOT address
; Advance to the next four workspace columns and the next four output rows.
480 add esi, byte 4*SIZEOF_JCOEF ; wsptr
481 add edi, byte 4*SIZEOF_JSAMPROW
; Epilogue. emms clears the MMX state so that x87 floating-point code can
; run again after this routine.
485 emms ; empty MMX state
489 ; pop edx ; need not be preserved
490 ; pop ecx ; need not be preserved
; Unwind the aligned frame: point esp at the aligned frame base, then load
; esp from the word stored there (the original_ebp slot at [ebp+0]).
; NOTE(review): no restore of other saved registers and no ret instruction
; are visible in this view; presumably they are among the missing lines --
; confirm against the full file.
492 mov esp,ebp ; esp <- aligned ebp
493 pop esp ; esp <- original ebp
497 ; For some reason, the OS X linker does not honor the request to align the
498 ; segment unless we do this.