2 ; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright (C) 2009, 2016, D. R. Commander.
6 ; Copyright (C) 2018, Matthias Räncker.
8 ; Based on the x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
12 ; This file should be assembled with NASM (Netwide Assembler),
13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
18 %include "jsimdext.inc"
21 ; --------------------------------------------------------------------------
25 ; Load data into workspace, applying unsigned->signed conversion
28 ; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
29 ; DCTELEM *workspace);
32 ; r10 = JSAMPARRAY sample_data
33 ; r11d = JDIMENSION start_col
34 ; r12 = DCTELEM *workspace
37 GLOBAL_FUNCTION(jsimd_convsamp_sse2)
39 EXTN(jsimd_convsamp_sse2):
; Loads one 8x8 block of 8-bit samples from sample_data[start_col],
; widens each sample to a 16-bit DCTELEM, and (per the header comment
; above) re-biases it from unsigned to signed before storing the block
; into *workspace.
;
; NOTE(review): this excerpt is incomplete — the prologue that marshals
; the ABI arguments (r10 -> rsi, r11d -> eax, r12 -> rdi, presumably),
; the pcmpeqw that must preset xmm7 to all 1's before the psllw below,
; the paddw xmm*, xmm7 bias step after the unpacks, and the row-loop
; counter/branch/ret are not visible here. Confirm against the full
; file before modifying any instruction.
45 pxor xmm6, xmm6 ; xmm6=(all 0's)
; All-1's words shifted left by 7 give 0xFF80 = -128 per word: the
; unsigned->signed centering bias (-CENTERJSAMPLE). Assumes xmm7 was
; preloaded with all 1's (pcmpeqw, not visible in this excerpt).
47 psllw xmm7, 7 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
; Fetch pointers to the first pair of sample rows (JSAMPROW entries).
54 mov rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; (JSAMPLE *)
55 mov rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; (JSAMPLE *)
; Load 8 bytes (one row of the 8x8 block) from each row, indexed by
; rax — presumably start_col; verify against the missing prologue.
57 movq xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm0=(01234567)
58 movq xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm1=(89ABCDEF)
; Same again for the next pair of rows.
60 mov rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW] ; (JSAMPLE *)
61 mov rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW] ; (JSAMPLE *)
63 movq xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE] ; xmm2=(GHIJKLMN)
64 movq xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE] ; xmm3=(OPQRSTUV)
; Interleave with zero to widen the low 8 bytes of each register into
; 8 zero-extended 16-bit words.
66 punpcklbw xmm0, xmm6 ; xmm0=(01234567)
67 punpcklbw xmm1, xmm6 ; xmm1=(89ABCDEF)
70 punpcklbw xmm2, xmm6 ; xmm2=(GHIJKLMN)
71 punpcklbw xmm3, xmm6 ; xmm3=(OPQRSTUV)
; Store four rows of 16-bit coefficients into the workspace. movdqa
; requires 16-byte alignment, so the workspace must be 16-byte aligned.
75 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
76 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
77 movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
78 movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
; Advance past the four rows just processed; the loop back-edge that
; repeats this body for the remaining rows is not visible here.
80 add rsi, byte 4*SIZEOF_JSAMPROW
81 add rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
90 ; --------------------------------------------------------------------------
92 ; Quantize/descale the coefficients, and store into coef_block
94 ; This implementation is based on an algorithm described in
95 ; "How to optimize for the Pentium family of microprocessors"
96 ; (http://www.agner.org/assem/).
99 ; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
100 ; DCTELEM *workspace);
; The divisors table passed to jsimd_quantize_sse2 is laid out as three
; consecutive DCTSIZE2-element DCTELEM sub-tables (offsets DCTSIZE*0,
; DCTSIZE*1, DCTSIZE*2 rows below). Each macro addresses row (m),
; column (n) of its sub-table relative to base register (b).
103 %define RECIPROCAL(m, n, b) \
104 XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
; Pre-computed rounding/correction terms added before the multiply.
105 %define CORRECTION(m, n, b) \
106 XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
; Post-multiply scale factors.
107 %define SCALE(m, n, b) \
108 XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
110 ; r10 = JCOEFPTR coef_block
111 ; r11 = DCTELEM *divisors
112 ; r12 = DCTELEM *workspace
115 GLOBAL_FUNCTION(jsimd_quantize_sse2)
117 EXTN(jsimd_quantize_sse2):
; Quantizes workspace coefficients into coef_block using reciprocal
; multiplication (division-free): for each 16-bit coefficient,
; |coef| is bias-corrected, then multiplied twice with pmulhuw (each
; pmulhuw keeps the high 16 bits of an unsigned 16x16 multiply), and
; the original sign is restored afterwards.
;
; NOTE(review): this excerpt is incomplete — the prologue that moves
; the ABI args (r10/r11/r12) into rdi/rdx/rsi, the movdqa loads of
; xmm0..xmm3 from the workspace, the pxor steps that take and later
; restore the absolute value around the psubw/paddw below, and the
; loop counter/branch/ret are not visible. Confirm against the full
; file before modifying any instruction.
;
; Duplicate loads of the four workspace rows, used only to derive
; per-word sign masks (xmm0..xmm3 presumably hold the same rows).
127 movdqa xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
128 movdqa xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
129 movdqa xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
130 movdqa xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
; Arithmetic right shift by WORD_BIT-1 (15, assuming WORD_BIT is 16 —
; defined in jsimdext.inc; confirm) turns each word into 0x0000 if
; non-negative or 0xFFFF if negative: a per-word sign mask.
135 psraw xmm4, (WORD_BIT-1)
136 psraw xmm5, (WORD_BIT-1)
137 psraw xmm6, (WORD_BIT-1)
138 psraw xmm7, (WORD_BIT-1)
; Two's-complement absolute value: with xmm0..xmm3 already XORed with
; the sign masks (pxor step not visible in this excerpt), subtracting
; the mask completes the negation, leaving xmm0..xmm3 = |row|.
143 psubw xmm0, xmm4 ; if (xmm0 < 0) xmm0 = -xmm0;
144 psubw xmm1, xmm5 ; if (xmm1 < 0) xmm1 = -xmm1;
145 psubw xmm2, xmm6 ; if (xmm2 < 0) xmm2 = -xmm2;
146 psubw xmm3, xmm7 ; if (xmm3 < 0) xmm3 = -xmm3;
; Add the pre-computed correction/rounding term for each coefficient.
148 paddw xmm0, XMMWORD [CORRECTION(0,0,rdx)] ; correction + roundfactor
149 paddw xmm1, XMMWORD [CORRECTION(1,0,rdx)]
150 paddw xmm2, XMMWORD [CORRECTION(2,0,rdx)]
151 paddw xmm3, XMMWORD [CORRECTION(3,0,rdx)]
; High 16 bits of unsigned multiply by the per-coefficient reciprocal.
152 pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)] ; reciprocal
153 pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
154 pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
155 pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
; Second high-half multiply applies the scale factor, completing the
; effective right shift of the full-precision product.
156 pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)] ; scale
157 pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
158 pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
159 pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]
; Sign restoration (pxor/psubw with the saved masks) occurs between the
; multiplies and these stores in the full file — not visible here.
; Store four rows of quantized coefficients (16-byte-aligned output).
169 movdqa XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
170 movdqa XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
171 movdqa XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
172 movdqa XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
; Advance workspace (rsi), divisors (rdx), and output (rdi) pointers
; past the 32 elements just processed; the loop back-edge that covers
; the rest of the 8x8 block is not visible in this excerpt.
174 add rsi, byte 32*SIZEOF_DCTELEM
175 add rdx, byte 32*SIZEOF_DCTELEM
176 add rdi, byte 32*SIZEOF_JCOEF
184 ; For some reason, the OS X linker does not honor the request to align the
185 ; segment unless we do this.