2 ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
4 ; Copyright (C) 2009-2011, 2014-2016, 2019, 2021, 2023, D. R. Commander.
5 ; Copyright (C) 2015, Matthieu Darbois.
6 ; Copyright (C) 2018, Matthias Räncker.
7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
9 ; Based on the x86 SIMD extension for IJG JPEG library
10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
13 ; This file should be assembled with NASM (Netwide Assembler); it
14 ; can *not* be assembled with Microsoft's MASM or any compatible
15 ; assembler (including Borland's Turbo Assembler).
16 ; NASM is available from http://nasm.sourceforge.net/ or
17 ; http://sourceforge.net/project/showfiles.php?group_id=6208
19 ; This file contains an SSE2 implementation for Huffman coding of one block.
20 ; The following code is based on jchuff.c; see jchuff.c for more details.
22 %include "jsimdext.inc"
25 .next_output_byte: resp 1 ; => next byte to write in buffer
26 .free_in_buffer: resp 1 ; # of byte spaces remaining in buffer
27 .cur.put_buffer.simd resq 1 ; current bit accumulation buffer
28 .cur.free_bits resd 1 ; # of bits available in it
29 .cur.last_dc_val resd 4 ; last DC coef for each component
30 .cinfo: resp 1 ; dump_buffer needs access to this
34 .ehufco: resd 256 ; code for each symbol
35 .ehufsi: resb 256 ; length of code for each symbol
36 ; If no code has been allocated for a symbol S, ehufsi[S] contains 0
39 ; --------------------------------------------------------------------------
43 GLOBAL_DATA(jconst_huff_encode_one_block)
45 EXTN(jconst_huff_encode_one_block):
47 jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
48 dd 0x000f, 0x001f, 0x003f, 0x007f
49 dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
50 dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
91 %define NBITS(x) nbits_base + x
92 %define MASK_BITS(x) NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
94 ; --------------------------------------------------------------------------
98 ; Shorthand used to describe SIMD operations:
99 ; wN: xmmN treated as eight signed 16-bit values
100 ; wN[i]: perform the same operation on all eight signed 16-bit values, i=0..7
101 ; bN: xmmN treated as 16 unsigned 8-bit values
102 ; bN[i]: perform the same operation on all 16 unsigned 8-bit values, i=0..15
103 ; Contents of SIMD registers are shown in memory order.
105 ; Fill the bit buffer to capacity with the leading bits from code, then output
106 ; the bit buffer and put the remaining bits from code into the bit buffer.
109 ; code - contains the bits to shift into the bit buffer (LSB-aligned)
110 ; %1 - the label to which to jump when the macro completes
111 ; %2 (optional) - extra instructions to execute after nbits has been set
113 ; Upon completion, free_bits will be set to the number of remaining bits from
114 ; code, and put_buffer will contain those remaining bits. temp and code will
117 ; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
120 %macro EMIT_QWORD 1-2
121 add nbitsb, free_bitsb ; nbits += free_bits;
122 neg free_bitsb ; free_bits = -free_bits;
123 mov tempd, code ; temp = code;
124 shl put_buffer, nbitsb ; put_buffer <<= nbits;
125 mov nbitsb, free_bitsb ; nbits = free_bits;
126 neg free_bitsb ; free_bits = -free_bits;
127 shr tempd, nbitsb ; temp >>= nbits;
128 or tempq, put_buffer ; temp |= put_buffer;
129 movq xmm0, tempq ; xmm0.u64 = { temp, 0 };
130 bswap tempq ; temp = htonl(temp);
131 mov put_buffer, codeq ; put_buffer = code;
132 pcmpeqb xmm0, xmm1 ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
134 pmovmskb code, xmm0 ; code = 0; code |= ((b0[i] >> 7) << i);
135 mov qword [buffer], tempq ; memcpy(buffer, &temp, 8);
136 ; (speculative; will be overwritten if
137 ; code contains any 0xFF bytes)
138 add free_bitsb, 64 ; free_bits += 64;
139 add bufferp, 8 ; buffer += 8;
140 test code, code ; if (code == 0) /* No 0xFF bytes */
142 ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
143 ; bytes in the qword.
144 cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
145 mov byte [buffer-7], 0 ; buffer[-7] = 0;
146 sbb bufferp, 6 ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
147 mov byte [buffer], temph ; buffer[0] = temp[1];
148 cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
149 mov byte [buffer+1], 0 ; buffer[1] = 0;
150 sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
151 shr tempq, 16 ; temp >>= 16;
152 mov byte [buffer], tempb ; buffer[0] = temp[0];
153 cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
154 mov byte [buffer+1], 0 ; buffer[1] = 0;
155 sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
156 mov byte [buffer], temph ; buffer[0] = temp[1];
157 cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
158 mov byte [buffer+1], 0 ; buffer[1] = 0;
159 sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
160 shr tempq, 16 ; temp >>= 16;
161 mov byte [buffer], tempb ; buffer[0] = temp[0];
162 cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
163 mov byte [buffer+1], 0 ; buffer[1] = 0;
164 sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
165 mov byte [buffer], temph ; buffer[0] = temp[1];
166 cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
167 mov byte [buffer+1], 0 ; buffer[1] = 0;
168 sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
169 shr tempd, 16 ; temp >>= 16;
170 mov byte [buffer], tempb ; buffer[0] = temp[0];
171 cmp tempb, 0xFF ; Set CF if temp[0] < 0xFF
172 mov byte [buffer+1], 0 ; buffer[1] = 0;
173 sbb bufferp, -2 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
174 mov byte [buffer], temph ; buffer[0] = temp[1];
175 cmp temph, 0xFF ; Set CF if temp[1] < 0xFF
176 mov byte [buffer+1], 0 ; buffer[1] = 0;
177 sbb bufferp, -2 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
182 ; Encode a single block's worth of coefficients.
185 ; jsimd_huff_encode_one_block_sse2(working_state *state, JOCTET *buffer,
186 ; JCOEFPTR block, int last_dc_val,
187 ; c_derived_tbl *dctbl, c_derived_tbl *actbl)
190 ; When shuffling data, we try to avoid pinsrw as much as possible, since it is
191 ; slow on many CPUs. Its reciprocal throughput (issue latency) is 1 even on
192 ; modern CPUs, so chains of pinsrw instructions (even with different outputs)
193 ; can limit performance. pinsrw is a VectorPath instruction on AMD K8 and
194 ; requires 2 µops (with memory operand) on Intel. In either case, only one
195 ; pinsrw instruction can be decoded per cycle (and nothing else if they are
196 ; back-to-back), so out-of-order execution cannot be used to work around long
197 ; pinsrw chains (though for Sandy Bridge and later, this may be less of a
198 ; problem if the code runs from the µop cache.)
200 ; We use tzcnt instead of bsf without checking for support. The instruction is
201 ; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
202 ; rep bsf.) The destination (first) operand of bsf (and tzcnt on some CPUs) is
203 ; an input dependency (although the behavior is not formally defined, Intel
204 ; CPUs usually leave the destination unmodified if the source is zero.) This
205 ; can prevent out-of-order execution, so we clear the destination before invoking tzcnt.
208 ; Initial register allocation
215 ; r8 - dctbl --> code_temp
220 ; r15 - block --> free_bits
237 %define nbits_base rsi
245 %define put_buffer r12
246 %define put_bufferd r12d
249 ; Step 1: Re-arrange input data according to jpeg_natural_order
250 ; xx 01 02 03 04 05 06 07 xx 01 08 16 09 02 03 10
251 ; 08 09 10 11 12 13 14 15 17 24 32 25 18 11 04 05
252 ; 16 17 18 19 20 21 22 23 12 19 26 33 40 48 41 34
253 ; 24 25 26 27 28 29 30 31 ==> 27 20 13 06 07 14 21 28
254 ; 32 33 34 35 36 37 38 39 35 42 49 56 57 50 43 36
255 ; 40 41 42 43 44 45 46 47 29 22 15 23 30 37 44 51
256 ; 48 49 50 51 52 53 54 55 58 59 52 45 38 31 39 46
257 ; 56 57 58 59 60 61 62 63 53 60 61 54 47 55 62 63
260 GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
262 EXTN(jsimd_huff_encode_one_block_sse2):
268 ; rcx = working_state *state
269 ; rdx = JOCTET *buffer
270 ; r8 = JCOEFPTR block
271 ; r9 = int last_dc_val
272 ; [rbp+48] = c_derived_tbl *dctbl
273 ; [rbp+56] = c_derived_tbl *actbl
279 movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
281 movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
285 movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
287 movsx code, word [block] ;Z: code = block[0];
288 pxor xmm4, xmm4 ;A: w4[i] = 0;
289 sub code, r9d ;Z: code -= last_dc_val;
290 mov dctbl, POINTER [rbp+48]
291 mov actbl, POINTER [rbp+56]
292 punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
293 lea nbits_base, [rel jpeg_nbits_table]
297 ; rdi = working_state *state
298 ; rsi = JOCTET *buffer
299 ; rdx = JCOEFPTR block
300 ; rcx = int last_dc_val
301 ; r8 = c_derived_tbl *dctbl
302 ; r9 = c_derived_tbl *actbl
307 movups xmm3, XMMWORD [block + 0 * SIZEOF_WORD] ;D: w3 = xx 01 02 03 04 05 06 07
309 movdqa xmm0, xmm3 ;A: w0 = xx 01 02 03 04 05 06 07
313 movups xmm1, XMMWORD [block + 8 * SIZEOF_WORD] ;B: w1 = 08 09 10 11 12 13 14 15
314 movsx codeq, word [block] ;Z: code = block[0];
315 lea nbits_base, [rel jpeg_nbits_table]
316 pxor xmm4, xmm4 ;A: w4[i] = 0;
317 sub codeq, rcx ;Z: code -= last_dc_val;
318 punpckldq xmm0, xmm1 ;A: w0 = xx 01 08 09 02 03 10 11
322 ; Allocate stack space for t array, and realign stack.
323 add rsp, -DCTSIZE2 * SIZEOF_WORD - 8
326 pshuflw xmm0, xmm0, 11001001b ;A: w0 = 01 08 xx 09 02 03 10 11
327 pinsrw xmm0, word [block + 16 * SIZEOF_WORD], 2 ;A: w0 = 01 08 16 09 02 03 10 11
328 punpckhdq xmm3, xmm1 ;D: w3 = 04 05 12 13 06 07 14 15
329 punpcklqdq xmm1, xmm3 ;B: w1 = 08 09 10 11 04 05 12 13
330 pinsrw xmm0, word [block + 17 * SIZEOF_WORD], 7 ;A: w0 = 01 08 16 09 02 03 10 17
331 ;A: (Row 0, offset 1)
332 pcmpgtw xmm4, xmm0 ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
333 paddw xmm0, xmm4 ;A: w0[i] += w4[i];
334 movaps XMMWORD [t + 0 * SIZEOF_WORD], xmm0 ;A: t[i] = w0[i];
336 movq xmm2, qword [block + 24 * SIZEOF_WORD] ;B: w2 = 24 25 26 27 -- -- -- --
337 pshuflw xmm2, xmm2, 11011000b ;B: w2 = 24 26 25 27 -- -- -- --
338 pslldq xmm1, 1 * SIZEOF_WORD ;B: w1 = -- 08 09 10 11 04 05 12
339 movups xmm5, XMMWORD [block + 48 * SIZEOF_WORD] ;H: w5 = 48 49 50 51 52 53 54 55
340 movsd xmm1, xmm2 ;B: w1 = 24 26 25 27 11 04 05 12
341 punpcklqdq xmm2, xmm5 ;C: w2 = 24 26 25 27 48 49 50 51
342 pinsrw xmm1, word [block + 32 * SIZEOF_WORD], 1 ;B: w1 = 24 32 25 27 11 04 05 12
343 pxor xmm4, xmm4 ;A: w4[i] = 0;
344 psrldq xmm3, 2 * SIZEOF_WORD ;D: w3 = 12 13 06 07 14 15 -- --
345 pcmpeqw xmm0, xmm4 ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
346 pinsrw xmm1, word [block + 18 * SIZEOF_WORD], 3 ;B: w1 = 24 32 25 18 11 04 05 12
348 pcmpgtw xmm4, xmm1 ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
349 paddw xmm1, xmm4 ;B: w1[i] += w4[i];
350 movaps XMMWORD [t + 8 * SIZEOF_WORD], xmm1 ;B: t[i+8] = w1[i];
351 pxor xmm4, xmm4 ;B: w4[i] = 0;
352 pcmpeqw xmm1, xmm4 ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
354 packsswb xmm0, xmm1 ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
355 ; w/ signed saturation
357 pinsrw xmm3, word [block + 20 * SIZEOF_WORD], 0 ;D: w3 = 20 13 06 07 14 15 -- --
358 pinsrw xmm3, word [block + 21 * SIZEOF_WORD], 5 ;D: w3 = 20 13 06 07 14 21 -- --
359 pinsrw xmm3, word [block + 28 * SIZEOF_WORD], 6 ;D: w3 = 20 13 06 07 14 21 28 --
360 pinsrw xmm3, word [block + 35 * SIZEOF_WORD], 7 ;D: w3 = 20 13 06 07 14 21 28 35
362 pcmpgtw xmm4, xmm3 ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
363 paddw xmm3, xmm4 ;D: w3[i] += w4[i];
364 movaps XMMWORD [t + 24 * SIZEOF_WORD], xmm3 ;D: t[i+24] = w3[i];
365 pxor xmm4, xmm4 ;D: w4[i] = 0;
366 pcmpeqw xmm3, xmm4 ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
368 pinsrw xmm2, word [block + 19 * SIZEOF_WORD], 0 ;C: w2 = 19 26 25 27 48 49 50 51
369 cmp code, 1 << 31 ;Z: Set CF if code < 0x80000000,
370 ;Z: i.e. if code is positive
371 pinsrw xmm2, word [block + 33 * SIZEOF_WORD], 2 ;C: w2 = 19 26 33 27 48 49 50 51
372 pinsrw xmm2, word [block + 40 * SIZEOF_WORD], 3 ;C: w2 = 19 26 33 40 48 49 50 51
373 adc code, -1 ;Z: code += -1 + (code >= 0 ? 1 : 0);
374 pinsrw xmm2, word [block + 41 * SIZEOF_WORD], 5 ;C: w2 = 19 26 33 40 48 41 50 51
375 pinsrw xmm2, word [block + 34 * SIZEOF_WORD], 6 ;C: w2 = 19 26 33 40 48 41 34 51
376 movsxd codeq, code ;Z: sign extend code
377 pinsrw xmm2, word [block + 27 * SIZEOF_WORD], 7 ;C: w2 = 19 26 33 40 48 41 34 27
379 pcmpgtw xmm4, xmm2 ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
380 paddw xmm2, xmm4 ;C: w2[i] += w4[i];
381 movaps XMMWORD [t + 16 * SIZEOF_WORD], xmm2 ;C: t[i+16] = w2[i];
382 pxor xmm4, xmm4 ;C: w4[i] = 0;
383 pcmpeqw xmm2, xmm4 ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
385 packsswb xmm2, xmm3 ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
386 ; w/ signed saturation
388 movzx nbitsq, byte [NBITS(codeq)] ;Z: nbits = JPEG_NBITS(code);
389 movdqa xmm3, xmm5 ;H: w3 = 48 49 50 51 52 53 54 55
390 pmovmskb tempd, xmm2 ;Z: temp = 0; temp |= ((b2[i] >> 7) << i);
391 pmovmskb put_bufferd, xmm0 ;Z: put_buffer = 0; put_buffer |= ((b0[i] >> 7) << i);
392 movups xmm0, XMMWORD [block + 56 * SIZEOF_WORD] ;H: w0 = 56 57 58 59 60 61 62 63
393 punpckhdq xmm3, xmm0 ;H: w3 = 52 53 60 61 54 55 62 63
394 shl tempd, 16 ;Z: temp <<= 16;
395 psrldq xmm3, 1 * SIZEOF_WORD ;H: w3 = 53 60 61 54 55 62 63 --
396 pxor xmm2, xmm2 ;H: w2[i] = 0;
397 or put_bufferd, tempd ;Z: put_buffer |= temp;
398 pshuflw xmm3, xmm3, 00111001b ;H: w3 = 60 61 54 53 55 62 63 --
399 movq xmm1, qword [block + 44 * SIZEOF_WORD] ;G: w1 = 44 45 46 47 -- -- -- --
400 unpcklps xmm5, xmm0 ;E: w5 = 48 49 56 57 50 51 58 59
401 pxor xmm0, xmm0 ;H: w0[i] = 0;
402 pinsrw xmm3, word [block + 47 * SIZEOF_WORD], 3 ;H: w3 = 60 61 54 47 55 62 63 --
404 pcmpgtw xmm2, xmm3 ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
405 paddw xmm3, xmm2 ;H: w3[i] += w2[i];
406 movaps XMMWORD [t + 56 * SIZEOF_WORD], xmm3 ;H: t[i+56] = w3[i];
407 movq xmm4, qword [block + 36 * SIZEOF_WORD] ;G: w4 = 36 37 38 39 -- -- -- --
408 pcmpeqw xmm3, xmm0 ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
409 punpckldq xmm4, xmm1 ;G: w4 = 36 37 44 45 38 39 46 47
410 mov tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
411 ;Z: temp = dctbl->ehufco[nbits];
412 movdqa xmm1, xmm4 ;F: w1 = 36 37 44 45 38 39 46 47
413 psrldq xmm4, 1 * SIZEOF_WORD ;G: w4 = 37 44 45 38 39 46 47 --
414 shufpd xmm1, xmm5, 10b ;F: w1 = 36 37 44 45 50 51 58 59
415 and code, dword [MASK_BITS(nbitsq)] ;Z: code &= (1 << nbits) - 1;
416 pshufhw xmm4, xmm4, 11010011b ;G: w4 = 37 44 45 38 -- 39 46 --
417 pslldq xmm1, 1 * SIZEOF_WORD ;F: w1 = -- 36 37 44 45 50 51 58
418 shl tempq, nbitsb ;Z: temp <<= nbits;
419 pinsrw xmm4, word [block + 59 * SIZEOF_WORD], 0 ;G: w4 = 59 44 45 38 -- 39 46 --
420 pshufd xmm1, xmm1, 11011000b ;F: w1 = -- 36 45 50 37 44 51 58
421 pinsrw xmm4, word [block + 52 * SIZEOF_WORD], 1 ;G: w4 = 59 52 45 38 -- 39 46 --
422 or code, tempd ;Z: code |= temp;
423 movlps xmm1, qword [block + 20 * SIZEOF_WORD] ;F: w1 = 20 21 22 23 37 44 51 58
424 pinsrw xmm4, word [block + 31 * SIZEOF_WORD], 4 ;G: w4 = 59 52 45 38 31 39 46 --
425 pshuflw xmm1, xmm1, 01110010b ;F: w1 = 22 20 23 21 37 44 51 58
426 pinsrw xmm4, word [block + 53 * SIZEOF_WORD], 7 ;G: w4 = 59 52 45 38 31 39 46 53
428 pxor xmm2, xmm2 ;G: w2[i] = 0;
429 pcmpgtw xmm0, xmm4 ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
430 pinsrw xmm1, word [block + 15 * SIZEOF_WORD], 1 ;F: w1 = 22 15 23 21 37 44 51 58
431 paddw xmm4, xmm0 ;G: w4[i] += w0[i];
432 movaps XMMWORD [t + 48 * SIZEOF_WORD], xmm4 ;G: t[48+i] = w4[i];
433 pinsrw xmm1, word [block + 30 * SIZEOF_WORD], 3 ;F: w1 = 22 15 23 30 37 44 51 58
435 pcmpeqw xmm4, xmm2 ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
436 pinsrw xmm5, word [block + 42 * SIZEOF_WORD], 0 ;E: w5 = 42 49 56 57 50 51 58 59
438 packsswb xmm4, xmm3 ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
439 ; w/ signed saturation
441 pxor xmm0, xmm0 ;F: w0[i] = 0;
442 pinsrw xmm5, word [block + 43 * SIZEOF_WORD], 5 ;E: w5 = 42 49 56 57 50 43 58 59
443 pcmpgtw xmm2, xmm1 ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
444 pmovmskb tempd, xmm4 ;Z: temp = 0; temp |= ((b4[i] >> 7) << i);
445 pinsrw xmm5, word [block + 36 * SIZEOF_WORD], 6 ;E: w5 = 42 49 56 57 50 43 36 59
446 paddw xmm1, xmm2 ;F: w1[i] += w2[i];
447 movaps XMMWORD [t + 40 * SIZEOF_WORD], xmm1 ;F: t[40+i] = w1[i];
448 pinsrw xmm5, word [block + 29 * SIZEOF_WORD], 7 ;E: w5 = 42 49 56 57 50 43 36 29
451 %define free_bitsq r15
452 %define free_bitsd r15d
453 %define free_bitsb r15b
454 pcmpeqw xmm1, xmm0 ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
455 shl tempq, 48 ;Z: temp <<= 48;
456 pxor xmm2, xmm2 ;E: w2[i] = 0;
457 pcmpgtw xmm0, xmm5 ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
458 paddw xmm5, xmm0 ;E: w5[i] += w0[i];
459 or tempq, put_buffer ;Z: temp |= put_buffer;
460 movaps XMMWORD [t + 32 * SIZEOF_WORD], xmm5 ;E: t[32+i] = w5[i];
461 lea t, [dword t - 2] ;Z: t = &t[-1];
462 pcmpeqw xmm5, xmm2 ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
464 packsswb xmm5, xmm1 ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
465 ; w/ signed saturation
467 add nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
468 ;Z: nbits += dctbl->ehufsi[nbits];
470 %define code_temp r8d
471 pmovmskb indexd, xmm5 ;Z: index = 0; index |= ((b5[i] >> 7) << i);
472 mov free_bitsd, [state+working_state.cur.free_bits]
473 ;Z: free_bits = state->cur.free_bits;
474 pcmpeqw xmm1, xmm1 ;Z: b1[i] = 0xFF;
475 shl index, 32 ;Z: index <<= 32;
476 mov put_buffer, [state+working_state.cur.put_buffer.simd]
477 ;Z: put_buffer = state->cur.put_buffer.simd;
478 or index, tempq ;Z: index |= temp;
479 not index ;Z: index = ~index;
480 sub free_bitsb, nbitsb ;Z: if ((free_bits -= nbits) >= 0)
481 jnl .ENTRY_SKIP_EMIT_CODE ;Z: goto .ENTRY_SKIP_EMIT_CODE;
483 .EMIT_CODE: ;Z: .EMIT_CODE:
484 EMIT_QWORD .BLOOP_COND ;Z: insert code, flush buffer, goto .BLOOP_COND
486 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
490 lea code_temp, [nbitsq - 16] ; code_temp = nbits - 16;
491 movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
492 ; nbits = actbl->ehufsi[0xf0];
493 mov code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
494 ; code = actbl->ehufco[0xf0];
495 sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
496 jle .EMIT_BRLOOP_CODE ; goto .EMIT_BRLOOP_CODE;
497 shl put_buffer, nbitsb ; put_buffer <<= nbits;
498 mov nbits, code_temp ; nbits = code_temp;
499 or put_buffer, codeq ; put_buffer |= code;
500 cmp nbits, 16 ; if (nbits <= 16)
502 jmp .BRLOOP ; } while (1);
504 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
508 .ENTRY_SKIP_EMIT_CODE: ; .ENTRY_SKIP_EMIT_CODE:
509 shl put_buffer, nbitsb ; put_buffer <<= nbits;
510 or put_buffer, codeq ; put_buffer |= code;
511 .BLOOP_COND: ; .BLOOP_COND:
512 test index, index ; if (index != 0)
515 xor nbits, nbits ; nbits = 0; /* kill tzcnt input dependency */
516 tzcnt nbitsq, index ; nbits = # of trailing 0 bits in index
518 lea t, [t + nbitsq * 2] ; t = &t[nbits];
519 shr index, nbitsb ; index >>= nbits;
520 .EMIT_BRLOOP_CODE_END: ; .EMIT_BRLOOP_CODE_END:
521 cmp nbits, 16 ; if (nbits > 16)
522 jg .BRLOOP ; goto .BRLOOP;
524 movsx codeq, word [t] ; code = *t;
525 lea tempd, [nbitsq * 2] ; temp = nbits * 2;
526 movzx nbits, byte [NBITS(codeq)] ; nbits = JPEG_NBITS(code);
527 lea tempd, [nbitsq + tempq * 8] ; temp = temp * 8 + nbits;
528 mov code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
529 ; code_temp = actbl->ehufco[temp-16];
530 shl code_temp, nbitsb ; code_temp <<= nbits;
531 and code, dword [MASK_BITS(nbitsq)] ; code &= (1 << nbits) - 1;
532 add nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
533 ; free_bits -= actbl->ehufsi[temp-16];
534 or code, code_temp ; code |= code_temp;
535 sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
536 jle .EMIT_CODE ; goto .EMIT_CODE;
537 shl put_buffer, nbitsb ; put_buffer <<= nbits;
538 or put_buffer, codeq ; put_buffer |= code;
540 jnz .BLOOP ; } while (index != 0);
541 .ELOOP: ; } /* index != 0 */
542 sub td, esp ; t -= &t_[0];
543 cmp td, (DCTSIZE2 - 2) * SIZEOF_WORD ; if (t != 62)
545 movzx nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
546 ; nbits = actbl->ehufsi[0];
547 mov code, [actbl + c_derived_tbl.ehufco + 0] ; code = actbl->ehufco[0];
548 sub free_bitsb, nbitsb ; if ((free_bits -= nbits) <= 0)
549 jg .EFN_SKIP_EMIT_CODE ; {
550 EMIT_QWORD .EFN ; insert code, flush buffer
552 .EFN_SKIP_EMIT_CODE: ; } else {
553 shl put_buffer, nbitsb ; put_buffer <<= nbits;
554 or put_buffer, codeq ; put_buffer |= code;
556 mov [state + working_state.cur.put_buffer.simd], put_buffer
557 ; state->cur.put_buffer.simd = put_buffer;
558 mov byte [state + working_state.cur.free_bits], free_bitsb
559 ; state->cur.free_bits = free_bits;
560 sub rsp, -DCTSIZE2 * SIZEOF_WORD - 8
573 ; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
577 EMIT_QWORD .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
578 ; insert code, flush buffer,
579 ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
581 ; For some reason, the OS X linker does not honor the request to align the
582 ; segment unless we do this.