; jcphuff-sse2.asm - prepare data for progressive Huffman encoding
;
; Copyright (C) 2016, 2018, Matthieu Darbois
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains an SSE2 implementation of data preparation for progressive
; Huffman encoding. See jcphuff.c for more details.
20 %include "jsimdext.inc"
22 ; --------------------------------------------------------------------------
26 ; --------------------------------------------------------------------------
27 ; Macros to load data for jsimd_encode_mcu_AC_first_prepare_sse2() and
28 ; jsimd_encode_mcu_AC_refine_prepare_sse2()
34 mov T0d, INT [LUT + 0*SIZEOF_INT]
35 mov T1d, INT [LUT + 8*SIZEOF_INT]
36 pinsrw X0, word [BLOCK + T0 * 2], 0
37 pinsrw X1, word [BLOCK + T1 * 2], 0
39 mov T0d, INT [LUT + 1*SIZEOF_INT]
40 mov T1d, INT [LUT + 9*SIZEOF_INT]
41 pinsrw X0, word [BLOCK + T0 * 2], 1
42 pinsrw X1, word [BLOCK + T1 * 2], 1
44 mov T0d, INT [LUT + 2*SIZEOF_INT]
45 mov T1d, INT [LUT + 10*SIZEOF_INT]
46 pinsrw X0, word [BLOCK + T0 * 2], 2
47 pinsrw X1, word [BLOCK + T1 * 2], 2
49 mov T0d, INT [LUT + 3*SIZEOF_INT]
50 mov T1d, INT [LUT + 11*SIZEOF_INT]
51 pinsrw X0, word [BLOCK + T0 * 2], 3
52 pinsrw X1, word [BLOCK + T1 * 2], 3
54 mov T0d, INT [LUT + 4*SIZEOF_INT]
55 mov T1d, INT [LUT + 12*SIZEOF_INT]
56 pinsrw X0, word [BLOCK + T0 * 2], 4
57 pinsrw X1, word [BLOCK + T1 * 2], 4
59 mov T0d, INT [LUT + 5*SIZEOF_INT]
60 mov T1d, INT [LUT + 13*SIZEOF_INT]
61 pinsrw X0, word [BLOCK + T0 * 2], 5
62 pinsrw X1, word [BLOCK + T1 * 2], 5
64 mov T0d, INT [LUT + 6*SIZEOF_INT]
65 mov T1d, INT [LUT + 14*SIZEOF_INT]
66 pinsrw X0, word [BLOCK + T0 * 2], 6
67 pinsrw X1, word [BLOCK + T1 * 2], 6
69 mov T0d, INT [LUT + 7*SIZEOF_INT]
70 mov T1d, INT [LUT + 15*SIZEOF_INT]
71 pinsrw X0, word [BLOCK + T0 * 2], 7
72 pinsrw X1, word [BLOCK + T1 * 2], 7
80 mov T0d, INT [LUT + 0*SIZEOF_INT]
81 mov T1d, INT [LUT + 8*SIZEOF_INT]
82 pinsrw X0, word [BLOCK + T0 * 2], 0
83 pinsrw X1, word [BLOCK + T1 * 2], 0
85 mov T0d, INT [LUT + 1*SIZEOF_INT]
86 pinsrw X0, word [BLOCK + T0 * 2], 1
88 mov T0d, INT [LUT + 2*SIZEOF_INT]
89 pinsrw X0, word [BLOCK + T0 * 2], 2
91 mov T0d, INT [LUT + 3*SIZEOF_INT]
92 pinsrw X0, word [BLOCK + T0 * 2], 3
94 mov T0d, INT [LUT + 4*SIZEOF_INT]
95 pinsrw X0, word [BLOCK + T0 * 2], 4
97 mov T0d, INT [LUT + 5*SIZEOF_INT]
98 pinsrw X0, word [BLOCK + T0 * 2], 5
100 mov T0d, INT [LUT + 6*SIZEOF_INT]
101 pinsrw X0, word [BLOCK + T0 * 2], 6
103 mov T0d, INT [LUT + 7*SIZEOF_INT]
104 pinsrw X0, word [BLOCK + T0 * 2], 7
108 mov T1d, INT [LUT + 9*SIZEOF_INT]
109 pinsrw X1, word [BLOCK + T1 * 2], 1
113 mov T1d, INT [LUT + 10*SIZEOF_INT]
114 pinsrw X1, word [BLOCK + T1 * 2], 2
118 mov T1d, INT [LUT + 11*SIZEOF_INT]
119 pinsrw X1, word [BLOCK + T1 * 2], 3
123 mov T1d, INT [LUT + 12*SIZEOF_INT]
124 pinsrw X1, word [BLOCK + T1 * 2], 4
128 mov T1d, INT [LUT + 13*SIZEOF_INT]
129 pinsrw X1, word [BLOCK + T1 * 2], 5
133 mov T1d, INT [LUT + 14*SIZEOF_INT]
134 pinsrw X1, word [BLOCK + T1 * 2], 6
141 mov T0d, INT [LUT + 0*SIZEOF_INT]
142 pinsrw X0, word [BLOCK + T0 * 2], 0
144 mov T0d, INT [LUT + 1*SIZEOF_INT]
145 pinsrw X0, word [BLOCK + T0 * 2], 1
147 mov T0d, INT [LUT + 2*SIZEOF_INT]
148 pinsrw X0, word [BLOCK + T0 * 2], 2
150 mov T0d, INT [LUT + 3*SIZEOF_INT]
151 pinsrw X0, word [BLOCK + T0 * 2], 3
153 mov T0d, INT [LUT + 4*SIZEOF_INT]
154 pinsrw X0, word [BLOCK + T0 * 2], 4
156 mov T0d, INT [LUT + 5*SIZEOF_INT]
157 pinsrw X0, word [BLOCK + T0 * 2], 5
159 mov T0d, INT [LUT + 6*SIZEOF_INT]
160 pinsrw X0, word [BLOCK + T0 * 2], 6
162 mov T0d, INT [LUT + 7*SIZEOF_INT]
163 pinsrw X0, word [BLOCK + T0 * 2], 7
170 mov T1d, INT [LUT + 0*SIZEOF_INT]
171 pinsrw X0, word [BLOCK + T1 * 2], 0
175 mov T1d, INT [LUT + 1*SIZEOF_INT]
176 pinsrw X0, word [BLOCK + T1 * 2], 1
180 mov T1d, INT [LUT + 2*SIZEOF_INT]
181 pinsrw X0, word [BLOCK + T1 * 2], 2
185 mov T1d, INT [LUT + 3*SIZEOF_INT]
186 pinsrw X0, word [BLOCK + T1 * 2], 3
190 mov T1d, INT [LUT + 4*SIZEOF_INT]
191 pinsrw X0, word [BLOCK + T1 * 2], 4
195 mov T1d, INT [LUT + 5*SIZEOF_INT]
196 pinsrw X0, word [BLOCK + T1 * 2], 5
200 mov T1d, INT [LUT + 6*SIZEOF_INT]
201 pinsrw X0, word [BLOCK + T1 * 2], 6
206 movdqa xmm0, XMMWORD [VALUES + ( 0*2)]
207 movdqa xmm1, XMMWORD [VALUES + ( 8*2)]
208 movdqa xmm2, XMMWORD [VALUES + (16*2)]
209 movdqa xmm3, XMMWORD [VALUES + (24*2)]
210 movdqa xmm4, XMMWORD [VALUES + (32*2)]
211 movdqa xmm5, XMMWORD [VALUES + (40*2)]
212 movdqa xmm6, XMMWORD [VALUES + (48*2)]
213 movdqa xmm7, XMMWORD [VALUES + (56*2)]
244 mov MMWORD [r15], rax
; Prepare data for jsimd_encode_mcu_AC_first().
;
; jsimd_encode_mcu_AC_first_prepare_sse2(const JCOEF *block,
;                                        const int *jpeg_natural_order_start,
;                                        int Sl, int Al, JCOEF *values,
;                                        size_t *zerobits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r14 = JCOEF *values
; r15 = size_t *zerobits
281 GLOBAL_FUNCTION(jsimd_encode_mcu_AC_first_prepare_sse2)
283 EXTN(jsimd_encode_mcu_AC_first_prepare_sse2):
285 mov rax, rsp ; rax = original rbp
287 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
289 mov rbp, rsp ; rbp = aligned rbp
293 movdqa XMMWORD [rbp - 16], ZERO
315 movdqa XMMWORD [VALUES + (0) * 2], X0
316 movdqa XMMWORD [VALUES + (8) * 2], X1
317 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
318 movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
320 add LUT, 16*SIZEOF_INT
342 movdqa XMMWORD [VALUES + (0) * 2], X0
343 movdqa XMMWORD [VALUES + (8) * 2], X1
344 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
345 movdqa XMMWORD [VALUES + (8 + DCTSIZE2) * 2], N1
355 movdqa XMMWORD [VALUES + (0) * 2], X0
356 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
366 movdqa XMMWORD [VALUES + (0) * 2], X0
367 movdqa XMMWORD [VALUES + (0 + DCTSIZE2) * 2], N0
378 movdqa XMMWORD [VALUES + 0], ZERO
383 sub VALUES, DCTSIZE2*2
387 movdqa ZERO, XMMWORD [rbp - 16]
389 mov rsp, rbp ; rsp <- aligned rbp
390 pop rsp ; rsp <- original rbp
; Prepare data for jsimd_encode_mcu_AC_refine().
;
; jsimd_encode_mcu_AC_refine_prepare_sse2(const JCOEF *block,
;                                         const int *jpeg_natural_order_start,
;                                         int Sl, int Al, JCOEF *absvalues,
;                                         size_t *bits)
;
; r10 = const JCOEF *block
; r11 = const int *jpeg_natural_order_start
; r14 = JCOEF *absvalues
449 GLOBAL_FUNCTION(jsimd_encode_mcu_AC_refine_prepare_sse2)
451 EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
453 mov rax, rsp ; rax = original rbp
455 and rsp, byte (-SIZEOF_XMMWORD) ; align to 128 bits
457 mov rbp, rsp ; rbp = aligned rbp
461 movdqa XMMWORD [rbp - 16], ZERO
486 movdqa XMMWORD [VALUES + (0) * 2], X0
487 movdqa XMMWORD [VALUES + (8) * 2], X1
492 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
493 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
494 shr SIGN, 16 ; make room for sizebits
497 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
498 jz .CONTINUER16 ; if (idx) {
500 add EOB, T1d ; EOB = k + idx;
503 add LUT, 16*SIZEOF_INT
524 movdqa XMMWORD [VALUES + (0) * 2], X0
525 movdqa XMMWORD [VALUES + (8) * 2], X1
530 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
531 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
532 shr SIGN, 16 ; make room for sizebits
535 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
536 jz .CONTINUER15 ; if (idx) {
538 add EOB, T1d ; EOB = k + idx;
549 movdqa XMMWORD [VALUES + (0) * 2], X0
553 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
554 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
555 shr SIGN, 8 ; make room for sizebits
558 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
559 jz .CONTINUER8 ; if (idx) {
561 add EOB, T1d ; EOB = k + idx;
572 movdqa XMMWORD [VALUES + (0) * 2], X0
576 pmovmskb T0d, N0 ; lsignbits.val16u[k>>4] = _mm_movemask_epi8(neg);
577 pmovmskb T1d, X0 ; idx = _mm_movemask_epi8(x1);
578 shr SIGN, 8 ; make room for sizebits
581 bsr T1d, T1d ; idx = 16 - (__builtin_clz(idx)>>1);
582 jz .CONTINUER7 ; if (idx) {
584 add EOB, T1d ; EOB = k + idx;
596 movdqa XMMWORD [VALUES + 0], ZERO
603 sub VALUES, DCTSIZE2*2
604 mov MMWORD [r15+SIZEOF_MMWORD], SIGN
609 movdqa ZERO, XMMWORD [rbp - 16]
611 mov rsp, rbp ; rsp <- aligned rbp
612 pop rsp ; rsp <- original rbp
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.