update latest
[external/libjpeg-turbo.git] / simd / jcqnts2f-64.asm
1 ;
2 ; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
3 ;
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright 2009 D. R. Commander
6 ;
7 ; Based on
8 ; x86 SIMD extension for IJG JPEG library
9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
11 ;
12 ; This file should be assembled with NASM (Netwide Assembler); it
13 ; can *not* be assembled with Microsoft's MASM or any compatible
14 ; assembler (including Borland's Turbo Assembler).
15 ; NASM is available from http://nasm.sourceforge.net/ or
16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
17 ;
18 ; [TAB8]
19
20 %include "jsimdext.inc"
21 %include "jdct.inc"
22
23 ; --------------------------------------------------------------------------
24         SECTION SEG_TEXT
25         BITS    64
26 ;
27 ; Load data into workspace, applying unsigned->signed conversion
28 ;
29 ; GLOBAL(void)
30 ; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
31 ;                            FAST_FLOAT * workspace);
32 ;
33
34 ; r10 = JSAMPARRAY sample_data
35 ; r11 = JDIMENSION start_col
36 ; r12 = FAST_FLOAT * workspace
37
38         align   16
39         global  EXTN(jsimd_convsamp_float_sse2)
40
EXTN(jsimd_convsamp_float_sse2):
        ; Converts DCTSIZE rows of 8 one-byte samples into signed,
        ; level-shifted FAST_FLOAT values and stores them in the workspace.
        ; Two rows are handled per loop iteration (DCTSIZE/2 iterations).
        ;
        ; In (after collect_args):
        ;   r10  = sample_data  (array of row pointers)
        ;   r11d = start_col    (32-bit JDIMENSION; the upper half of r11
        ;                        is NOT guaranteed to be zero by the ABI)
        ;   r12  = workspace    (written with movaps, so it must be
        ;                        16-byte aligned)
        ; Loop registers:
        ;   rsi = cursor into the row-pointer array
        ;   rdi = output cursor in the workspace
        ;   rax = zero-extended column offset
        ;   rcx = remaining row pairs
        ;   rbx, rdx = the two sample rows of the current pair
        ; rbx is saved and restored here.  collect_args / uncollect_args are
        ; macros from jsimdext.inc; they are assumed to handle any other
        ; ABI-specific register preservation -- confirm there.

        push    rbp
        mov     rax, rsp                ; presumably consumed by collect_args
                                        ;  -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args
        push    rbx

        ; Build the level-shift constant without touching memory:
        ; 0xFFFF -> (<<7) 0xFF80 -> (signed pack to bytes) 0x80 in every byte.
        pcmpeqw  xmm7, xmm7
        psllw    xmm7, 7
        packsswb xmm7, xmm7             ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     rsi, r10                ; rsi = sample_data
        mov     eax, r11d               ; rax = (uint64) start_col; a 32-bit
                                        ;  move zero-extends, discarding the
                                        ;  undefined upper half of r11
        mov     rdi, r12                ; rdi = workspace
        mov     rcx, DCTSIZE/2          ; rcx = row pairs left
.convloop:
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *) even row
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *) odd row

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

        ; unsigned -> signed: subtract 0x80 from every byte
        psubb   xmm0, xmm7                      ; xmm0=(01234567)
        psubb   xmm1, xmm7                      ; xmm1=(89ABCDEF)

        ; Widen bytes to dwords with the sample in the TOP byte of each
        ; dword ('*' = don't-care bits).  The stale contents of xmm2/xmm3
        ; only land in the low word and are shifted out below.
        punpcklbw xmm0, xmm0                    ; xmm0=(*0*1*2*3*4*5*6*7)
        punpcklbw xmm1, xmm1                    ; xmm1=(*8*9*A*B*C*D*E*F)

        punpcklwd xmm2, xmm0                    ; xmm2=(***0***1***2***3)
        punpckhwd xmm0, xmm0                    ; xmm0=(***4***5***6***7)
        punpcklwd xmm3, xmm1                    ; xmm3=(***8***9***A***B)
        punpckhwd xmm1, xmm1                    ; xmm1=(***C***D***E***F)

        ; The arithmetic right shift drops the don't-care bits and
        ; sign-extends each sample; cvtdq2ps then converts int32 -> float.
        psrad     xmm2, (DWORD_BIT-BYTE_BIT)    ; xmm2=(0123)
        psrad     xmm0, (DWORD_BIT-BYTE_BIT)    ; xmm0=(4567)
        cvtdq2ps  xmm2, xmm2                    ; xmm2=(0123)
        cvtdq2ps  xmm0, xmm0                    ; xmm0=(4567)
        psrad     xmm3, (DWORD_BIT-BYTE_BIT)    ; xmm3=(89AB)
        psrad     xmm1, (DWORD_BIT-BYTE_BIT)    ; xmm1=(CDEF)
        cvtdq2ps  xmm3, xmm3                    ; xmm3=(89AB)
        cvtdq2ps  xmm1, xmm1                    ; xmm1=(CDEF)

        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

        add     rsi, byte 2*SIZEOF_JSAMPROW             ; next pair of rows
        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; two output rows on
        dec     rcx
        jnz     short .convloop

        pop     rbx
        uncollect_args
        pop     rbp
        ret
97
98
99 ; --------------------------------------------------------------------------
100 ;
101 ; Quantize/descale the coefficients, and store into coef_block
102 ;
103 ; GLOBAL(void)
104 ; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
105 ;                         FAST_FLOAT * workspace);
106 ;
107
108 ; r10 = JCOEFPTR coef_block
109 ; r11 = FAST_FLOAT * divisors
110 ; r12 = FAST_FLOAT * workspace
111
112         align   16
113         global  EXTN(jsimd_quantize_float_sse2)
114
EXTN(jsimd_quantize_float_sse2):
        ; Scales every workspace value by the matching entry of the
        ; divisors table (a multiply, so the table presumably holds
        ; reciprocals -- confirm against the caller), converts the
        ; products to 32-bit integers using the current MXCSR rounding
        ; mode, and packs them with signed saturation into 16-bit
        ; coefficients in coef_block.
        ;
        ; In (after collect_args):
        ;   r10 = coef_block, r11 = divisors, r12 = workspace
        ; Loop registers:
        ;   rdi = output cursor, rdx = divisor cursor, rsi = input cursor,
        ;   rax = remaining groups of 16 values
        ; All three buffers are accessed with aligned moves / aligned
        ; memory operands, so each must be 16-byte aligned.

        push    rbp
        mov     rax, rsp                ; presumably consumed by collect_args
                                        ;  -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args

        mov     rdi, r10                ; rdi = coef_block
        mov     rdx, r11                ; rdx = divisors
        mov     rsi, r12                ; rsi = workspace
        mov     eax, DCTSIZE2/16        ; rax = iterations (16 values each)
.quantloop:
        ; fetch 16 floats from the workspace
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]

        ; scale by the per-coefficient factors
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

        ; float -> int32, then narrow pairs of vectors to int16
        cvtps2dq xmm0, xmm0
        cvtps2dq xmm1, xmm1
        packssdw xmm0, xmm1             ; xmm0 = first 8 coefficients
        cvtps2dq xmm2, xmm2
        cvtps2dq xmm3, xmm3
        packssdw xmm2, xmm3             ; xmm2 = next 8 coefficients

        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

        ; advance all three cursors by 16 elements
        add     rdi, byte 16*SIZEOF_JCOEF
        add     rdx, byte 16*SIZEOF_FAST_FLOAT
        add     rsi, byte 16*SIZEOF_FAST_FLOAT
        dec     rax
        jnz     short .quantloop

        uncollect_args
        pop     rbp
        ret
155
156 ; For some reason, the OS X linker does not honor the request to align the
157 ; segment unless we do this.
158         align   16