1 ; vim:filetype=nasm ts=8
3 ; libFLAC - Free Lossless Audio Codec library
4 ; Copyright (C) 2001,2002,2003,2004,2005,2006,2007,2008,2009 Josh Coalson
6 ; Redistribution and use in source and binary forms, with or without
7 ; modification, are permitted provided that the following conditions
10 ; - Redistributions of source code must retain the above copyright
11 ; notice, this list of conditions and the following disclaimer.
13 ; - Redistributions in binary form must reproduce the above copyright
14 ; notice, this list of conditions and the following disclaimer in the
15 ; documentation and/or other materials provided with the distribution.
17 ; - Neither the name of the Xiph.org Foundation nor the names of its
18 ; contributors may be used to endorse or promote products derived from
19 ; this software without specific prior written permission.
21 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
25 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 ; x87 FPU autocorrelation
38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 ; SSE autocorrelation, 4 lags in one xmm accumulator
39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 ; SSE autocorrelation, 8 lags in two xmm accumulators
40 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 ; SSE autocorrelation, 12 lags in three xmm accumulators
41 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow ; autocorrelation using the mm registers
42 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 ; residual, 32-bit integer (imul) version
43 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx ; residual, MMX version (16-bit data only, see its WATCHOUT note)
44 cglobal FLAC__lpc_restore_signal_asm_ia32 ; signal restore, 32-bit integer (imul) version
45 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx ; signal restore, MMX version (16-bit data only, see its WATCHOUT note)
49 ; **********************************************************************
51 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
54 ; unsigned sample, coeff;
55 ; const unsigned limit = data_len - lag;
57 ; FLAC__ASSERT(lag > 0);
58 ; FLAC__ASSERT(lag <= data_len);
60 ; for(coeff = 0; coeff < lag; coeff++)
62 ; for(sample = 0; sample <= limit; sample++) {
64 ; for(coeff = 0; coeff < lag; coeff++)
65 ; autoc[coeff] += d * data[sample+coeff];
67 ; for(; sample < data_len; sample++) {
69 ; for(coeff = 0; coeff < data_len - sample; coeff++)
70 ; autoc[coeff] += d * data[sample+coeff];
75 cident FLAC__lpc_compute_autocorrelation_asm_ia32
; x87 FPU implementation of the autocorrelation C reference shown in the
; comment block above this routine.
;
; Arguments as addressed by the code below: data at [esp + 16], data_len at
; [esp + 20], lag at [esp + 24], autoc at [esp + 28].
; NOTE(review): these offsets imply 12 bytes of saved registers above the
; return address; the pushes are not visible in this excerpt - confirm.
;
; The inner "for coeff" loops are fully unrolled as descending chains of
; multiply-accumulate taps. edx is computed as an address inside the chain so
; that only the required number of taps execute; the chain is entered at that
; address (see the "jump target" remark near the end of the second chain).
76 ;[esp + 28] == autoc[]
78 ;[esp + 20] == data_len
83 ;ASSERT(lag <= data_len)
90 ; for(coeff = 0; coeff < lag; coeff++)
92 mov edi, [esp + 28] ; edi == autoc
93 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write
97 ; const unsigned limit = data_len - lag;
98 mov eax, [esp + 24] ; eax == lag
100 sub ecx, eax ; ecx == limit
102 mov edi, [esp + 28] ; edi == autoc
103 mov esi, [esp + 16] ; esi == data
104 inc ecx ; we are looping <= limit so we add one to the counter
106 ; for(sample = 0; sample <= limit; sample++) {
108 ; for(coeff = 0; coeff < lag; coeff++)
109 ; autoc[coeff] += d * data[sample+coeff];
111 fld dword [esi] ; ST = d <- data[sample]
112 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
113 lea edx, [eax + eax*2] ; edx = 3*eax
115 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] ; edx = eax + 4*edx + (.jumper1_0 - .get_eip1); NOTE(review): edx is presumably negated in between (not visible here), giving -11*eax + offset
; The taps at index 0 address [esi]/[edi] without a displacement byte, so each
; of their three memory instructions is one byte shorter: 3 bytes in total.
120 inc edx ; compensate for the shorter opcode on the last iteration
121 inc edx ; compensate for the shorter opcode on the last iteration
122 inc edx ; compensate for the shorter opcode on the last iteration
; The taps at index 32 use displacement 32*4 = 128, which does not fit in a
; signed byte, so each of their three memory instructions carries a 4-byte
; displacement instead of 1 byte: 3 x 3 = 9 extra bytes.
125 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration
130 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
131 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
132 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
134 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
135 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
136 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
138 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
139 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
140 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
142 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
143 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
144 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
146 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
147 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
148 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
150 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
151 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
152 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
154 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
155 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
156 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
158 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
159 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
160 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
162 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
163 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
164 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
166 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
167 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
168 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
170 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
171 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
172 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
174 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
175 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
176 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
178 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
179 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
180 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
182 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
183 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
184 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
186 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
187 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
188 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
190 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
191 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
192 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
194 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
195 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
196 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
198 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
199 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
200 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
202 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
203 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
204 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
206 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
207 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
208 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
210 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
211 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
212 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
214 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
215 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
216 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
218 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
219 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
220 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
222 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
223 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
224 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
226 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
227 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
228 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
230 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
231 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
232 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
234 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
235 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
236 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
238 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
239 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
240 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
242 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
243 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
244 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
246 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
247 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
248 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
250 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
251 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
252 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
254 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
255 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
256 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
258 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
259 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
260 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
263 fstp st0 ; pop d, ST = empty
264 add esi, byte 4 ; sample++
267 fld dword [esi] ; ST = d <- data[sample]
271 ; for(; sample < data_len; sample++) {
273 ; for(coeff = 0; coeff < data_len - sample; coeff++)
274 ; autoc[coeff] += d * data[sample+coeff];
; Tail: the last lag - 1 samples, where fewer than lag taps remain in range.
; The chain below starts at index 31 because at most lag - 1 taps are needed.
276 mov ecx, [esp + 24] ; ecx <- lag
277 dec ecx ; ecx <- lag - 1
278 jz near .end ; skip loop if 0 (i.e. lag == 1)
280 fld dword [esi] ; ST = d <- data[sample]
281 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
282 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
283 lea edx, [eax + eax*2] ; edx = 3*eax
285 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] ; edx = eax + 4*edx + (.jumper2_0 - .get_eip2); NOTE(review): edx is presumably negated in between (not visible here)
; As in the first chain: the index-0 taps have no displacement byte (3 x 1 byte).
290 inc edx ; compensate for the shorter opcode on the last iteration
291 inc edx ; compensate for the shorter opcode on the last iteration
292 inc edx ; compensate for the shorter opcode on the last iteration
296 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
297 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
298 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
300 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
301 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
302 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
304 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
305 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
306 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
308 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
309 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
310 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
312 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
313 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
314 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
316 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
317 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
318 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
320 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
321 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
322 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
324 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
325 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
326 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
328 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
329 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
330 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
332 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
333 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
334 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
336 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
337 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
338 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
340 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
341 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
342 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
344 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
345 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
346 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
348 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
349 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
350 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
352 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
353 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
354 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
356 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
357 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
358 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
360 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
361 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
362 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
364 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
365 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
366 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
368 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
369 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
370 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
372 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
373 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
374 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
376 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
377 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
378 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
380 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
381 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
382 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
384 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
385 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
386 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
388 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
389 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
390 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
392 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
393 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
394 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
396 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
397 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
398 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
400 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
401 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
402 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
404 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
405 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
406 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
408 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
409 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
410 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
412 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
413 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
414 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
416 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
417 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
418 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
420 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
421 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
422 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
425 fstp st0 ; pop d, ST = empty
426 add esi, byte 4 ; sample++
; Moving the entry address forward by one 11-byte tap means one fewer tap runs
; for the next sample, matching the shrinking "data_len - sample" bound.
429 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target
430 fld dword [esi] ; ST = d <- data[sample]
441 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
; SSE autocorrelation that accumulates four lags at once.
; Register roles in the code below:
;   eax  = pointer to the current sample, starting at &data[0] ([esp + 4])
;   edx  = data_len ([esp + 8]) during the loops, then autoc ([esp + 16])
;   xmm0 = current sample broadcast to all four lanes
;   xmm2 = sliding window of the four most recent samples
;   xmm5 = the four running sums
; Arguments are read directly at [esp + 4] and up, i.e. no registers are
; pushed before these loads.
442 ;[esp + 16] == autoc[]
444 ;[esp + 8] == data_len
449 ;ASSERT(lag <= data_len)
451 ; for(coeff = 0; coeff < lag; coeff++)
452 ; autoc[coeff] = 0.0;
455 mov edx, [esp + 8] ; edx == data_len
456 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
458 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
460 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
461 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
462 .warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
463 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
464 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
469 ; start by reading the next sample
470 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
472 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
473 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
475 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
476 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
481 mov edx, [esp + 16] ; edx == autoc
488 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
; SSE autocorrelation that accumulates eight lags at once.
; Register roles in the code below:
;   eax       = pointer to the current sample, starting at &data[0] ([esp + 4])
;   edx       = data_len ([esp + 8]) during the loops, then autoc ([esp + 16])
;   xmm1:xmm0 = current sample broadcast to all eight lanes
;   xmm3:xmm2 = sliding window of the eight most recent samples
;   xmm6:xmm5 = the eight running sums
489 ;[esp + 16] == autoc[]
491 ;[esp + 8] == data_len
496 ;ASSERT(lag <= data_len)
498 ; for(coeff = 0; coeff < lag; coeff++)
499 ; autoc[coeff] = 0.0;
503 mov edx, [esp + 8] ; edx == data_len
504 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
506 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
508 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
509 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
510 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
511 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
512 .warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
514 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
516 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
521 ; start by reading the next sample
522 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
523 ; here we reorder the instructions; see the (#) indexes for a logical order
524 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
526 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
527 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
528 movss xmm3, xmm2 ; (5) low float of xmm3 <- low float of xmm2, i.e. the float that wrapped around in rotate (3)
529 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
530 movss xmm2, xmm0 ; (6) low float of xmm2 <- data[sample], the newest sample enters the window
531 mulps xmm1, xmm3 ; (8)
532 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
533 addps xmm6, xmm1 ; (10)
534 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
539 mov edx, [esp + 16] ; edx == autoc
541 movups [edx + 16], xmm6 ; autoc[4..7] <- xmm6 (unaligned store)
547 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
; SSE autocorrelation that accumulates twelve lags at once.
; Register roles in the code below:
;   eax            = pointer to the current sample, starting at &data[0] ([esp + 4])
;   edx            = data_len ([esp + 8]) during the loops, then autoc ([esp + 16])
;   xmm0           = current sample broadcast to all four lanes (reused for each third)
;   xmm4:xmm3:xmm2 = sliding window of the twelve most recent samples
;   xmm7:xmm6:xmm5 = the twelve running sums
548 ;[esp + 16] == autoc[]
550 ;[esp + 8] == data_len
555 ;ASSERT(lag <= data_len)
557 ; for(coeff = 0; coeff < lag; coeff++)
558 ; autoc[coeff] = 0.0;
563 mov edx, [esp + 8] ; edx == data_len
564 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
566 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
568 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
569 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
570 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
571 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
572 .warmup: ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
580 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
585 ; start by reading the next sample
586 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
588 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
590 ; shift xmm4:xmm3:xmm2 left by one float
591 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
592 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
593 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
598 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
612 mov edx, [esp + 16] ; edx == autoc
614 movups [edx + 16], xmm6 ; autoc[4..7] <- xmm6 (unaligned store)
615 movups [edx + 32], xmm7 ; autoc[8..11] <- xmm7 (unaligned store)
621 cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
; Autocorrelation variant working in the mm registers.
; Only fragments of this routine are visible in this excerpt; the notes below
; describe just what each visible instruction does.
; NOTE(review): the esp-relative buffer appears to hold the running sums, which
; are copied out through edi at the end - presumably edi == autoc; confirm
; against the full routine.
641 lea esp, [esp + 4 * eax] ; move esp by 4*eax bytes; NOTE(review): presumably reserves or releases the stack work buffer - confirm sign of eax
646 mov [esp + 4 * ecx], eax ; store eax into dword slot ecx of the esp-based buffer
654 lea ecx, [esi + 4 * eax - 12] ; ecx = address 3 dwords before esi + 4*eax
672 movd mm1, [eax + 4 * ebx] ; low half of mm1 <- dword at index ebx
673 movd mm3, [eax + 4 * ebx + 4] ; low half of mm3 <- dword at index ebx + 1
674 movd mm5, [eax + 4 * ebx + 8] ; low half of mm5 <- dword at index ebx + 2
675 movd mm7, [eax + 4 * ebx + 12] ; low half of mm7 <- dword at index ebx + 3
681 punpckldq mm7, [eax + 4 * ebx + 16] ; high half of mm7 <- dword at index ebx + 4
685 movq mm3, [esp + 4 * ebx] ; load two dwords from the esp-based buffer
689 movq [esp + 4 * ebx], mm3 ; write the pair back to the same slots
700 lea ecx, [esi + 4 * edi] ; ecx = esi + 4*edi
711 movd mm1, [esi + 4 * ebx] ; low half of mm1 <- dword at esi + 4*ebx
713 movd mm2, [esp + 4 * ebx] ; low half of mm2 <- dword slot ebx of the esp-based buffer
715 movd [esp + 4 * ebx], mm1 ; write low half of mm1 back to that slot
728 mov eax, [esp + 4 * edx] ; copy dword slot edx of the esp-based buffer ...
729 mov [edi + 4 * edx], eax ; ... to the same index relative to edi
741 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
743 ; for(i = 0; i < data_len; i++) {
745 ; for(j = 0; j < order; j++)
746 ; sum += qlp_coeff[j] * data[i-j-1];
747 ; residual[i] = data[i] - (sum >> lp_quantization);
751 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
; 32-bit integer implementation of the residual C reference shown in the
; comment block above this routine.
; Arguments as addressed by the code below: data at [esp + 20], data_len at
; [esp + 24], qlp_coeff at [esp + 28], order at [esp + 32], lp_quantization at
; [esp + 36], residual at [esp + 40].
; NOTE(review): these offsets imply 16 bytes of saved registers above the
; return address; the pushes are not visible in this excerpt - confirm.
752 ;[esp + 40] residual[]
753 ;[esp + 36] lp_quantization
755 ;[esp + 28] qlp_coeff[]
766 mov esi, [esp + 20] ; esi = data[]
767 mov edi, [esp + 40] ; edi = residual[]
768 mov eax, [esp + 32] ; eax = order
769 mov ebx, [esp + 24] ; ebx = data_len
772 jz near .end ; do nothing if data_len == 0
778 mov edx, [ecx] ; edx = qlp_coeff[0]
779 mov eax, [esi - 4] ; eax = data[-1]
780 mov cl, [esp + 36] ; cl = lp_quantization
797 cmp eax, byte 32 ; for order <= 32 there is a faster routine
800 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
813 imul eax, [esi + 4 * ecx] ; eax *= dword at esi + 4*ecx
816 jnz short .i_32more_loop_j
834 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] ; edx = 9*eax + (.jumper_0 - .get_eip0); NOTE(review): presumably 9 is the byte length of one unrolled tap and edx is the entry address into the chain below - confirm
840 mov eax, [esp + 28] ; eax = qlp_coeff[]
; Unrolled taps, oldest sample first: esi - 128 is 32 dwords before esi.
845 imul ecx, [esi - 128] ; ecx *= dword 32 slots before esi
848 imul ecx, [esi - 124] ; ecx *= dword 31 slots before esi
851 imul ecx, [esi - 120] ; ecx *= dword 30 slots before esi
854 imul ecx, [esi - 116] ; ecx *= dword 29 slots before esi
857 imul ecx, [esi - 112] ; ecx *= dword 28 slots before esi
860 imul ecx, [esi - 108] ; ecx *= dword 27 slots before esi
863 imul ecx, [esi - 104] ; ecx *= dword 26 slots before esi
866 imul ecx, [esi - 100] ; ecx *= dword 25 slots before esi
937 mov ecx, [eax] ; ecx = qlp_coeff[0]; no displacement byte here, so this instruction is one byte shorter than the [eax + N] forms
961 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
962 ; the channel and qlp_coeffs must be <= 16. Especially note that this routine
963 ; cannot be used for side-channel coded 16bps channels since the effective bps
966 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
; MMX implementation of the residual computation; see the WATCHOUT note above
; for its 16-bit restriction.
; Arguments as addressed by the code below: data at [esp + 20], data_len at
; [esp + 24], qlp_coeff at [esp + 28], order at [esp + 32], lp_quantization at
; [esp + 36], residual at [esp + 40].
; The coefficients are copied to the stack as 16-bit words (.copy_qlp_loop) and
; read back four at a time as packed qwords.
967 ;[esp + 40] residual[]
968 ;[esp + 36] lp_quantization
970 ;[esp + 28] qlp_coeff[]
981 mov esi, [esp + 20] ; esi = data[]
982 mov edi, [esp + 40] ; edi = residual[]
983 mov eax, [esp + 32] ; eax = order
984 mov ebx, [esp + 24] ; ebx = data_len
987 jz near .end ; do nothing if data_len == 0
992 mov edx, [esp + 28] ; edx = qlp_coeff[]
993 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
1000 push word [edx + 4 * ecx] ; push only the low 16 bits of the 32-bit qlp_coeff[ecx]
1003 jnz short .copy_qlp_loop
1016 movq mm5, [esp + 2 * eax - 8] ; mm5 = four packed words from the stack; NOTE(review): presumably the coefficients pushed above - confirm
1017 movd mm4, [esi - 16] ; low half of mm4 <- dword 4 slots before esi
1018 punpckldq mm4, [esi - 12] ; high half of mm4 <- dword 3 slots before esi
1020 punpckldq mm0, [esi - 4] ; high half of mm0 <- dword 1 slot before esi
1024 jnbe short .mmx_4more
1030 punpckldq mm1, [esi + 4] ; high half of mm1 <- dword 1 slot after esi
1068 punpckldq mm1, [esi + 4] ; high half of mm1 <- dword 1 slot after esi
1089 movd mm0, [ecx - 16] ; low half of mm0 <- dword 4 slots before ecx
1091 punpckldq mm0, [ecx - 12] ; high half of mm0 <- dword 3 slots before ecx
1092 punpckldq mm7, [ecx - 4] ; high half of mm7 <- dword 1 slot before ecx
1097 movd mm0, [ecx - 12] ; low half of mm0 <- dword 3 slots before ecx
1098 punpckldq mm0, [ecx - 8] ; high half of mm0 <- dword 2 slots before ecx
1099 punpckldq mm7, [ecx] ; high half of mm7 <- dword at ecx
1107 jnz .mmx_4more_loop_j
1123 jg near .mmx_4more_loop_i
1131 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin ; continue in the non-MMX routine at its .begin label; NOTE(review): presumably handles samples left over by the MMX loop - confirm
1140 ; **********************************************************************
1142 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1147 ; FLAC__ASSERT(order > 0);
1149 ; for(i = 0; i < data_len; i++) {
1151 ; for(j = 0; j < order; j++)
1152 ; sum += qlp_coeff[j] * data[i-j-1];
1153 ; data[i] = residual[i] + (sum >> lp_quantization);
1157 cident FLAC__lpc_restore_signal_asm_ia32
; 32-bit integer implementation of the restore-signal C reference shown in the
; comment block above this routine.
; Arguments as addressed by the code below: residual at [esp + 20], data_len at
; [esp + 24], qlp_coeff at [esp + 28], order at [esp + 32], lp_quantization at
; [esp + 36], data at [esp + 40].
; For order <= 32 the prediction sum is an unrolled chain of taps from
; qlp_coeff[31] down to qlp_coeff[0]; edx is computed as an address relative to
; .jumper_0 so that only the last "order" taps are executed.
1159 ;[esp + 36] lp_quantization
1161 ;[esp + 28] qlp_coeff[]
1162 ;[esp + 24] data_len
1163 ;[esp + 20] residual[]
1172 mov esi, [esp + 20] ; esi = residual[]
1173 mov edi, [esp + 40] ; edi = data[]
1174 mov eax, [esp + 32] ; eax = order
1175 mov ebx, [esp + 24] ; ebx = data_len
1178 jz near .end ; do nothing if data_len == 0
1202 cmp eax, byte 32 ; for order <= 32 there is a faster routine
1205 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1218 imul eax, [edi + 4 * ecx] ; eax *= dword at edi + 4*ecx
1221 jnz short .x87_32more_loop_j
1231 jnz .x87_32more_loop_i
; Each tap below is mov (3 bytes) + imul (4 bytes) + add (2 bytes) = 9 bytes
; when byte displacements are used, hence the eax*9 scaling.
1238 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] ; edx = 9*eax + (.jumper_0 - .get_eip0)
1243 inc edx ; compensate for the shorter opcode on the last iteration
1244 mov eax, [esp + 28] ; eax = qlp_coeff[]
1248 mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
1249 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
1250 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
1251 mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
1252 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
1253 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
1254 mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
1255 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
1256 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
1257 mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
1258 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
1259 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
1260 mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
1261 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
1262 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
1263 mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
1264 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
1265 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
1266 mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
1267 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
1268 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
1269 mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
1270 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
1271 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
1272 mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
1273 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
1274 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
1275 mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
1276 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
1277 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
1278 mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
1279 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
1280 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
1281 mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
1282 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
1283 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
1284 mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
1285 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
1286 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
1287 mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
1288 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
1289 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
1290 mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
1291 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
1292 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
1293 mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
1294 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
1295 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
1296 mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
1297 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
1298 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
1299 mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
1300 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
1301 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
1302 mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
1303 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
1304 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
1305 mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
1306 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
1307 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
1308 mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
1309 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
1310 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
1311 mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
1312 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
1313 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
1314 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
1315 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
1316 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
1317 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
1318 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
1319 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
1320 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
1321 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
1322 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
1323 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
1324 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
1325 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
1326 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
1327 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
1328 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
1329 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
1330 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
1331 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
1332 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
1333 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
1334 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
1335 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
1336 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
1337 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
1338 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
1339 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
1340 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
1341 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1342 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
1343 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
1347 sar ebp, cl ; ebp = (sum >> lp_quantization)
; NOTE(review): addressing residual[i] as [esi + edi] implies esi holds the byte
; offset (residual - data) at this point; that adjustment is not visible in
; this excerpt - confirm.
1348 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
1349 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
1364 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1365 ; the channel and qlp_coeffs must be <= 16. Especially note that this routine
1366 ; cannot be used for side-channel coded 16bps channels since the effective bps
1368 ; WATCHOUT: this routine requires that each data array have a buffer of up to
1369 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1370 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
1372 cident FLAC__lpc_restore_signal_asm_ia32_mmx
1374 ;[esp + 36] lp_quantization
1376 ;[esp + 28] qlp_coeff[]
1377 ;[esp + 24] data_len
1378 ;[esp + 20] residual[]
1393 jz near .end ; do nothing if data_len == 0
1395 jb near FLAC__lpc_restore_signal_asm_ia32.begin
1398 movd mm6, [esp + 36]
1405 push word [edx + 4 * ecx]
1408 jnz short .copy_qlp_loop
1421 movq mm5, [esp + 2 * eax - 8]
1422 movd mm4, [edi - 16]
1423 punpckldq mm4, [edi - 12]
1425 punpckldq mm0, [edi - 4]
1429 jnbe short .mmx_4more
1467 movd mm0, [ecx - 16]
1468 punpckldq mm0, [ecx - 12]
1470 punpckldq mm1, [ecx - 4]
1478 jnz .mmx_4more_loop_j
1495 jnz short .mmx_4more_loop_i