1 ; libFLAC - Free Lossless Audio Codec library
2 ; Copyright (C) 2001,2002,2003,2004,2005 Josh Coalson
4 ; Redistribution and use in source and binary forms, with or without
5 ; modification, are permitted provided that the following conditions
8 ; - Redistributions of source code must retain the above copyright
9 ; notice, this list of conditions and the following disclaimer.
11 ; - Redistributions in binary form must reproduce the above copyright
12 ; notice, this list of conditions and the following disclaimer in the
13 ; documentation and/or other materials provided with the distribution.
15 ; - Neither the name of the Xiph.org Foundation nor the names of its
16 ; contributors may be used to endorse or promote products derived from
17 ; this software without specific prior written permission.
19 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
23 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
27 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
28 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
29 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
; One cglobal declaration per routine that is introduced with cident further
; down in this file (cglobal/cident are defined outside this view).
35 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
36 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
37 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
38 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
39 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
40 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
41 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
42 cglobal FLAC__lpc_restore_signal_asm_ia32
43 cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
47 ; **********************************************************************
49 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
52 ; unsigned sample, coeff;
53 ; const unsigned limit = data_len - lag;
55 ; FLAC__ASSERT(lag > 0);
56 ; FLAC__ASSERT(lag <= data_len);
58 ; for(coeff = 0; coeff < lag; coeff++)
60 ; for(sample = 0; sample <= limit; sample++) {
62 ; for(coeff = 0; coeff < lag; coeff++)
63 ; autoc[coeff] += d * data[sample+coeff];
65 ; for(; sample < data_len; sample++) {
67 ; for(coeff = 0; coeff < data_len - sample; coeff++)
68 ; autoc[coeff] += d * data[sample+coeff];
73 cident FLAC__lpc_compute_autocorrelation_asm_ia32
; x87 (fld/fmul/fadd/fstp) version of the autocorrelation loops sketched in the
; C reference comment above.  Arguments are read from the stack at:
;   [esp + 16] data, [esp + 20] data_len, [esp + 24] lag, [esp + 28] autoc
; edx is built as an address relative to the unrolled fmul/fadd/fstp ladders
; (see the "jump target" note near the end), so the encoded size of every
; ladder entry matters; that is what the WATCHOUT remarks are about.
74 ;[esp + 28] == autoc[]
76 ;[esp + 20] == data_len
81 ;ASSERT(lag <= data_len)
88 ; for(coeff = 0; coeff < lag; coeff++)
90 mov edi, [esp + 28] ; edi == autoc
91 mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write
95 ; const unsigned limit = data_len - lag;
96 mov eax, [esp + 24] ; eax == lag
98 sub ecx, eax ; ecx == limit
100 mov edi, [esp + 28] ; edi == autoc
101 mov esi, [esp + 16] ; esi == data
102 inc ecx ; we are looping <= limit so we add one to the counter
104 ; for(sample = 0; sample <= limit; sample++) {
106 ; for(coeff = 0; coeff < lag; coeff++)
107 ; autoc[coeff] += d * data[sample+coeff];
109 fld dword [esi] ; ST = d <- data[sample]
110 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
111 lea edx, [eax + eax*2] ; edx = 3*eax
113 lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
; The three displacement-free instructions of the autoc[0] group at the bottom
; of the ladder are one byte shorter each, hence three single-byte corrections.
118 inc edx ; compensate for the shorter opcode on the last iteration
119 inc edx ; compensate for the shorter opcode on the last iteration
120 inc edx ; compensate for the shorter opcode on the last iteration
123 sub edx, byte 9 ; compensate for the longer opcodes on the first iteration (the +32 group needs 32-bit displacements: 3 instructions x 3 extra bytes)
128 fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here!
129 fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here!
130 fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here!
132 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
133 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
134 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
136 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
137 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
138 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
140 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
141 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
142 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
144 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
145 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
146 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
148 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
149 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
150 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
152 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
153 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
154 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
156 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
157 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
158 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
160 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
161 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
162 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
164 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
165 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
166 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
168 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
169 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
170 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
172 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
173 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
174 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
176 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
177 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
178 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
180 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
181 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
182 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
184 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
185 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
186 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
188 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
189 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
190 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
192 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
193 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
194 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
196 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
197 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
198 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
200 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
201 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
202 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
204 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
205 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
206 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
208 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
209 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
210 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
212 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
213 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
214 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
216 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
217 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
218 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
220 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
221 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
222 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
224 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
225 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
226 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
228 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
229 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
230 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
232 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
233 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
234 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
236 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
237 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
238 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
240 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
241 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
242 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
244 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
245 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
246 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
248 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
249 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
250 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
252 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
253 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
254 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
256 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
257 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
258 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
261 fstp st0 ; pop d, ST = empty
262 add esi, byte 4 ; sample++
265 fld dword [esi] ; ST = d <- data[sample]
269 ; for(; sample < data_len; sample++) {
271 ; for(coeff = 0; coeff < data_len - sample; coeff++)
272 ; autoc[coeff] += d * data[sample+coeff];
274 mov ecx, [esp + 24] ; ecx <- lag
275 dec ecx ; ecx <- lag - 1
276 jz near .end ; skip loop if 0 (i.e. lag == 1)
278 fld dword [esi] ; ST = d <- data[sample]
279 mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through
280 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
281 lea edx, [eax + eax*2] ; edx = 3*eax
283 lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
; Same three single-byte corrections as in the first ladder; no 9-byte
; correction is applied here because this ladder starts at index 31.
288 inc edx ; compensate for the shorter opcode on the last iteration
289 inc edx ; compensate for the shorter opcode on the last iteration
290 inc edx ; compensate for the shorter opcode on the last iteration
294 fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d
295 fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d
296 fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d
298 fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d
299 fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d
300 fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d
302 fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d
303 fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d
304 fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d
306 fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d
307 fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d
308 fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d
310 fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d
311 fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d
312 fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d
314 fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d
315 fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d
316 fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d
318 fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d
319 fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d
320 fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d
322 fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d
323 fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d
324 fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d
326 fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d
327 fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d
328 fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d
330 fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d
331 fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d
332 fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d
334 fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d
335 fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d
336 fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d
338 fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d
339 fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d
340 fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d
342 fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d
343 fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d
344 fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d
346 fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d
347 fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d
348 fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d
350 fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d
351 fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d
352 fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d
354 fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d
355 fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d
356 fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d
358 fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d
359 fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d
360 fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d
362 fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d
363 fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d
364 fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d
366 fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d
367 fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d
368 fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d
370 fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d
371 fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d
372 fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d
374 fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d
375 fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d
376 fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d
378 fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d
379 fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d
380 fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d
382 fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d
383 fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d
384 fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d
386 fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d
387 fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d
388 fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d
390 fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d
391 fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d
392 fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d
394 fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d
395 fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d
396 fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d
398 fmul dword [esi + ( 5*4)] ; ST = d*data[sample+5] d
399 fadd dword [edi + ( 5*4)] ; ST = autoc[5]+d*data[sample+5] d
400 fstp dword [edi + ( 5*4)] ; autoc[5]+=d*data[sample+5] ST = d
402 fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d
403 fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d
404 fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d
406 fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d
407 fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d
408 fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d
410 fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d
411 fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d
412 fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d
414 fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d
415 fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d
416 fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d
418 fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here!
419 fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here!
420 fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here!
423 fstp st0 ; pop d, ST = empty
424 add esi, byte 4 ; sample++
427 add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target (11 bytes == one fmul/fadd/fstp group fewer next time)
428 fld dword [esi] ; ST = d <- data[sample]
439 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
; SSE variant: products are accumulated in xmm5 (four packed floats) while
; xmm2 holds a sliding window of the most recent samples and xmm0 holds the
; current sample broadcast to all four lanes.
; Stack arguments read below: [esp + 4] data, [esp + 8] data_len, [esp + 16] autoc.
440 ;[esp + 16] == autoc[]
442 ;[esp + 8] == data_len
447 ;ASSERT(lag <= data_len)
449 ; for(coeff = 0; coeff < lag; coeff++)
450 ; autoc[coeff] = 0.0;
453 mov edx, [esp + 8] ; edx == data_len
454 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
456 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
458 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
459 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
460 .warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
461 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
462 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
467 ; start by reading the next sample
468 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
470 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
471 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
473 mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2
474 addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2
479 mov edx, [esp + 16] ; edx == autoc
486 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
; SSE variant with an eight-float window: xmm3:xmm2 hold the most recent
; samples, xmm1:xmm0 hold the broadcast current sample, and the products are
; accumulated in xmm6:xmm5.
; Stack arguments read below: [esp + 4] data, [esp + 8] data_len, [esp + 16] autoc.
487 ;[esp + 16] == autoc[]
489 ;[esp + 8] == data_len
494 ;ASSERT(lag <= data_len)
496 ; for(coeff = 0; coeff < lag; coeff++)
497 ; autoc[coeff] = 0.0;
501 mov edx, [esp + 8] ; edx == data_len
502 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
504 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
506 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
507 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
508 movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
509 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
510 .warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
512 mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
514 addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
519 ; start by reading the next sample
520 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
521 ; here we reorder the instructions; see the (#) indexes for a logical order
522 shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float
524 shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float
525 shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample]
526 movss xmm3, xmm2 ; (5) carry the float rotated out of xmm2 into the low lane of xmm3
527 movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample]
528 movss xmm2, xmm0 ; (6) insert the new sample into the low lane of xmm2
529 mulps xmm1, xmm3 ; (8)
530 mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
531 addps xmm6, xmm1 ; (10)
532 addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
537 mov edx, [esp + 16] ; edx == autoc
539 movups [edx + 16], xmm6 ; store the upper accumulator to autoc + 16 bytes
545 cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
; SSE variant with a twelve-float window: xmm4:xmm3:xmm2 hold the most recent
; samples, xmm0 holds the broadcast current sample, and the products are
; accumulated in xmm7:xmm6:xmm5.
; Stack arguments read below: [esp + 4] data, [esp + 8] data_len, [esp + 16] autoc.
546 ;[esp + 16] == autoc[]
548 ;[esp + 8] == data_len
553 ;ASSERT(lag <= data_len)
555 ; for(coeff = 0; coeff < lag; coeff++)
556 ; autoc[coeff] = 0.0;
561 mov edx, [esp + 8] ; edx == data_len
562 mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
564 movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
566 movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
567 shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
568 xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
569 xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
570 .warmup: ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
578 addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
583 ; start by reading the next sample
584 movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
586 shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
588 ; shift xmm4:xmm3:xmm2 left by one float
589 shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
590 shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
591 shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
596 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
610 mov edx, [esp + 16] ; edx == autoc
612 movups [edx + 16], xmm6
613 movups [edx + 32], xmm7
619 cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
; Variant working in the mm0-mm7 registers.  Only the memory-operand lines of
; this routine are visible here, so the notes below are limited to what those
; lines show.  NOTE(review): esp is re-pointed with lea and then used as the
; base of a dword-indexed scratch area -- confirm sizing/alignment against the
; full routine.
639 lea esp, [esp + 4 * eax] ; move esp by eax dwords
644 mov [esp + 4 * ecx], eax ; write eax into scratch slot ecx
652 lea ecx, [esi + 4 * eax - 12]
670 movd mm1, [eax + 4 * ebx]
671 movd mm3, [eax + 4 * ebx + 4]
672 movd mm5, [eax + 4 * ebx + 8]
673 movd mm7, [eax + 4 * ebx + 12]
679 punpckldq mm7, [eax + 4 * ebx + 16] ; high half of mm7 <- dword at [eax + 4*ebx + 16]
683 movq mm3, [esp + 4 * ebx] ; load two scratch dwords
687 movq [esp + 4 * ebx], mm3 ; store them back to the same scratch slots
698 lea ecx, [esi + 4 * edi]
709 movd mm1, [esi + 4 * ebx]
711 movd mm2, [esp + 4 * ebx]
713 movd [esp + 4 * ebx], mm1
726 mov eax, [esp + 4 * edx] ; copy scratch dword edx ...
727 mov [edi + 4 * edx], eax ; ... to [edi + 4*edx]
739 ;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
741 ; for(i = 0; i < data_len; i++) {
743 ; for(j = 0; j < order; j++)
744 ; sum += qlp_coeff[j] * data[i-j-1];
745 ; residual[i] = data[i] - (sum >> lp_quantization);
749 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
; Integer (mov/imul) version of the residual loop in the C reference comment
; above.  Stack arguments read below:
;   [esp + 20] data, [esp + 24] data_len, [esp + 28] qlp_coeff,
;   [esp + 32] order, [esp + 36] lp_quantization, [esp + 40] residual
750 ;[esp + 40] residual[]
751 ;[esp + 36] lp_quantization
753 ;[esp + 28] qlp_coeff[]
764 mov esi, [esp + 20] ; esi = data[]
765 mov edi, [esp + 40] ; edi = residual[]
766 mov eax, [esp + 32] ; eax = order
767 mov ebx, [esp + 24] ; ebx = data_len
770 jz near .end ; do nothing if data_len == 0
776 mov edx, [ecx] ; edx = qlp_coeff[0]
777 mov eax, [esi - 4] ; eax = data[-1]
778 mov cl, [esp + 36] ; cl = lp_quantization
795 cmp eax, byte 32 ; for order <= 32 there is a faster routine
798 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
811 imul eax, [esi + 4 * ecx]
814 jnz short .i_32more_loop_j
832 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] ; edx = eax*9 + (.jumper_0 - .get_eip0)
838 mov eax, [esp + 28] ; eax = qlp_coeff[]
843 imul ecx, [esi - 128]
846 imul ecx, [esi - 124]
849 imul ecx, [esi - 120]
852 imul ecx, [esi - 116]
855 imul ecx, [esi - 112]
858 imul ecx, [esi - 108]
861 imul ecx, [esi - 104]
864 imul ecx, [esi - 100]
935 mov ecx, [eax] ; load from [eax] with no displacement: this encoding is one byte shorter than the displaced loads
959 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
960 ; the channel must be <= 16. Especially note that this routine cannot be used
961 ; for side-channel coded 16bps channels since the effective bps is 17.
963 cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
; MMX version of the residual computation (see the 16-bit WATCHOUT above).
; Stack arguments read below:
;   [esp + 20] data, [esp + 24] data_len, [esp + 28] qlp_coeff,
;   [esp + 32] order, [esp + 36] lp_quantization, [esp + 40] residual
964 ;[esp + 40] residual[]
965 ;[esp + 36] lp_quantization
967 ;[esp + 28] qlp_coeff[]
978 mov esi, [esp + 20] ; esi = data[]
979 mov edi, [esp + 40] ; edi = residual[]
980 mov eax, [esp + 32] ; eax = order
981 mov ebx, [esp + 24] ; ebx = data_len
984 jz near .end ; do nothing if data_len == 0
989 mov edx, [esp + 28] ; edx = qlp_coeff[]
990 movd mm6, [esp + 36] ; mm6 = 0:lp_quantization
997 push word [edx + 4 * ecx] ; push the low 16 bits of qlp_coeff[ecx] onto the stack
1000 jnz short .copy_qlp_loop
1013 movq mm5, [esp + 2 * eax - 8] ; load four 16-bit values from the stack copy
1014 movd mm4, [esi - 16]
1015 punpckldq mm4, [esi - 12]
1017 punpckldq mm0, [esi - 4]
1021 jnbe short .mmx_4more
1027 punpckldq mm1, [esi + 4]
1065 punpckldq mm1, [esi + 4]
1086 movd mm0, [ecx - 16]
1088 punpckldq mm0, [ecx - 12]
1089 punpckldq mm7, [ecx - 4]
1094 movd mm0, [ecx - 12]
1095 punpckldq mm0, [ecx - 8]
1096 punpckldq mm7, [ecx]
1104 jnz .mmx_4more_loop_j
1120 jg near .mmx_4more_loop_i
1128 jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32.begin ; continue in the non-MMX routine at its .begin label
1137 ; **********************************************************************
1139 ; void FLAC__lpc_restore_signal(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1144 ; FLAC__ASSERT(order > 0);
1146 ; for(i = 0; i < data_len; i++) {
1148 ; for(j = 0; j < order; j++)
1149 ; sum += qlp_coeff[j] * data[i-j-1];
1150 ; data[i] = residual[i] + (sum >> lp_quantization);
1154 cident FLAC__lpc_restore_signal_asm_ia32
; Integer (mov/imul/add) version of the restore loop in the C reference
; comment above.  Stack arguments read below:
;   [esp + 20] residual, [esp + 24] data_len, [esp + 28] qlp_coeff,
;   [esp + 32] order, [esp + 36] lp_quantization, [esp + 40] data
1156 ;[esp + 36] lp_quantization
1158 ;[esp + 28] qlp_coeff[]
1159 ;[esp + 24] data_len
1160 ;[esp + 20] residual[]
1169 mov esi, [esp + 20] ; esi = residual[]
1170 mov edi, [esp + 40] ; edi = data[]
1171 mov eax, [esp + 32] ; eax = order
1172 mov ebx, [esp + 24] ; ebx = data_len
1175 jz near .end ; do nothing if data_len == 0
1199 cmp eax, byte 32 ; for order <= 32 there is a faster routine
1202 ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
1215 imul eax, [edi + 4 * ecx]
1218 jnz short .x87_32more_loop_j
1228 jnz .x87_32more_loop_i
; eax is scaled by 9 because each unrolled tap below (mov + imul + add with
; byte displacements) encodes to 3 + 4 + 2 = 9 bytes.
1235 lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
1240 inc edx ; compensate for the shorter opcode on the last iteration
1241 mov eax, [esp + 28] ; eax = qlp_coeff[]
1245 mov ecx, [eax + 124] ; ecx = qlp_coeff[31]
1246 imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32]
1247 add ebp, ecx ; sum += qlp_coeff[31] * data[i-32]
1248 mov ecx, [eax + 120] ; ecx = qlp_coeff[30]
1249 imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31]
1250 add ebp, ecx ; sum += qlp_coeff[30] * data[i-31]
1251 mov ecx, [eax + 116] ; ecx = qlp_coeff[29]
1252 imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30]
1253 add ebp, ecx ; sum += qlp_coeff[29] * data[i-30]
1254 mov ecx, [eax + 112] ; ecx = qlp_coeff[28]
1255 imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29]
1256 add ebp, ecx ; sum += qlp_coeff[28] * data[i-29]
1257 mov ecx, [eax + 108] ; ecx = qlp_coeff[27]
1258 imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28]
1259 add ebp, ecx ; sum += qlp_coeff[27] * data[i-28]
1260 mov ecx, [eax + 104] ; ecx = qlp_coeff[26]
1261 imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27]
1262 add ebp, ecx ; sum += qlp_coeff[26] * data[i-27]
1263 mov ecx, [eax + 100] ; ecx = qlp_coeff[25]
1264 imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26]
1265 add ebp, ecx ; sum += qlp_coeff[25] * data[i-26]
1266 mov ecx, [eax + 96] ; ecx = qlp_coeff[24]
1267 imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25]
1268 add ebp, ecx ; sum += qlp_coeff[24] * data[i-25]
1269 mov ecx, [eax + 92] ; ecx = qlp_coeff[23]
1270 imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24]
1271 add ebp, ecx ; sum += qlp_coeff[23] * data[i-24]
1272 mov ecx, [eax + 88] ; ecx = qlp_coeff[22]
1273 imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23]
1274 add ebp, ecx ; sum += qlp_coeff[22] * data[i-23]
1275 mov ecx, [eax + 84] ; ecx = qlp_coeff[21]
1276 imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22]
1277 add ebp, ecx ; sum += qlp_coeff[21] * data[i-22]
1278 mov ecx, [eax + 80] ; ecx = qlp_coeff[20]
1279 imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21]
1280 add ebp, ecx ; sum += qlp_coeff[20] * data[i-21]
1281 mov ecx, [eax + 76] ; ecx = qlp_coeff[19]
1282 imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20]
1283 add ebp, ecx ; sum += qlp_coeff[19] * data[i-20]
1284 mov ecx, [eax + 72] ; ecx = qlp_coeff[18]
1285 imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19]
1286 add ebp, ecx ; sum += qlp_coeff[18] * data[i-19]
1287 mov ecx, [eax + 68] ; ecx = qlp_coeff[17]
1288 imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18]
1289 add ebp, ecx ; sum += qlp_coeff[17] * data[i-18]
1290 mov ecx, [eax + 64] ; ecx = qlp_coeff[16]
1291 imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17]
1292 add ebp, ecx ; sum += qlp_coeff[16] * data[i-17]
1293 mov ecx, [eax + 60] ; ecx = qlp_coeff[15]
1294 imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16]
1295 add ebp, ecx ; sum += qlp_coeff[15] * data[i-16]
1296 mov ecx, [eax + 56] ; ecx = qlp_coeff[14]
1297 imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15]
1298 add ebp, ecx ; sum += qlp_coeff[14] * data[i-15]
1299 mov ecx, [eax + 52] ; ecx = qlp_coeff[13]
1300 imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14]
1301 add ebp, ecx ; sum += qlp_coeff[13] * data[i-14]
1302 mov ecx, [eax + 48] ; ecx = qlp_coeff[12]
1303 imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13]
1304 add ebp, ecx ; sum += qlp_coeff[12] * data[i-13]
1305 mov ecx, [eax + 44] ; ecx = qlp_coeff[11]
1306 imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12]
1307 add ebp, ecx ; sum += qlp_coeff[11] * data[i-12]
1308 mov ecx, [eax + 40] ; ecx = qlp_coeff[10]
1309 imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11]
1310 add ebp, ecx ; sum += qlp_coeff[10] * data[i-11]
1311 mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9]
1312 imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10]
1313 add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10]
1314 mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8]
1315 imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9]
1316 add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9]
1317 mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7]
1318 imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8]
1319 add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8]
1320 mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6]
1321 imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7]
1322 add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7]
1323 mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5]
1324 imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6]
1325 add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6]
1326 mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4]
1327 imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5]
1328 add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5]
1329 mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3]
1330 imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4]
1331 add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4]
1332 mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2]
1333 imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3]
1334 add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3]
1335 mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1]
1336 imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2]
1337 add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2]
1338 mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction)
1339 imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1]
1340 add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1]
1344 sar ebp, cl ; ebp = (sum >> lp_quantization)
1345 add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization)
1346 mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization)
1361 ; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
1362 ; the channel must be <= 16. Especially note that this routine cannot be used
1363 ; for side-channel coded 16bps channels since the effective bps is 17.
1364 ; WATCHOUT: this routine requires that each data array have a buffer of up to
1365 ; 3 zeroes in front (at negative indices) for alignment purposes, i.e. for each
1366 ; channel n, data[n][-1] through data[n][-3] should be accessible and zero.
1368 cident FLAC__lpc_restore_signal_asm_ia32_mmx
1370 ;[esp + 36] lp_quantization
1372 ;[esp + 28] qlp_coeff[]
1373 ;[esp + 24] data_len
1374 ;[esp + 20] residual[]
1389 jz near .end ; do nothing if data_len == 0
1391 jb near FLAC__lpc_restore_signal_asm_ia32.begin
1394 movd mm6, [esp + 36]
1401 push word [edx + 4 * ecx]
1404 jnz short .copy_qlp_loop
1417 movq mm5, [esp + 2 * eax - 8]
1418 movd mm4, [edi - 16]
1419 punpckldq mm4, [edi - 12]
1421 punpckldq mm0, [edi - 4]
1425 jnbe short .mmx_4more
1463 movd mm0, [ecx - 16]
1464 punpckldq mm0, [ecx - 12]
1466 punpckldq mm1, [ecx - 4]
1474 jnz .mmx_4more_loop_j
1491 jnz short .mmx_4more_loop_i