1 ; vim:filetype=nasm ts=8
3 ; libFLAC - Free Lossless Audio Codec library
4 ; Copyright (C) 2001,2002,2003,2004,2005,2006,2007,2008,2009 Josh Coalson
6 ; Redistribution and use in source and binary forms, with or without
7 ; modification, are permitted provided that the following conditions
10 ; - Redistributions of source code must retain the above copyright
11 ; notice, this list of conditions and the following disclaimer.
13 ; - Redistributions in binary form must reproduce the above copyright
14 ; notice, this list of conditions and the following disclaimer in the
15 ; documentation and/or other materials provided with the distribution.
17 ; - Neither the name of the Xiph.org Foundation nor the names of its
18 ; contributors may be used to endorse or promote products derived from
19 ; this software without specific prior written permission.
21 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
25 ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26 ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27 ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 ; [CR] is a note to flag that the instruction can be easily reordered
39 cglobal FLAC__lpc_compute_autocorrelation_asm
43 ; **********************************************************************
45 ; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
48 ; unsigned sample, coeff;
49 ; const unsigned limit = data_len - lag;
52 ; assert(lag <= data_len);
54 ; for(coeff = 0; coeff < lag; coeff++)
55 ;     autoc[coeff] = 0.0;
56 ; for(sample = 0; sample <= limit; sample++){
57 ;     d = data[sample];
58 ;     for(coeff = 0; coeff < lag; coeff++)
59 ;         autoc[coeff] += d * data[sample+coeff];
60 ; }
61 ; for(; sample < data_len; sample++){
62 ;     d = data[sample];
63 ;     for(coeff = 0; coeff < data_len - sample; coeff++)
64 ;         autoc[coeff] += d * data[sample+coeff];
65 ; }
68 FLAC__lpc_compute_autocorrelation_asm:
76 mov edx, [ebp + 8] ; edx == lag
77 mov ecx, [ebp + 4] ; ecx == data_len
78 mov esi, [ebp] ; esi == data
79 mov edi, [ebp + 12] ; edi == autoc
84 fldz ; will accumulate autoc[0]
88 add esi, byte 4 ; sample++
100 fldz ; will accumulate autoc[1]
102 fldz ; will accumulate autoc[0]
106 add esi, byte 4 ; [CR] sample++
109 faddp st2, st0 ; add to autoc[0]
113 faddp st3, st0 ; add to autoc[1]
116 ; clean up the leftovers
118 faddp st1, st0 ; add to autoc[0]
125 ja short .lag_above_3
127 fldz ; will accumulate autoc[2]
129 fldz ; will accumulate autoc[1]
131 fldz ; will accumulate autoc[0]
137 faddp st2, st0 ; add to autoc[0]
140 faddp st3, st0 ; add to autoc[1]
143 add esi, byte 4 ; [CR] sample++
144 faddp st3, st0 ; add to autoc[2]
147 ; clean up the leftovers
151 faddp st2, st0 ; add to autoc[0]
155 faddp st3, st0 ; add to autoc[1]
157 faddp st1, st0 ; add to autoc[0]
167 fldz ; will accumulate autoc[3]
169 fldz ; will accumulate autoc[2]
171 fldz ; will accumulate autoc[1]
173 fldz ; will accumulate autoc[0]
179 faddp st2, st0 ; add to autoc[0]
182 faddp st3, st0 ; add to autoc[1]
185 faddp st4, st0 ; add to autoc[2]
188 add esi, byte 4 ; [CR] sample++
189 faddp st4, st0 ; add to autoc[3]
192 ; clean up the leftovers
196 faddp st2, st0 ; add to autoc[0]
199 faddp st3, st0 ; add to autoc[1]
202 add esi, byte 4 ; [CR] sample++
203 faddp st3, st0 ; add to autoc[2]
207 faddp st2, st0 ; add to autoc[0]
211 faddp st3, st0 ; add to autoc[1]
213 faddp st1, st0 ; add to autoc[0]
217 fstp dword [edi + 12]
224 fldz ; will accumulate autoc[4]
225 fldz ; will accumulate autoc[3]
226 fldz ; will accumulate autoc[2]
227 fldz ; will accumulate autoc[1]
228 fldz ; will accumulate autoc[0]
235 faddp st2, st0 ; add to autoc[0]
238 faddp st3, st0 ; add to autoc[1]
241 faddp st4, st0 ; add to autoc[2]
244 faddp st5, st0 ; add to autoc[3]
247 add esi, byte 4 ; [CR] sample++
248 faddp st5, st0 ; add to autoc[4]
251 ; clean up the leftovers
255 faddp st2, st0 ; add to autoc[0]
258 faddp st3, st0 ; add to autoc[1]
261 faddp st4, st0 ; add to autoc[2]
264 add esi, byte 4 ; [CR] sample++
265 faddp st4, st0 ; add to autoc[3]
269 faddp st2, st0 ; add to autoc[0]
272 faddp st3, st0 ; add to autoc[1]
275 add esi, byte 4 ; [CR] sample++
276 faddp st3, st0 ; add to autoc[2]
280 faddp st2, st0 ; add to autoc[0]
284 faddp st3, st0 ; add to autoc[1]
286 faddp st1, st0 ; add to autoc[0]
290 fstp dword [edi + 12]
291 fstp dword [edi + 16]
298 fldz ; will accumulate autoc[5]
299 fldz ; will accumulate autoc[4]
300 fldz ; will accumulate autoc[3]
301 fldz ; will accumulate autoc[2]
302 fldz ; will accumulate autoc[1]
303 fldz ; will accumulate autoc[0]
310 faddp st2, st0 ; add to autoc[0]
313 faddp st3, st0 ; add to autoc[1]
316 faddp st4, st0 ; add to autoc[2]
319 faddp st5, st0 ; add to autoc[3]
322 faddp st6, st0 ; add to autoc[4]
325 add esi, byte 4 ; [CR] sample++
326 faddp st6, st0 ; add to autoc[5]
329 ; clean up the leftovers
333 faddp st2, st0 ; add to autoc[0]
336 faddp st3, st0 ; add to autoc[1]
339 faddp st4, st0 ; add to autoc[2]
342 faddp st5, st0 ; add to autoc[3]
345 add esi, byte 4 ; [CR] sample++
346 faddp st5, st0 ; add to autoc[4]
350 faddp st2, st0 ; add to autoc[0]
353 faddp st3, st0 ; add to autoc[1]
356 faddp st4, st0 ; add to autoc[2]
359 add esi, byte 4 ; [CR] sample++
360 faddp st4, st0 ; add to autoc[3]
364 faddp st2, st0 ; add to autoc[0]
367 faddp st3, st0 ; add to autoc[1]
370 add esi, byte 4 ; [CR] sample++
371 faddp st3, st0 ; add to autoc[2]
375 faddp st2, st0 ; add to autoc[0]
379 faddp st3, st0 ; add to autoc[1]
381 faddp st1, st0 ; add to autoc[0]
385 fstp dword [edi + 12]
386 fstp dword [edi + 16]
387 fstp dword [edi + 20]
391 ; for(coeff = 0; coeff < lag; coeff++)
392 ; autoc[coeff] = 0.0;
393 lea ecx, [edx * 2] ; ecx = # of dwords of 0 to write
396 mov ecx, [ebp + 4] ; ecx == data_len
397 mov edi, [ebp + 12] ; edi == autoc
398 ; const unsigned limit = data_len - lag;
400 inc ecx ; we are looping <= limit so we add one to the counter
401 ; for(sample = 0; sample <= limit; sample++){
403 ; for(coeff = 0; coeff < lag; coeff++)
404 ; autoc[coeff] += d * data[sample+coeff];
406 xor eax, eax ; eax == sample <- 0
409 push eax ; save sample
410 fld dword [esi + eax * 4] ; ST = d <- data[sample]
411 mov ebx, eax ; ebx == sample+coeff <- sample
412 mov edx, [ebp + 8] ; edx <- lag
413 xor eax, eax ; eax == coeff <- 0
417 fmul dword [esi + ebx * 4] ; ST = d*data[sample+coeff] d
418 fadd dword [edi + eax * 4] ; ST = autoc[coeff]+d*data[sample+coeff] d
419 fstp dword [edi + eax * 4] ; autoc[coeff]+=d*data[sample+coeff] ST = d
420 inc ebx ; (sample+coeff)++
424 pop eax ; restore sample
425 fstp st0 ; pop d, ST = empty
428 ; for(; sample < data_len; sample++){
430 ; for(coeff = 0; coeff < data_len - sample; coeff++)
431 ; autoc[coeff] += d * data[sample+coeff];
433 mov ecx, [ebp + 8] ; ecx <- lag
434 dec ecx ; ecx <- lag - 1
435 jz .outer_end ; skip loop if 0
437 push eax ; save sample
438 fld dword [esi + eax * 4] ; ST = d <- data[sample]
439 mov ebx, eax ; ebx == sample+coeff <- sample
440 mov edx, [ebp + 4] ; edx <- data_len
441 sub edx, eax ; edx <- data_len-sample
442 xor eax, eax ; eax == coeff <- 0
445 fmul dword [esi + ebx * 4] ; ST = d*data[sample+coeff] d
446 fadd dword [edi + eax * 4] ; ST = autoc[coeff]+d*data[sample+coeff] d
447 fstp dword [edi + eax * 4] ; autoc[coeff]+=d*data[sample+coeff] ST = d
448 inc ebx ; (sample+coeff)++
452 pop eax ; restore sample
453 fstp st0 ; pop d, ST = empty
460 mov ecx, [ebp + 4] ; ecx == data_len
461 mov esi, [ebp] ; esi == data
462 mov edi, [ebp + 12] ; edi == autoc
463 fldz ; will accumulate autoc[6]
470 add esi, byte 4 ; [CR] sample++
471 faddp st1, st0 ; add to autoc[6]
474 fstp dword [edi + 24]
478 mov ecx, [ebp + 4] ; ecx == data_len
479 mov esi, [ebp] ; esi == data
480 mov edi, [ebp + 12] ; edi == autoc
481 fldz ; will accumulate autoc[7]
482 fldz ; will accumulate autoc[6]
489 faddp st2, st0 ; add to autoc[6]
492 add esi, byte 4 ; [CR] sample++
493 faddp st2, st0 ; add to autoc[7]
496 ; clean up the leftovers
500 faddp st1, st0 ; add to autoc[6]
501 fstp dword [edi + 24]
502 fstp dword [edi + 28]
506 mov ecx, [ebp + 4] ; ecx == data_len
507 mov esi, [ebp] ; esi == data
508 mov edi, [ebp + 12] ; edi == autoc
509 fldz ; will accumulate autoc[8]
510 fldz ; will accumulate autoc[7]
511 fldz ; will accumulate autoc[6]
518 faddp st2, st0 ; add to autoc[6]
521 faddp st3, st0 ; add to autoc[7]
524 add esi, byte 4 ; [CR] sample++
525 faddp st3, st0 ; add to autoc[8]
528 ; clean up the leftovers
532 faddp st2, st0 ; add to autoc[6]
535 add esi, byte 4 ; [CR] sample++
536 faddp st2, st0 ; add to autoc[7]
540 faddp st1, st0 ; add to autoc[6]
541 fstp dword [edi + 24]
542 fstp dword [edi + 28]
543 fstp dword [edi + 32]
547 mov ecx, [ebp + 4] ; ecx == data_len
548 mov esi, [ebp] ; esi == data
549 mov edi, [ebp + 12] ; edi == autoc
550 fldz ; will accumulate autoc[9]
551 fldz ; will accumulate autoc[8]
552 fldz ; will accumulate autoc[7]
553 fldz ; will accumulate autoc[6]
560 faddp st2, st0 ; add to autoc[6]
563 faddp st3, st0 ; add to autoc[7]
566 faddp st4, st0 ; add to autoc[8]
569 add esi, byte 4 ; [CR] sample++
570 faddp st4, st0 ; add to autoc[9]
573 ; clean up the leftovers
577 faddp st2, st0 ; add to autoc[6]
580 faddp st3, st0 ; add to autoc[7]
583 add esi, byte 4 ; [CR] sample++
584 faddp st3, st0 ; add to autoc[8]
588 faddp st2, st0 ; add to autoc[6]
591 add esi, byte 4 ; [CR] sample++
592 faddp st2, st0 ; add to autoc[7]
596 faddp st1, st0 ; add to autoc[6]
597 fstp dword [edi + 24]
598 fstp dword [edi + 28]
599 fstp dword [edi + 32]
600 fstp dword [edi + 36]
604 mov ecx, [ebp + 4] ; ecx == data_len
605 mov esi, [ebp] ; esi == data
606 mov edi, [ebp + 12] ; edi == autoc
607 fldz ; will accumulate autoc[10]
608 fldz ; will accumulate autoc[9]
609 fldz ; will accumulate autoc[8]
610 fldz ; will accumulate autoc[7]
611 fldz ; will accumulate autoc[6]
618 faddp st2, st0 ; add to autoc[6]
621 faddp st3, st0 ; add to autoc[7]
624 faddp st4, st0 ; add to autoc[8]
627 faddp st5, st0 ; add to autoc[9]
630 add esi, byte 4 ; [CR] sample++
631 faddp st5, st0 ; add to autoc[10]
634 ; clean up the leftovers
638 faddp st2, st0 ; add to autoc[6]
641 faddp st3, st0 ; add to autoc[7]
644 faddp st4, st0 ; add to autoc[8]
647 add esi, byte 4 ; [CR] sample++
648 faddp st4, st0 ; add to autoc[9]
652 faddp st2, st0 ; add to autoc[6]
655 faddp st3, st0 ; add to autoc[7]
658 add esi, byte 4 ; [CR] sample++
659 faddp st3, st0 ; add to autoc[8]
663 faddp st2, st0 ; add to autoc[6]
666 add esi, byte 4 ; [CR] sample++
667 faddp st2, st0 ; add to autoc[7]
671 faddp st1, st0 ; add to autoc[6]
672 fstp dword [edi + 24]
673 fstp dword [edi + 28]
674 fstp dword [edi + 32]
675 fstp dword [edi + 36]
676 fstp dword [edi + 40]
680 mov ecx, [ebp + 4] ; ecx == data_len
681 mov esi, [ebp] ; esi == data
682 mov edi, [ebp + 12] ; edi == autoc
683 fldz ; will accumulate autoc[11]
684 fldz ; will accumulate autoc[10]
685 fldz ; will accumulate autoc[9]
686 fldz ; will accumulate autoc[8]
687 fldz ; will accumulate autoc[7]
688 fldz ; will accumulate autoc[6]
695 faddp st2, st0 ; add to autoc[6]
698 faddp st3, st0 ; add to autoc[7]
701 faddp st4, st0 ; add to autoc[8]
704 faddp st5, st0 ; add to autoc[9]
707 faddp st6, st0 ; add to autoc[10]
710 add esi, byte 4 ; [CR] sample++
711 faddp st6, st0 ; add to autoc[11]
714 ; clean up the leftovers
718 faddp st2, st0 ; add to autoc[6]
721 faddp st3, st0 ; add to autoc[7]
724 faddp st4, st0 ; add to autoc[8]
727 faddp st5, st0 ; add to autoc[9]
730 add esi, byte 4 ; [CR] sample++
731 faddp st5, st0 ; add to autoc[10]
735 faddp st2, st0 ; add to autoc[6]
738 faddp st3, st0 ; add to autoc[7]
741 faddp st4, st0 ; add to autoc[8]
744 add esi, byte 4 ; [CR] sample++
745 faddp st4, st0 ; add to autoc[9]
749 faddp st2, st0 ; add to autoc[6]
752 faddp st3, st0 ; add to autoc[7]
755 add esi, byte 4 ; [CR] sample++
756 faddp st3, st0 ; add to autoc[8]
760 faddp st2, st0 ; add to autoc[6]
763 add esi, byte 4 ; [CR] sample++
764 faddp st2, st0 ; add to autoc[7]
768 faddp st1, st0 ; add to autoc[6]
769 fstp dword [edi + 24]
770 fstp dword [edi + 28]
771 fstp dword [edi + 32]
772 fstp dword [edi + 36]
773 fstp dword [edi + 40]
774 fstp dword [edi + 44]