; vim:filetype=nasm ts=8 ; libFLAC - Free Lossless Audio Codec library ; Copyright (C) 2001-2009 Josh Coalson ; Copyright (C) 2011-2013 Xiph.Org Foundation ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; ; - Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; ; - Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in the ; documentation and/or other materials provided with the distribution. ; ; - Neither the name of the Xiph.org Foundation nor the names of its ; contributors may be used to endorse or promote products derived from ; this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
; [CR] is a note to flag that the instruction can be easily reordered

%include "nasm.h"

        data_section

cglobal FLAC__lpc_compute_autocorrelation_asm

        code_section

; **********************************************************************
;
; void FLAC__lpc_compute_autocorrelation_asm(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[])
; {
;       FLAC__real d;
;       unsigned sample, coeff;
;       const unsigned limit = data_len - lag;
;
;       assert(lag > 0);
;       assert(lag <= data_len);
;
;       for(coeff = 0; coeff < lag; coeff++)
;               autoc[coeff] = 0.0;
;       for(sample = 0; sample <= limit; sample++){
;               d = data[sample];
;               for(coeff = 0; coeff < lag; coeff++)
;                       autoc[coeff] += d * data[sample+coeff];
;       }
;       for(; sample < data_len; sample++){
;               d = data[sample];
;               for(coeff = 0; coeff < data_len - sample; coeff++)
;                       autoc[coeff] += d * data[sample+coeff];
;       }
; }
;
; ----------------------------------------------------------------------
; Target:  32-bit x86, x87 FPU, NASM syntax.  All four arguments are read
;          from the stack; every element of data[] and autoc[] is accessed
;          as a 4-byte (dword) float.
;
; Frame:   ebp is set to point at the first argument, so
;            [ebp]      == data
;            [ebp + 4]  == data_len
;            [ebp + 8]  == lag
;            [ebp + 12] == autoc
;
; Preconditions (the asserts of the C reference above; NOT checked here):
;          lag > 0 and lag <= data_len.  Violating them makes the
;          dec/jnz counters wrap.
;
; Paths:   lag == 1..6  -> fully unrolled kernels that keep every autoc[]
;                          accumulator on the x87 stack and store once at
;                          the end.  The lag == 6 kernel uses all eight
;                          x87 registers, so the x87 stack must be empty
;                          on entry.
;          lag  > 6     -> generic loops that accumulate in memory.
;
; Saved/restored: ebp, ebx, esi, edi.
; Clobbered:      eax, ecx, edx, flags, x87 stack (left empty on exit).
; ----------------------------------------------------------------------
;
FLAC__lpc_compute_autocorrelation_asm:
        push    ebp
        lea     ebp, [esp + 8]          ; ebp -> first stack argument
        push    ebx
        push    esi
        push    edi

        mov     edx, [ebp + 8]          ; edx == lag
        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     esi, [ebp]              ; esi == data
        mov     edi, [ebp + 12]         ; edi == autoc

        cmp     edx, 1
        ja      short .lag_above_1

        ; ---- lag == 1: autoc[0] = sum(data[i]^2) ----
.lag_eq_1:
        fldz                            ; will accumulate autoc[0]
        ALIGN 16
.lag_1_loop:
        fld     dword [esi]
        add     esi, byte 4             ; sample++
        fmul    st0, st0
        faddp   st1, st0
        dec     ecx
        jnz     .lag_1_loop
        fstp    dword [edi]
        jmp     .end

.lag_above_1:
        cmp     edx, 2
        ja      short .lag_above_2

        ; ---- lag == 2 ----
        ; The current sample is carried in st0 across iterations, so each
        ; sample is loaded from memory only once.
.lag_eq_2:
        fldz                            ; will accumulate autoc[1]
        dec     ecx                     ; main loop runs data_len - 1 times
        fldz                            ; will accumulate autoc[0]
        fld     dword [esi]
        ALIGN 16
.lag_2_loop:
        add     esi, byte 4             ; [CR] sample++
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi]
        fmul    st1, st0
        fxch
        faddp   st3, st0                ; add to autoc[1]
        dec     ecx
        jnz     .lag_2_loop
        ; clean up the leftovers
        fmul    st0, st0
        faddp   st1, st0                ; add to autoc[0]
        fstp    dword [edi]
        fstp    dword [edi + 4]
        jmp     .end

.lag_above_2:
        cmp     edx, 3
        ja      short .lag_above_3

        ; ---- lag == 3 ----
.lag_eq_3:
        fldz                            ; will accumulate autoc[2]
        dec     ecx
        fldz                            ; will accumulate autoc[1]
        dec     ecx                     ; main loop runs data_len - 2 times
        fldz                            ; will accumulate autoc[0]
        ALIGN 16
.lag_3_loop:
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[2]
        dec     ecx
        jnz     .lag_3_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st1, st0
        fxch
        faddp   st3, st0                ; add to autoc[1]
        fmul    st0, st0
        faddp   st1, st0                ; add to autoc[0]
        fstp    dword [edi]
        fstp    dword [edi + 4]
        fstp    dword [edi + 8]
        jmp     .end

.lag_above_3:
        cmp     edx, 4
        ja      near .lag_above_4

        ; ---- lag == 4 ----
.lag_eq_4:
        fldz                            ; will accumulate autoc[3]
        dec     ecx
        fldz                            ; will accumulate autoc[2]
        dec     ecx
        fldz                            ; will accumulate autoc[1]
        dec     ecx                     ; main loop runs data_len - 3 times
        fldz                            ; will accumulate autoc[0]
        ALIGN 16
.lag_4_loop:
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[2]
        fld     dword [esi + 12]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st4, st0                ; add to autoc[3]
        dec     ecx
        jnz     .lag_4_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[2]

        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st1, st0
        fxch
        faddp   st3, st0                ; add to autoc[1]
        fmul    st0, st0
        faddp   st1, st0                ; add to autoc[0]
        fstp    dword [edi]
        fstp    dword [edi + 4]
        fstp    dword [edi + 8]
        fstp    dword [edi + 12]
        jmp     .end

.lag_above_4:
        cmp     edx, 5
        ja      near .lag_above_5

        ; ---- lag == 5 ----
.lag_eq_5:
        fldz                            ; will accumulate autoc[4]
        fldz                            ; will accumulate autoc[3]
        fldz                            ; will accumulate autoc[2]
        fldz                            ; will accumulate autoc[1]
        fldz                            ; will accumulate autoc[0]
        sub     ecx, byte 4             ; main loop runs data_len - 4 times
        ALIGN 16
.lag_5_loop:
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[2]
        fld     dword [esi + 12]
        fmul    st0, st1
        faddp   st5, st0                ; add to autoc[3]
        fld     dword [esi + 16]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st5, st0                ; add to autoc[4]
        dec     ecx
        jnz     .lag_5_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[2]
        fld     dword [esi + 12]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st4, st0                ; add to autoc[3]

        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[2]

        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st1, st0
        fxch
        faddp   st3, st0                ; add to autoc[1]
        fmul    st0, st0
        faddp   st1, st0                ; add to autoc[0]
        fstp    dword [edi]
        fstp    dword [edi + 4]
        fstp    dword [edi + 8]
        fstp    dword [edi + 12]
        fstp    dword [edi + 16]
        jmp     .end

.lag_above_5:
        cmp     edx, 6
        ja      near .lag_above_6       ; target is far beyond short-branch range

        ; ---- lag == 6 (six accumulators + d + one temporary = 8 x87 regs) ----
.lag_eq_6:
        fldz                            ; will accumulate autoc[5]
        fldz                            ; will accumulate autoc[4]
        fldz                            ; will accumulate autoc[3]
        fldz                            ; will accumulate autoc[2]
        fldz                            ; will accumulate autoc[1]
        fldz                            ; will accumulate autoc[0]
        sub     ecx, byte 5             ; main loop runs data_len - 5 times
        ALIGN 16
.lag_6_loop:
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[2]
        fld     dword [esi + 12]
        fmul    st0, st1
        faddp   st5, st0                ; add to autoc[3]
        fld     dword [esi + 16]
        fmul    st0, st1
        faddp   st6, st0                ; add to autoc[4]
        fld     dword [esi + 20]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st6, st0                ; add to autoc[5]
        dec     ecx
        jnz     .lag_6_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[2]
        fld     dword [esi + 12]
        fmul    st0, st1
        faddp   st5, st0                ; add to autoc[3]
        fld     dword [esi + 16]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st5, st0                ; add to autoc[4]

        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[2]
        fld     dword [esi + 12]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st4, st0                ; add to autoc[3]

        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[1]
        fld     dword [esi + 8]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[2]

        fld     dword [esi]
        fld     st0
        fmul    st0, st0
        faddp   st2, st0                ; add to autoc[0]
        fld     dword [esi + 4]
        fmul    st1, st0
        fxch
        faddp   st3, st0                ; add to autoc[1]
        fmul    st0, st0
        faddp   st1, st0                ; add to autoc[0]
        fstp    dword [edi]
        fstp    dword [edi + 4]
        fstp    dword [edi + 8]
        fstp    dword [edi + 12]
        fstp    dword [edi + 16]
        fstp    dword [edi + 20]
        jmp     .end

        ; ---- lag > 6: generic path, accumulating directly in autoc[] ----
.lag_above_6:
        ;       for(coeff = 0; coeff < lag; coeff++)
        ;               autoc[coeff] = 0.0;
        ; autoc[] elements are 4 bytes each (every access in this file is a
        ; dword), so exactly `lag` dwords must be cleared.  Clearing 2*lag
        ; dwords would write lag*4 bytes past the end of the caller's array.
        mov     ecx, edx                ; ecx = lag = # of dwords of 0 to write
        xor     eax, eax                ; all-zero bits == +0.0f
        rep stosd                       ; advances edi; reloaded just below

        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     edi, [ebp + 12]         ; edi == autoc

        ;       const unsigned limit = data_len - lag;
        sub     ecx, edx
        inc     ecx                     ; we are looping <= limit so we add one to the counter

        ;       for(sample = 0; sample <= limit; sample++){
        ;               d = data[sample];
        ;               for(coeff = 0; coeff < lag; coeff++)
        ;                       autoc[coeff] += d * data[sample+coeff];
        ;       }
        xor     eax, eax                ; eax == sample <- 0
        ALIGN 16
.outer_loop:
        push    eax                     ; save sample (eax is reused as coeff)
        fld     dword [esi + eax * 4]   ; ST = d <- data[sample]
        mov     ebx, eax                ; ebx == sample+coeff <- sample
        mov     edx, [ebp + 8]          ; edx <- lag
        xor     eax, eax                ; eax == coeff <- 0
        ALIGN 16
.inner_loop:
        fld     st0                     ; ST = d d
        fmul    dword [esi + ebx * 4]   ; ST = d*data[sample+coeff] d
        fadd    dword [edi + eax * 4]   ; ST = autoc[coeff]+d*data[sample+coeff] d
        fstp    dword [edi + eax * 4]   ; autoc[coeff]+=d*data[sample+coeff]  ST = d
        inc     ebx                     ; (sample+coeff)++
        inc     eax                     ; coeff++
        dec     edx
        jnz     .inner_loop

        pop     eax                     ; restore sample
        fstp    st0                     ; pop d, ST = empty
        inc     eax                     ; sample++
        dec     ecx                     ; dec/jnz instead of the slow `loop`
        jnz     .outer_loop

        ;       for(; sample < data_len; sample++){
        ;               d = data[sample];
        ;               for(coeff = 0; coeff < data_len - sample; coeff++)
        ;                       autoc[coeff] += d * data[sample+coeff];
        ;       }
        mov     ecx, [ebp + 8]          ; ecx <- lag
        dec     ecx                     ; ecx <- lag - 1
        jz      .outer_end              ; skip loop if 0
.outer_loop2:
        push    eax                     ; save sample
        fld     dword [esi + eax * 4]   ; ST = d <- data[sample]
        mov     ebx, eax                ; ebx == sample+coeff <- sample
        mov     edx, [ebp + 4]          ; edx <- data_len
        sub     edx, eax                ; edx <- data_len-sample
        xor     eax, eax                ; eax == coeff <- 0
.inner_loop2:
        fld     st0                     ; ST = d d
        fmul    dword [esi + ebx * 4]   ; ST = d*data[sample+coeff] d
        fadd    dword [edi + eax * 4]   ; ST = autoc[coeff]+d*data[sample+coeff] d
        fstp    dword [edi + eax * 4]   ; autoc[coeff]+=d*data[sample+coeff]  ST = d
        inc     ebx                     ; (sample+coeff)++
        inc     eax                     ; coeff++
        dec     edx
        jnz     .inner_loop2

        pop     eax                     ; restore sample
        fstp    st0                     ; pop d, ST = empty
        inc     eax                     ; sample++
        dec     ecx                     ; dec/jnz instead of the slow `loop`
        jnz     .outer_loop2
.outer_end:
        jmp     .end

        ; ------------------------------------------------------------------
        ; NOTE(review): nothing in this routine branches to any of the
        ; .lag_eq_6_plus_N labels below, so as written they are unreachable.
        ; Each one reloads data_len/data/autoc from the ebp frame and writes
        ; only autoc[6 .. 6+N-1] (it never touches autoc[0..5]).  Presumably
        ; they were meant to complement the lag == 6 kernel for lags 7..12 --
        ; confirm the intent before wiring them in or removing them.
        ; ------------------------------------------------------------------
.lag_eq_6_plus_1:
        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     esi, [ebp]              ; esi == data
        mov     edi, [ebp + 12]         ; edi == autoc
        fldz                            ; will accumulate autoc[6]
        sub     ecx, byte 6
        ALIGN 16
.lag_6_1_loop:
        fld     dword [esi]
        fld     dword [esi + 24]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st1, st0                ; add to autoc[6]
        dec     ecx
        jnz     .lag_6_1_loop
        fstp    dword [edi + 24]
        jmp     .end

.lag_eq_6_plus_2:
        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     esi, [ebp]              ; esi == data
        mov     edi, [ebp + 12]         ; edi == autoc
        fldz                            ; will accumulate autoc[7]
        fldz                            ; will accumulate autoc[6]
        sub     ecx, byte 7
        ALIGN 16
.lag_6_2_loop:
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st2, st0                ; add to autoc[7]
        dec     ecx
        jnz     .lag_6_2_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     dword [esi + 24]
        fmulp   st1, st0
        faddp   st1, st0                ; add to autoc[6]
        fstp    dword [edi + 24]
        fstp    dword [edi + 28]
        jmp     .end

.lag_eq_6_plus_3:
        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     esi, [ebp]              ; esi == data
        mov     edi, [ebp + 12]         ; edi == autoc
        fldz                            ; will accumulate autoc[8]
        fldz                            ; will accumulate autoc[7]
        fldz                            ; will accumulate autoc[6]
        sub     ecx, byte 8
        ALIGN 16
.lag_6_3_loop:
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[8]
        dec     ecx
        jnz     .lag_6_3_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st2, st0                ; add to autoc[7]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmulp   st1, st0
        faddp   st1, st0                ; add to autoc[6]
        fstp    dword [edi + 24]
        fstp    dword [edi + 28]
        fstp    dword [edi + 32]
        jmp     .end

.lag_eq_6_plus_4:
        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     esi, [ebp]              ; esi == data
        mov     edi, [ebp + 12]         ; edi == autoc
        fldz                            ; will accumulate autoc[9]
        fldz                            ; will accumulate autoc[8]
        fldz                            ; will accumulate autoc[7]
        fldz                            ; will accumulate autoc[6]
        sub     ecx, byte 9
        ALIGN 16
.lag_6_4_loop:
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[8]
        fld     dword [esi + 36]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st4, st0                ; add to autoc[9]
        dec     ecx
        jnz     .lag_6_4_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[8]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st2, st0                ; add to autoc[7]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmulp   st1, st0
        faddp   st1, st0                ; add to autoc[6]
        fstp    dword [edi + 24]
        fstp    dword [edi + 28]
        fstp    dword [edi + 32]
        fstp    dword [edi + 36]
        jmp     .end

.lag_eq_6_plus_5:
        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     esi, [ebp]              ; esi == data
        mov     edi, [ebp + 12]         ; edi == autoc
        fldz                            ; will accumulate autoc[10]
        fldz                            ; will accumulate autoc[9]
        fldz                            ; will accumulate autoc[8]
        fldz                            ; will accumulate autoc[7]
        fldz                            ; will accumulate autoc[6]
        sub     ecx, byte 10
        ALIGN 16
.lag_6_5_loop:
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[8]
        fld     dword [esi + 36]
        fmul    st0, st1
        faddp   st5, st0                ; add to autoc[9]
        fld     dword [esi + 40]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st5, st0                ; add to autoc[10]
        dec     ecx
        jnz     .lag_6_5_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[8]
        fld     dword [esi + 36]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st4, st0                ; add to autoc[9]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[8]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st2, st0                ; add to autoc[7]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmulp   st1, st0
        faddp   st1, st0                ; add to autoc[6]
        fstp    dword [edi + 24]
        fstp    dword [edi + 28]
        fstp    dword [edi + 32]
        fstp    dword [edi + 36]
        fstp    dword [edi + 40]
        jmp     .end

.lag_eq_6_plus_6:
        mov     ecx, [ebp + 4]          ; ecx == data_len
        mov     esi, [ebp]              ; esi == data
        mov     edi, [ebp + 12]         ; edi == autoc
        fldz                            ; will accumulate autoc[11]
        fldz                            ; will accumulate autoc[10]
        fldz                            ; will accumulate autoc[9]
        fldz                            ; will accumulate autoc[8]
        fldz                            ; will accumulate autoc[7]
        fldz                            ; will accumulate autoc[6]
        sub     ecx, byte 11
        ALIGN 16
.lag_6_6_loop:
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[8]
        fld     dword [esi + 36]
        fmul    st0, st1
        faddp   st5, st0                ; add to autoc[9]
        fld     dword [esi + 40]
        fmul    st0, st1
        faddp   st6, st0                ; add to autoc[10]
        fld     dword [esi + 44]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st6, st0                ; add to autoc[11]
        dec     ecx
        jnz     .lag_6_6_loop
        ; clean up the leftovers
        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[8]
        fld     dword [esi + 36]
        fmul    st0, st1
        faddp   st5, st0                ; add to autoc[9]
        fld     dword [esi + 40]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st5, st0                ; add to autoc[10]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmul    st0, st1
        faddp   st4, st0                ; add to autoc[8]
        fld     dword [esi + 36]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st4, st0                ; add to autoc[9]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmul    st0, st1
        faddp   st3, st0                ; add to autoc[7]
        fld     dword [esi + 32]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st3, st0                ; add to autoc[8]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmul    st0, st1
        faddp   st2, st0                ; add to autoc[6]
        fld     dword [esi + 28]
        fmulp   st1, st0
        add     esi, byte 4             ; [CR] sample++
        faddp   st2, st0                ; add to autoc[7]

        fld     dword [esi]
        fld     dword [esi + 24]
        fmulp   st1, st0
        faddp   st1, st0                ; add to autoc[6]
        fstp    dword [edi + 24]
        fstp    dword [edi + 28]
        fstp    dword [edi + 32]
        fstp    dword [edi + 36]
        fstp    dword [edi + 40]
        fstp    dword [edi + 44]
        jmp     .end

        ; common epilogue: restore the registers pushed in the prologue
.end:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret

end