From 9a7b5e2d80fd6df2f9a49c0c012e34333dfbd8ba Mon Sep 17 00:00:00 2001 From: Josh Coalson Date: Wed, 13 Jun 2001 18:03:09 +0000 Subject: [PATCH] move from ../i386 --- src/libFLAC/ia32/Makefile.am | 28 + src/libFLAC/ia32/cpu_asm.nasm | 54 ++ src/libFLAC/ia32/fixed_asm.nasm | 337 ++++++++ src/libFLAC/ia32/lpc_asm-unrolled.nasm | 770 ++++++++++++++++++ src/libFLAC/ia32/lpc_asm.nasm | 1348 ++++++++++++++++++++++++++++++++ src/libFLAC/ia32/nasm.h | 58 ++ 6 files changed, 2595 insertions(+) create mode 100644 src/libFLAC/ia32/Makefile.am create mode 100644 src/libFLAC/ia32/cpu_asm.nasm create mode 100644 src/libFLAC/ia32/fixed_asm.nasm create mode 100644 src/libFLAC/ia32/lpc_asm-unrolled.nasm create mode 100644 src/libFLAC/ia32/lpc_asm.nasm create mode 100644 src/libFLAC/ia32/nasm.h diff --git a/src/libFLAC/ia32/Makefile.am b/src/libFLAC/ia32/Makefile.am new file mode 100644 index 0000000..bdbb919 --- /dev/null +++ b/src/libFLAC/ia32/Makefile.am @@ -0,0 +1,28 @@ +# libFLAC - Free Lossless Audio Codec library +# Copyright (C) 2001 Josh Coalson +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Library General Public +# License as published by the Free Software Foundation; either +# version 2 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Library General Public License for more details. +# +# You should have received a copy of the GNU Library General Public +# License along with this library; if not, write to the +# Free Software Foundation, Inc., 59 Temple Place - Suite 330, +# Boston, MA 02111-1307, USA. + +# nasm build rule: +SUFFIXES = .s .lo +.s.lo: + $(NASM) -f elf -d ELF $< -o $@ + +noinst_LTLIBRARIES = libFLAC-asm.la +libFLAC_asm_la_SOURCES = \ + cpu_asm.s \ + fixed_asm.s \ + lpc_asm.s diff --git a/src/libFLAC/ia32/cpu_asm.nasm b/src/libFLAC/ia32/cpu_asm.nasm new file mode 100644 index 0000000..2869fda --- /dev/null +++ b/src/libFLAC/ia32/cpu_asm.nasm @@ -0,0 +1,54 @@ +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001 Josh Coalson +; +; This library is free software; you can redistribute it and/or +; modify it under the terms of the GNU Library General Public +; License as published by the Free Software Foundation; either +; version 2 of the License, or (at your option) any later version. +; +; This library is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Library General Public License for more details. +; +; You should have received a copy of the GNU Library General Public +; License along with this library; if not, write to the +; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +; Boston, MA 02111-1307, USA. 
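+
+; The routine below implements the classic CPUID-presence test: bit 21 of
+; EFLAGS (the ID flag) can be toggled only on processors that support the
+; CPUID instruction. In rough C pseudocode (a sketch of the control flow
+; only; read_eflags(), write_eflags() and cpuid_edx() are illustrative
+; helpers, not real functions):
+;
+; unsigned FLAC__cpu_info_asm_i386()
+; {
+; 	unsigned old_flags = read_eflags();
+; 	write_eflags(old_flags ^ 0x00200000); /* try to flip the ID bit */
+; 	if(read_eflags() == old_flags)
+; 		return 0; /* ID bit stuck => no CPUID */
+; 	return cpuid_edx(1); /* EDX feature flags from CPUID function 1 */
+; }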
+ +%include "nasm.h" + + data_section + +cglobal FLAC__cpu_info_asm_i386 + + code_section + +; ********************************************************************** +; + ALIGN 16 +cident FLAC__cpu_info_asm_i386 + + push ebx + + pushfd + pop eax + mov edx, eax + xor eax, 00200000h + push eax + popfd + pushfd + pop eax + cmp eax, edx + jz .no_cpuid + mov eax, 1 + cpuid + mov eax, edx + jmp short .end +.no_cpuid: + xor eax, eax ; return 0 +.end: + pop ebx + ret + +end diff --git a/src/libFLAC/ia32/fixed_asm.nasm b/src/libFLAC/ia32/fixed_asm.nasm new file mode 100644 index 0000000..b0aba2e --- /dev/null +++ b/src/libFLAC/ia32/fixed_asm.nasm @@ -0,0 +1,337 @@ +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001 Josh Coalson +; +; This library is free software; you can redistribute it and/or +; modify it under the terms of the GNU Library General Public +; License as published by the Free Software Foundation; either +; version 2 of the License, or (at your option) any later version. +; +; This library is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Library General Public License for more details. +; +; You should have received a copy of the GNU Library General Public +; License along with this library; if not, write to the +; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +; Boston, MA 02111-1307, USA. + +%include "nasm.h" + + data_section + +cglobal FLAC__fixed_compute_best_predictor_asm_i386_mmx_cmov + + code_section + +; ********************************************************************** +; +; unsigned FLAC__fixed_compute_best_predictor(const int32 data[], unsigned data_len, real residual_bits_per_sample[FLAC__MAX_FIXED_ORDER+1]) +; { +; int32 last_error_0 = data[-1]; +; int32 last_error_1 = data[-1] - data[-2]; +; int32 last_error_2 = last_error_1 - (data[-2] - data[-3]); +; int32 last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); +; int32 error, save; +; uint32 total_error_0 = 0, total_error_1 = 0, total_error_2 = 0, total_error_3 = 0, total_error_4 = 0; +; unsigned i, order; +; +; for(i = 0; i < data_len; i++) { +; error = data[i] ; total_error_0 += local_abs(error); save = error; +; error -= last_error_0; total_error_1 += local_abs(error); last_error_0 = save; save = error; +; error -= last_error_1; total_error_2 += local_abs(error); last_error_1 = save; save = error; +; error -= last_error_2; total_error_3 += local_abs(error); last_error_2 = save; save = error; +; error -= last_error_3; total_error_4 += local_abs(error); last_error_3 = save; +; } +; +; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) +; order = 0; +; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) +; order = 1; +; else if(total_error_2 < min(total_error_3, total_error_4)) +; order = 2; +; else if(total_error_3 < total_error_4) +; order = 3; +; else +; order = 4; +; +; residual_bits_per_sample[0] = (real)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (real)total_error_0 / (real) data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[1] = (real)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (real)total_error_1 / (real) data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[2] = (real)((data_len > 0 && total_error_2 > 0) ? 
log(M_LN2 * (real)total_error_2 / (real) data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[3] = (real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (real)total_error_3 / (real) data_len) / M_LN2 : 0.0); +; residual_bits_per_sample[4] = (real)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (real)total_error_4 / (real) data_len) / M_LN2 : 0.0); +; +; return order; +; } + ALIGN 16 +cident FLAC__fixed_compute_best_predictor_asm_i386_mmx_cmov + + ; esp + 36 == data[] + ; esp + 40 == data_len + ; esp + 44 == residual_bits_per_sample[] + + push ebp + push ebx + push esi + push edi + sub esp, byte 16 + ; qword [esp] == temp space for loading uint64s to FPU regs + ; dword [esp] == last_error_0 + ; dword [esp + 4] == last_error_1 + ; dword [esp + 8] == last_error_2 + ; dword [esp + 12] == last_error_3 + + ; eax == error + ; ebx == &data[i] + ; ecx == loop counter (i) + ; edx == temp + ; edi == save + ; ebp == order + ; mm0 == total_error_1:total_error_0 + ; mm1 == total_error_3:total_error_2 + ; mm2 == 0:total_error_4 + ; mm3/4 == 0:unpackarea + ; mm5 == abs(error_1):abs(error_0) + ; mm5 == abs(error_3):abs(error_2) + + pxor mm0, mm0 ; total_error_1 = total_error_0 = 0 + pxor mm1, mm1 ; total_error_3 = total_error_2 = 0 + pxor mm2, mm2 ; total_error_4 = 0 + mov ebx, [esp + 36] ; ebx = data[] + mov ecx, [ebx - 4] ; ecx == data[-1] last_error_0 = data[-1] + mov eax, [ebx - 8] ; eax == data[-2] + mov ebp, [ebx - 16] ; ebp == data[-4] + mov ebx, [ebx - 12] ; ebx == data[-3] + mov edx, ecx + sub edx, eax ; last_error_1 = data[-1] - data[-2] + mov esi, edx + sub esi, eax + add esi, ebx ; last_error_2 = last_error_1 - (data[-2] - data[-3]) + shl ebx, 1 + mov edi, esi + sub edi, eax + add edi, ebx + sub edi, ebp ; last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); + mov ebx, [esp + 36] ; ebx = data[] + mov [esp], ecx ; [esp] = last_error_0 + mov [esp + 4], edx ; [esp + 4] = last_error_1 + mov [esp + 8], esi ; [esp + 8] = last_error_2 + mov [esp + 12], edi ; [esp + 12] = last_error_3 + mov ecx, [esp + 40] ; ecx = data_len + + ; for(i = 0; i < data_len; i++) { + ; error_0 = data[i] ; save = error_0; total_error_0 += local_abs(error_0); + ; error_1 -= last_error_0; last_error_0 = save; save = error_1; total_error_1 += local_abs(error_1); + ; error_2 -= last_error_1; last_error_1 = save; save = error_2; total_error_2 += local_abs(error_2); + ; error_3 -= last_error_2; last_error_2 = save; save = error_3; total_error_3 += local_abs(error_3); + ; error_4 -= last_error_3; last_error_3 = save; total_error_4 += local_abs(error_4); + ; } + ALIGN 16 +.loop: + mov eax, [ebx] ; eax = error_0 = data[i] + add ebx, 4 + mov edi, eax ; edi == save = error_0 + mov edx, eax ; edx = error_0 + neg edx ; edx = -error_0 + cmovns eax, edx ; eax = abs(error_0) + movd mm5, eax ; mm5 = 0:abs(error_0) + mov edx, [esp] ; edx = last_error_0 + mov eax, edi ; eax = error(error_0) + mov [esp], edi ; [esp] == last_error_0 = save + sub eax, edx ; error -= last_error_0 + mov edi, eax ; edi == save = error_1 + mov edx, eax ; edx = error_1 + neg edx ; edx = -error_1 + cmovns eax, edx ; eax = abs(error_1) + movd mm4, eax ; mm4 = 0:abs(error_1) + punpckldq mm5, mm4 ; mm5 = abs(error_1):abs(error_0) + mov edx, [esp + 4] ; edx = last_error_1 + mov eax, edi ; eax = error(error_1) + mov [esp + 4], edi ; [esp + 4] == last_error_1 = save + sub eax, edx ; error -= last_error_1 + mov edi, eax ; edi == save = error_2 + mov edx, eax ; edx = error_2 + paddd mm0, mm5 ; [CR] total_error_1 += abs(error_1) ; total_error_0 += 
abs(error_0) + neg edx ; edx = -error_2 + cmovns eax, edx ; eax = abs(error_2) + movd mm5, eax ; mm5 = 0:abs(error_2) + mov edx, [esp + 8] ; edx = last_error_2 + mov eax, edi ; eax = error(error_2) + mov [esp + 8], edi ; [esp + 8] == last_error_2 = save + sub eax, edx ; error -= last_error_2 + mov edi, eax ; edi == save = error_3 + mov edx, eax ; edx = error_3 + neg edx ; edx = -error_3 + cmovns eax, edx ; eax = abs(error_3) + movd mm4, eax ; mm4 = 0:abs(error_3) + punpckldq mm5, mm4 ; mm5 = abs(error_3):abs(error_2) + mov edx, [esp + 12] ; edx = last_error_3 + mov eax, edi ; eax = error(error_3) + mov [esp + 12], edi ; [esp + 12] == last_error_3 = save + sub eax, edx ; error -= last_error_3 + mov edx, eax ; edx = error_4 + paddd mm1, mm5 ; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2) + neg edx ; edx = -error_4 + cmovns eax, edx ; eax = abs(error_4) + movd mm5, eax ; mm5 = 0:abs(error_4) + paddd mm2, mm5 ; total_error_4 += abs(error_4) + dec ecx + jnz near .loop + +; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) +; order = 0; +; else if(total_error_1 < min(min(total_error_2, total_error_3), total_error_4)) +; order = 1; +; else if(total_error_2 < min(total_error_3, total_error_4)) +; order = 2; +; else if(total_error_3 < total_error_4) +; order = 3; +; else +; order = 4; + movd edi, mm2 ; edi = total_error_4 + movq mm4, mm1 ; mm4 = total_error_3:total_error_2 + psrlq mm4, 32 ; mm4 = 0:total_error_3 + movd edx, mm1 ; edx = total_error_2 + movd esi, mm4 ; esi = total_error_3 + movq mm3, mm0 ; mm3 = total_error_1:total_error_0 + psrlq mm3, 32 ; mm3 = 0:total_error_1 + movd ebx, mm0 ; ebx = total_error_0 + movd ecx, mm3 ; ecx = total_error_1 + emms + mov eax, ebx ; eax = total_error_0 + cmp ecx, ebx + cmovb eax, ecx ; eax = min(total_error_0, total_error_1) + cmp edx, eax + cmovb eax, edx ; eax = min(total_error_0, total_error_1, total_error_2) + cmp esi, eax + cmovb eax, esi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3) + cmp edi, eax + cmovb eax, edi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4) + + cmp eax, ebx + jne .not_order_0 + xor ebp, ebp + jmp short .got_order +.not_order_0: + cmp eax, ecx + jne .not_order_1 + mov ebp, 1 + jmp short .got_order +.not_order_1: + cmp eax, edx + jne .not_order_2 + mov ebp, 2 + jmp short .got_order +.not_order_2: + cmp eax, esi + jne .not_order_3 + mov ebp, 3 + jmp short .got_order +.not_order_3: + mov ebp, 4 +.got_order: + ; residual_bits_per_sample[0] = (real)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (real)total_error_0 / (real) data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[1] = (real)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (real)total_error_1 / (real) data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[2] = (real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (real)total_error_2 / (real) data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[3] = (real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (real)total_error_3 / (real) data_len) / M_LN2 : 0.0); + ; residual_bits_per_sample[4] = (real)((data_len > 0 && total_error_4 > 0) ? 
log(M_LN2 * (real)total_error_4 / (real) data_len) / M_LN2 : 0.0); + xor eax, eax + cmp eax, [esp + 40] + je near .data_len_is_0 + fild dword [esp + 40] ; ST = data_len (NOTE: assumes data_len is <2gigs) +.rbps_0: + test ebx, ebx + jz .total_error_0_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], ebx + mov [esp + 4], eax ; [esp] = (uint64)total_error_0 + mov ebx, [esp + 44] + fild qword [esp] ; ST = total_error_0 1.0 data_len + fdiv st2 ; ST = total_error_0/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_0/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_0/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_0/data_len) data_len + fstp dword [ebx] ; residual_bits_per_sample[0] = log2(ln2*total_error_0/data_len) ST = data_len + jmp short .rbps_1 +.total_error_0_is_0: + mov ebx, [esp + 44] + mov [ebx], eax ; residual_bits_per_sample[0] = 0.0 +.rbps_1: + test ecx, ecx + jz .total_error_1_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], ecx + mov [esp + 4], eax ; [esp] = (uint64)total_error_1 + fild qword [esp] ; ST = total_error_1 1.0 data_len + fdiv st2 ; ST = total_error_1/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_1/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_1/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_1/data_len) data_len + fstp dword [ebx + 4] ; residual_bits_per_sample[1] = log2(ln2*total_error_1/data_len) ST = data_len + jmp short .rbps_2 +.total_error_1_is_0: + mov [ebx + 4], eax ; residual_bits_per_sample[1] = 0.0 +.rbps_2: + test edx, edx + jz .total_error_2_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], edx + mov [esp + 4], eax ; [esp] = (uint64)total_error_2 + fild qword [esp] ; ST = total_error_2 1.0 data_len + fdiv st2 ; ST = total_error_2/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_2/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_2/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_2/data_len) data_len + fstp dword [ebx + 8] ; residual_bits_per_sample[2] = log2(ln2*total_error_2/data_len) ST = data_len + jmp short .rbps_3 +.total_error_2_is_0: + mov [ebx + 8], eax ; residual_bits_per_sample[2] = 0.0 +.rbps_3: + test esi, esi + jz .total_error_3_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], esi + mov [esp + 4], eax ; [esp] = (uint64)total_error_3 + fild qword [esp] ; ST = total_error_3 1.0 data_len + fdiv st2 ; ST = total_error_3/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_3/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_3/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_3/data_len) data_len + fstp dword [ebx + 12] ; residual_bits_per_sample[3] = log2(ln2*total_error_3/data_len) ST = data_len + jmp short .rbps_4 +.total_error_3_is_0: + mov [ebx + 12], eax ; residual_bits_per_sample[3] = 0.0 +.rbps_4: + test edi, edi + jz .total_error_4_is_0 + fld1 ; ST = 1.0 data_len + mov [esp], edi + mov [esp + 4], eax ; [esp] = (uint64)total_error_4 + fild qword [esp] ; ST = total_error_4 1.0 data_len + fdiv st2 ; ST = total_error_4/data_len 1.0 data_len + fldln2 ; ST = ln2 total_error_4/data_len 1.0 data_len + fmulp st1 ; ST = ln2*total_error_4/data_len 1.0 data_len + fyl2x ; ST = log2(ln2*total_error_4/data_len) data_len + fstp dword [ebx + 16] ; residual_bits_per_sample[4] = log2(ln2*total_error_4/data_len) ST = data_len + jmp short .rbps_end +.total_error_4_is_0: + mov [ebx + 16], eax ; residual_bits_per_sample[4] = 0.0 +.rbps_end: + fstp st0 ; ST = [empty] + jmp short .end +.data_len_is_0: + ; data_len == 0, so residual_bits_per_sample[*] = 0.0 + mov ecx, 5 ; eax still == 0, ecx = # 
of dwords of 0 to store + mov edi, [esp + 44] + rep stosd + +.end: + mov eax, ebp ; return order + add esp, byte 16 + pop edi + pop esi + pop ebx + pop ebp + ret + +end diff --git a/src/libFLAC/ia32/lpc_asm-unrolled.nasm b/src/libFLAC/ia32/lpc_asm-unrolled.nasm new file mode 100644 index 0000000..cfc6ef2 --- /dev/null +++ b/src/libFLAC/ia32/lpc_asm-unrolled.nasm @@ -0,0 +1,770 @@ +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001 Josh Coalson +; +; This library is free software; you can redistribute it and/or +; modify it under the terms of the GNU Library General Public +; License as published by the Free Software Foundation; either +; version 2 of the License, or (at your option) any later version. +; +; This library is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Library General Public License for more details. +; +; You should have received a copy of the GNU Library General Public +; License along with this library; if not, write to the +; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +; Boston, MA 02111-1307, USA. + +; [CR] is a note to flag that the instruction can be easily reordered + +%include "nasm.h" + + data_section + +cglobal FLAC__lpc_compute_autocorrelation_asm + + code_section + +; ********************************************************************** +; +; void FLAC__lpc_compute_autocorrelation_asm(const real data[], unsigned data_len, unsigned lag, real autoc[]) +; { +; real d; +; unsigned sample, coeff; +; const unsigned limit = data_len - lag; +; +; assert(lag > 0); +; assert(lag <= data_len); +; +; for(coeff = 0; coeff < lag; coeff++) +; autoc[coeff] = 0.0; +; for(sample = 0; sample <= limit; sample++){ +; d = data[sample]; +; for(coeff = 0; coeff < lag; coeff++) +; autoc[coeff] += d * data[sample+coeff]; +; } +; for(; sample < data_len; sample++){ +; d = data[sample]; +; for(coeff = 0; coeff < data_len - sample; coeff++) +; autoc[coeff] += d * data[sample+coeff]; +; } +; } +; +FLAC__lpc_compute_autocorrelation_asm: + + push ebp + lea ebp, [esp + 8] + push ebx + push esi + push edi + + mov edx, [ebp + 8] ; edx == lag + mov ecx, [ebp + 4] ; ecx == data_len + mov esi, [ebp] ; esi == data + mov edi, [ebp + 12] ; edi == autoc + + cmp edx, 1 + ja short .lag_above_1 +.lag_eq_1: + fldz ; will accumulate autoc[0] + ALIGN 16 +.lag_1_loop: + fld dword [esi] + add esi, byte 4 ; sample++ + fmul st0, st0 + faddp st1, st0 + dec ecx + jnz .lag_1_loop + fstp dword [edi] + jmp .end + +.lag_above_1: + cmp edx, 2 + ja short .lag_above_2 +.lag_eq_2: + fldz ; will accumulate autoc[1] + dec ecx + fldz ; will accumulate autoc[0] + fld dword [esi] + ALIGN 16 +.lag_2_loop: + add esi, byte 4 ; [CR] sample++ + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi] + fmul st1, st0 + fxch + faddp st3, st0 ; add to autoc[1] + dec ecx + jnz .lag_2_loop + ; clean up the leftovers + fmul st0, st0 + faddp st1, st0 ; add to autoc[0] + fstp dword [edi] + fstp dword [edi + 4] + jmp .end + +.lag_above_2: + cmp edx, 3 + ja short .lag_above_3 +.lag_eq_3: + fldz ; will accumulate autoc[2] + dec ecx + fldz ; will accumulate autoc[1] + dec ecx + fldz ; will accumulate autoc[0] + ALIGN 16 +.lag_3_loop: + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmulp st1, st0 + add esi, byte 4 ; [CR] 
sample++ + faddp st3, st0 ; add to autoc[2] + dec ecx + jnz .lag_3_loop + ; clean up the leftovers + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st1, st0 + fxch + faddp st3, st0 ; add to autoc[1] + fmul st0, st0 + faddp st1, st0 ; add to autoc[0] + fstp dword [edi] + fstp dword [edi + 4] + fstp dword [edi + 8] + jmp .end + +.lag_above_3: + cmp edx, 4 + ja near .lag_above_4 +.lag_eq_4: + fldz ; will accumulate autoc[3] + dec ecx + fldz ; will accumulate autoc[2] + dec ecx + fldz ; will accumulate autoc[1] + dec ecx + fldz ; will accumulate autoc[0] + ALIGN 16 +.lag_4_loop: + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmul st0, st1 + faddp st4, st0 ; add to autoc[2] + fld dword [esi + 12] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st4, st0 ; add to autoc[3] + dec ecx + jnz .lag_4_loop + ; clean up the leftovers + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st3, st0 ; add to autoc[2] + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st1, st0 + fxch + faddp st3, st0 ; add to autoc[1] + fmul st0, st0 + faddp st1, st0 ; add to autoc[0] + fstp dword [edi] + fstp dword [edi + 4] + fstp dword [edi + 8] + fstp dword [edi + 12] + jmp .end + +.lag_above_4: + cmp edx, 5 + ja near .lag_above_5 +.lag_eq_5: + fldz ; will accumulate autoc[4] + fldz ; will accumulate autoc[3] + fldz ; will accumulate autoc[2] + fldz ; will accumulate autoc[1] + fldz ; will accumulate autoc[0] + sub ecx, byte 4 + ALIGN 16 +.lag_5_loop: + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmul st0, st1 + faddp st4, st0 ; add to autoc[2] + fld dword [esi + 12] + fmul st0, st1 + faddp st5, st0 ; add to autoc[3] + fld dword [esi + 16] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st5, st0 ; add to autoc[4] + dec ecx + jnz .lag_5_loop + ; clean up the leftovers + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmul st0, st1 + faddp st4, st0 ; add to autoc[2] + fld dword [esi + 12] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st4, st0 ; add to autoc[3] + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st3, st0 ; add to autoc[2] + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st1, st0 + fxch + faddp st3, st0 ; add to autoc[1] + fmul st0, st0 + faddp st1, st0 ; add to autoc[0] + fstp dword [edi] + fstp dword [edi + 4] + fstp dword [edi + 8] + fstp dword [edi + 12] + fstp dword [edi + 16] + jmp .end + +.lag_above_5: + cmp edx, 6 + ja .lag_above_6 +.lag_eq_6: + fldz ; will accumulate autoc[5] + fldz ; will accumulate autoc[4] + fldz ; will accumulate autoc[3] + fldz ; will accumulate autoc[2] + fldz ; will accumulate autoc[1] + fldz ; will accumulate autoc[0] + sub ecx, byte 5 + 
ALIGN 16 +.lag_6_loop: + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmul st0, st1 + faddp st4, st0 ; add to autoc[2] + fld dword [esi + 12] + fmul st0, st1 + faddp st5, st0 ; add to autoc[3] + fld dword [esi + 16] + fmul st0, st1 + faddp st6, st0 ; add to autoc[4] + fld dword [esi + 20] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st6, st0 ; add to autoc[5] + dec ecx + jnz .lag_6_loop + ; clean up the leftovers + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmul st0, st1 + faddp st4, st0 ; add to autoc[2] + fld dword [esi + 12] + fmul st0, st1 + faddp st5, st0 ; add to autoc[3] + fld dword [esi + 16] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st5, st0 ; add to autoc[4] + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmul st0, st1 + faddp st4, st0 ; add to autoc[2] + fld dword [esi + 12] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st4, st0 ; add to autoc[3] + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st0, st1 + faddp st3, st0 ; add to autoc[1] + fld dword [esi + 8] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st3, st0 ; add to autoc[2] + fld dword [esi] + fld st0 + fmul st0, st0 + faddp st2, st0 ; add to autoc[0] + fld dword [esi + 4] + fmul st1, st0 + fxch + faddp st3, st0 ; add to autoc[1] + fmul st0, st0 + faddp st1, st0 ; add to autoc[0] + fstp dword [edi] + fstp dword [edi + 4] + fstp dword [edi + 8] + fstp dword [edi + 12] + fstp dword [edi + 16] + fstp dword [edi + 20] + jmp .end + +.lag_above_6: + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + lea ecx, [edx * 2] ; ecx = # of dwords of 0 to write + xor eax, eax + rep stosd + mov ecx, [ebp + 4] ; ecx == data_len + mov edi, [ebp + 12] ; edi == autoc + ; const unsigned limit = data_len - lag; + sub ecx, edx + inc ecx ; we are looping <= limit so we add one to the counter + ; for(sample = 0; sample <= limit; sample++){ + ; d = data[sample]; + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] += d * data[sample+coeff]; + ; } + xor eax, eax ; eax == sample <- 0 + ALIGN 16 +.outer_loop: + push eax ; save sample + fld dword [esi + eax * 4] ; ST = d <- data[sample] + mov ebx, eax ; ebx == sample+coeff <- sample + mov edx, [ebp + 8] ; edx <- lag + xor eax, eax ; eax == coeff <- 0 + ALIGN 16 +.inner_loop: + fld st0 ; ST = d d + fmul dword [esi + ebx * 4] ; ST = d*data[sample+coeff] d + fadd dword [edi + eax * 4] ; ST = autoc[coeff]+d*data[sample+coeff] d + fstp dword [edi + eax * 4] ; autoc[coeff]+=d*data[sample+coeff] ST = d + inc ebx ; (sample+coeff)++ + inc eax ; coeff++ + dec edx + jnz .inner_loop + pop eax ; restore sample + fstp st0 ; pop d, ST = empty + inc eax ; sample++ + loop .outer_loop + ; for(; sample < data_len; sample++){ + ; d = data[sample]; + ; for(coeff = 0; coeff < data_len - sample; coeff++) + ; autoc[coeff] += d * data[sample+coeff]; + ; } + mov ecx, [ebp + 8] ; ecx <- lag + dec ecx ; ecx <- lag - 1 + jz .outer_end ; skip loop if 0 +.outer_loop2: + push eax ; save sample + fld dword [esi + eax * 4] ; ST = d <- data[sample] + mov ebx, eax ; ebx == sample+coeff <- sample + mov edx, [ebp + 4] ; edx <- 
data_len + sub edx, eax ; edx <- data_len-sample + xor eax, eax ; eax == coeff <- 0 +.inner_loop2: + fld st0 ; ST = d d + fmul dword [esi + ebx * 4] ; ST = d*data[sample+coeff] d + fadd dword [edi + eax * 4] ; ST = autoc[coeff]+d*data[sample+coeff] d + fstp dword [edi + eax * 4] ; autoc[coeff]+=d*data[sample+coeff] ST = d + inc ebx ; (sample+coeff)++ + inc eax ; coeff++ + dec edx + jnz .inner_loop2 + pop eax ; restore sample + fstp st0 ; pop d, ST = empty + inc eax ; sample++ + loop .outer_loop2 +.outer_end: + jmp .end + +.lag_eq_6_1: + mov ecx, [ebp + 4] ; ecx == data_len + mov esi, [ebp] ; esi == data + mov edi, [ebp + 12] ; edi == autoc + fldz ; will accumulate autoc[6] + sub ecx, byte 6 + ALIGN 16 +.lag_6_1_loop: + fld dword [esi] + fld dword [esi + 24] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st1, st0 ; add to autoc[6] + dec ecx + jnz .lag_6_1_loop + fstp dword [edi + 24] + jmp .end + +.lag_eq_6_2: + mov ecx, [ebp + 4] ; ecx == data_len + mov esi, [ebp] ; esi == data + mov edi, [ebp + 12] ; edi == autoc + fldz ; will accumulate autoc[7] + fldz ; will accumulate autoc[6] + sub ecx, byte 7 + ALIGN 16 +.lag_6_2_loop: + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st2, st0 ; add to autoc[7] + dec ecx + jnz .lag_6_2_loop + ; clean up the leftovers + fld dword [esi] + fld dword [esi + 24] + fmulp st1, st0 + faddp st1, st0 ; add to autoc[6] + fstp dword [edi + 24] + fstp dword [edi + 28] + jmp .end + +.lag_eq_6_3: + mov ecx, [ebp + 4] ; ecx == data_len + mov esi, [ebp] ; esi == data + mov edi, [ebp + 12] ; edi == autoc + fldz ; will accumulate autoc[8] + fldz ; will accumulate autoc[7] + fldz ; will accumulate autoc[6] + sub ecx, byte 8 + ALIGN 16 +.lag_6_3_loop: + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st3, st0 ; add to autoc[8] + dec ecx + jnz .lag_6_3_loop + ; clean up the leftovers + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st2, st0 ; add to autoc[7] + fld dword [esi] + fld dword [esi + 24] + fmulp st1, st0 + faddp st1, st0 ; add to autoc[6] + fstp dword [edi + 24] + fstp dword [edi + 28] + fstp dword [edi + 32] + jmp .end + +.lag_eq_6_4: + mov ecx, [ebp + 4] ; ecx == data_len + mov esi, [ebp] ; esi == data + mov edi, [ebp + 12] ; edi == autoc + fldz ; will accumulate autoc[9] + fldz ; will accumulate autoc[8] + fldz ; will accumulate autoc[7] + fldz ; will accumulate autoc[6] + sub ecx, byte 9 + ALIGN 16 +.lag_6_4_loop: + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmul st0, st1 + faddp st4, st0 ; add to autoc[8] + fld dword [esi + 36] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st4, st0 ; add to autoc[9] + dec ecx + jnz .lag_6_4_loop + ; clean up the leftovers + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st3, st0 ; add to autoc[8] + fld dword [esi] + fld 
dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st2, st0 ; add to autoc[7] + fld dword [esi] + fld dword [esi + 24] + fmulp st1, st0 + faddp st1, st0 ; add to autoc[6] + fstp dword [edi + 24] + fstp dword [edi + 28] + fstp dword [edi + 32] + fstp dword [edi + 36] + jmp .end + +.lag_eq_6_5: + mov ecx, [ebp + 4] ; ecx == data_len + mov esi, [ebp] ; esi == data + mov edi, [ebp + 12] ; edi == autoc + fldz ; will accumulate autoc[10] + fldz ; will accumulate autoc[9] + fldz ; will accumulate autoc[8] + fldz ; will accumulate autoc[7] + fldz ; will accumulate autoc[6] + sub ecx, byte 10 + ALIGN 16 +.lag_6_5_loop: + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmul st0, st1 + faddp st4, st0 ; add to autoc[8] + fld dword [esi + 36] + fmul st0, st1 + faddp st5, st0 ; add to autoc[9] + fld dword [esi + 40] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st5, st0 ; add to autoc[10] + dec ecx + jnz .lag_6_5_loop + ; clean up the leftovers + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmul st0, st1 + faddp st4, st0 ; add to autoc[8] + fld dword [esi + 36] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st4, st0 ; add to autoc[9] + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st3, st0 ; add to autoc[8] + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st2, st0 ; add to autoc[7] + fld dword [esi] + fld dword [esi + 24] + fmulp st1, st0 + faddp st1, st0 ; add to autoc[6] + fstp dword [edi + 24] + fstp dword [edi + 28] + fstp dword [edi + 32] + fstp dword [edi + 36] + fstp dword [edi + 40] + jmp .end + +.lag_eq_6_6: + mov ecx, [ebp + 4] ; ecx == data_len + mov esi, [ebp] ; esi == data + mov edi, [ebp + 12] ; edi == autoc + fldz ; will accumulate autoc[11] + fldz ; will accumulate autoc[10] + fldz ; will accumulate autoc[9] + fldz ; will accumulate autoc[8] + fldz ; will accumulate autoc[7] + fldz ; will accumulate autoc[6] + sub ecx, byte 11 + ALIGN 16 +.lag_6_6_loop: + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmul st0, st1 + faddp st4, st0 ; add to autoc[8] + fld dword [esi + 36] + fmul st0, st1 + faddp st5, st0 ; add to autoc[9] + fld dword [esi + 40] + fmul st0, st1 + faddp st6, st0 ; add to autoc[10] + fld dword [esi + 44] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st6, st0 ; add to autoc[11] + dec ecx + jnz .lag_6_6_loop + ; clean up the leftovers + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmul st0, st1 + faddp st4, st0 ; add to autoc[8] + fld dword [esi + 36] + fmul st0, st1 + faddp st5, st0 ; add to autoc[9] + fld dword [esi + 40] + fmulp st1, st0 + add esi, byte 4 ; 
[CR] sample++ + faddp st5, st0 ; add to autoc[10] + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmul st0, st1 + faddp st4, st0 ; add to autoc[8] + fld dword [esi + 36] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st4, st0 ; add to autoc[9] + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmul st0, st1 + faddp st3, st0 ; add to autoc[7] + fld dword [esi + 32] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st3, st0 ; add to autoc[8] + fld dword [esi] + fld dword [esi + 24] + fmul st0, st1 + faddp st2, st0 ; add to autoc[6] + fld dword [esi + 28] + fmulp st1, st0 + add esi, byte 4 ; [CR] sample++ + faddp st2, st0 ; add to autoc[7] + fld dword [esi] + fld dword [esi + 24] + fmulp st1, st0 + faddp st1, st0 ; add to autoc[6] + fstp dword [edi + 24] + fstp dword [edi + 28] + fstp dword [edi + 32] + fstp dword [edi + 36] + fstp dword [edi + 40] + fstp dword [edi + 44] + jmp .end + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +end diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm new file mode 100644 index 0000000..eddffcd --- /dev/null +++ b/src/libFLAC/ia32/lpc_asm.nasm @@ -0,0 +1,1348 @@ +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001 Josh Coalson +; +; This library is free software; you can redistribute it and/or +; modify it under the terms of the GNU Library General Public +; License as published by the Free Software Foundation; either +; version 2 of the License, or (at your option) any later version. +; +; This library is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Library General Public License for more details. +; +; You should have received a copy of the GNU Library General Public +; License along with this library; if not, write to the +; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +; Boston, MA 02111-1307, USA. 
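+
+; A note on the technique used by FLAC__lpc_compute_autocorrelation_asm_i386
+; below: instead of a branching inner loop over coeff, it computes a jump
+; target from lag and lands that many 11-byte taps before the end of a fully
+; unrolled multiply-accumulate chain, the assembly analogue of Duff's device.
+; A hedged C sketch of the same idea (illustrative only):
+;
+; switch(lag) { /* deliberate fall-through, one case per tap */
+; case 32: autoc[31] += d * data[sample+31];
+; case 31: autoc[30] += d * data[sample+30];
+; /* ... */
+; case  2: autoc[1] += d * data[sample+1];
+; case  1: autoc[0] += d * data[sample];
+; }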
+ +%include "nasm.h" + + data_section + +cglobal FLAC__lpc_compute_autocorrelation_asm_i386 +cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_4 +cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_8 +cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_12 +cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386 +cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx +cglobal FLAC__lpc_restore_signal_asm_i386 +cglobal FLAC__lpc_restore_signal_asm_i386_mmx + + code_section + +; ********************************************************************** +; +; void FLAC__lpc_compute_autocorrelation_asm(const real data[], unsigned data_len, unsigned lag, real autoc[]) +; { +; real d; +; unsigned sample, coeff; +; const unsigned limit = data_len - lag; +; +; FLAC__ASSERT(lag > 0); +; FLAC__ASSERT(lag <= data_len); +; +; for(coeff = 0; coeff < lag; coeff++) +; autoc[coeff] = 0.0; +; for(sample = 0; sample <= limit; sample++) { +; d = data[sample]; +; for(coeff = 0; coeff < lag; coeff++) +; autoc[coeff] += d * data[sample+coeff]; +; } +; for(; sample < data_len; sample++) { +; d = data[sample]; +; for(coeff = 0; coeff < data_len - sample; coeff++) +; autoc[coeff] += d * data[sample+coeff]; +; } +; } +; + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_i386 + ;[esp + 24] == autoc[] + ;[esp + 20] == lag + ;[esp + 16] == data_len + ;[esp + 12] == data[] + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 33) + ;ASSERT(lag <= data_len) + +.begin: + push esi + push edi + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + mov edi, [esp + 24] ; edi == autoc + mov ecx, [esp + 20] ; ecx = # of dwords (=lag) of 0 to write + xor eax, eax + rep stosd + + ; const unsigned limit = data_len - lag; + mov eax, [esp + 20] ; eax == lag + mov ecx, [esp + 16] + sub ecx, eax ; ecx == limit + + mov edi, [esp + 24] ; edi == autoc + mov esi, [esp + 12] ; esi == data + inc ecx ; we are looping <= limit so we add one to the counter + + ; for(sample = 0; sample <= limit; sample++) { + ; d = data[sample]; + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] += d * data[sample+coeff]; + ; } + fld dword [esi] ; ST = d <- data[sample] + ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) + lea edx, [eax + eax*2] + neg edx + lea edx, [eax + edx*4 + .jumper1_0] + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + cmp eax, 33 + jne .loop1_start + sub edx, byte 9 ; compensate for the longer opcodes on the first iteration +.loop1_start: + jmp edx + + fld st0 ; ST = d d + fmul dword [esi + (32*4)] ; ST = d*data[sample+32] d WATCHOUT: not a byte displacement here! + fadd dword [edi + (32*4)] ; ST = autoc[32]+d*data[sample+32] d WATCHOUT: not a byte displacement here! + fstp dword [edi + (32*4)] ; autoc[32]+=d*data[sample+32] ST = d WATCHOUT: not a byte displacement here! 
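+	; Each 4-instruction group below is one 11-byte tap of the unrolled
+	; chain: autoc[k] += d * data[sample+k]. The computed jump above enters
+	; the chain at tap k = lag-1, so exactly lag taps execute per sample.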
+ fld st0 ; ST = d d + fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d + fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d + fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d + fld st0 ; ST = d d + fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d + fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d + fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d + fld st0 ; ST = d d + fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d + fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d + fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d + fld st0 ; ST = d d + fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d + fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d + fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d + fld st0 ; ST = d d + fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d + fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d + fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d + fld st0 ; ST = d d + fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d + fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d + fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d + fld st0 ; ST = d d + fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d + fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d + fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d + fld st0 ; ST = d d + fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d + fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d + fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d + fld st0 ; ST = d d + fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d + fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d + fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d + fld st0 ; ST = d d + fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d + fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d + fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d + fld st0 ; ST = d d + fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d + fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d + fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d + fld st0 ; ST = d d + fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d + fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d + fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d + fld st0 ; ST = d d + fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d + fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d + fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d + fld st0 ; ST = d d + fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d + fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d + fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d + fld st0 ; ST = d d + fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d + fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d + fstp dword [edi + (17*4)] ; autoc[17]+=d*data[sample+17] ST = d + fld st0 ; ST = d d + fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d + fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d + fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d + fld st0 ; ST = d d + fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d + fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d + fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d + fld st0 ; ST = d d + fmul dword 
[esi + (14*4)] ; ST = d*data[sample+14] d + fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d + fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d + fld st0 ; ST = d d + fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d + fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d + fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d + fld st0 ; ST = d d + fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d + fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d + fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d + fld st0 ; ST = d d + fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d + fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d + fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d + fld st0 ; ST = d d + fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d + fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d + fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d + fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d + fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d + fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d + fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d + fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d + fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d + fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d + fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d + fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d + fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d + fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d + fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d + fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d + fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d + fld st0 ; ST = d d + fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! + fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! + fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! +.jumper1_0: + + fstp st0 ; pop d, ST = empty + add esi, byte 4 ; sample++ + dec ecx + jz .loop1_end + fld dword [esi] ; ST = d <- data[sample] + jmp edx +.loop1_end: + + ; for(; sample < data_len; sample++) { + ; d = data[sample]; + ; for(coeff = 0; coeff < data_len - sample; coeff++) + ; autoc[coeff] += d * data[sample+coeff]; + ; } + mov ecx, [esp + 20] ; ecx <- lag + dec ecx ; ecx <- lag - 1 + jz near .end ; skip loop if 0 (i.e. 
lag == 1) + + fld dword [esi] ; ST = d <- data[sample] + mov eax, ecx ; eax <- lag - 1 == data_len - sample the first time through + ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) + lea edx, [eax + eax*2] + neg edx + lea edx, [eax + edx*4 + .jumper2_0] + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + inc edx ; compensate for the shorter opcode on the last iteration + jmp edx + + fld st0 ; ST = d d + fmul dword [esi + (31*4)] ; ST = d*data[sample+31] d + fadd dword [edi + (31*4)] ; ST = autoc[31]+d*data[sample+31] d + fstp dword [edi + (31*4)] ; autoc[31]+=d*data[sample+31] ST = d + fld st0 ; ST = d d + fmul dword [esi + (30*4)] ; ST = d*data[sample+30] d + fadd dword [edi + (30*4)] ; ST = autoc[30]+d*data[sample+30] d + fstp dword [edi + (30*4)] ; autoc[30]+=d*data[sample+30] ST = d + fld st0 ; ST = d d + fmul dword [esi + (29*4)] ; ST = d*data[sample+29] d + fadd dword [edi + (29*4)] ; ST = autoc[29]+d*data[sample+29] d + fstp dword [edi + (29*4)] ; autoc[29]+=d*data[sample+29] ST = d + fld st0 ; ST = d d + fmul dword [esi + (28*4)] ; ST = d*data[sample+28] d + fadd dword [edi + (28*4)] ; ST = autoc[28]+d*data[sample+28] d + fstp dword [edi + (28*4)] ; autoc[28]+=d*data[sample+28] ST = d + fld st0 ; ST = d d + fmul dword [esi + (27*4)] ; ST = d*data[sample+27] d + fadd dword [edi + (27*4)] ; ST = autoc[27]+d*data[sample+27] d + fstp dword [edi + (27*4)] ; autoc[27]+=d*data[sample+27] ST = d + fld st0 ; ST = d d + fmul dword [esi + (26*4)] ; ST = d*data[sample+26] d + fadd dword [edi + (26*4)] ; ST = autoc[26]+d*data[sample+26] d + fstp dword [edi + (26*4)] ; autoc[26]+=d*data[sample+26] ST = d + fld st0 ; ST = d d + fmul dword [esi + (25*4)] ; ST = d*data[sample+25] d + fadd dword [edi + (25*4)] ; ST = autoc[25]+d*data[sample+25] d + fstp dword [edi + (25*4)] ; autoc[25]+=d*data[sample+25] ST = d + fld st0 ; ST = d d + fmul dword [esi + (24*4)] ; ST = d*data[sample+24] d + fadd dword [edi + (24*4)] ; ST = autoc[24]+d*data[sample+24] d + fstp dword [edi + (24*4)] ; autoc[24]+=d*data[sample+24] ST = d + fld st0 ; ST = d d + fmul dword [esi + (23*4)] ; ST = d*data[sample+23] d + fadd dword [edi + (23*4)] ; ST = autoc[23]+d*data[sample+23] d + fstp dword [edi + (23*4)] ; autoc[23]+=d*data[sample+23] ST = d + fld st0 ; ST = d d + fmul dword [esi + (22*4)] ; ST = d*data[sample+22] d + fadd dword [edi + (22*4)] ; ST = autoc[22]+d*data[sample+22] d + fstp dword [edi + (22*4)] ; autoc[22]+=d*data[sample+22] ST = d + fld st0 ; ST = d d + fmul dword [esi + (21*4)] ; ST = d*data[sample+21] d + fadd dword [edi + (21*4)] ; ST = autoc[21]+d*data[sample+21] d + fstp dword [edi + (21*4)] ; autoc[21]+=d*data[sample+21] ST = d + fld st0 ; ST = d d + fmul dword [esi + (20*4)] ; ST = d*data[sample+20] d + fadd dword [edi + (20*4)] ; ST = autoc[20]+d*data[sample+20] d + fstp dword [edi + (20*4)] ; autoc[20]+=d*data[sample+20] ST = d + fld st0 ; ST = d d + fmul dword [esi + (19*4)] ; ST = d*data[sample+19] d + fadd dword [edi + (19*4)] ; ST = autoc[19]+d*data[sample+19] d + fstp dword [edi + (19*4)] ; autoc[19]+=d*data[sample+19] ST = d + fld st0 ; ST = d d + fmul dword [esi + (18*4)] ; ST = d*data[sample+18] d + fadd dword [edi + (18*4)] ; ST = autoc[18]+d*data[sample+18] d + fstp dword [edi + (18*4)] ; autoc[18]+=d*data[sample+18] ST = d + fld st0 ; ST = d d + fmul dword [esi + (17*4)] ; ST = d*data[sample+17] d + fadd dword [edi + (17*4)] ; ST = autoc[17]+d*data[sample+17] d + fstp dword [edi 
+ (17*4)] ; autoc[17]+=d*data[sample+17] ST = d + fld st0 ; ST = d d + fmul dword [esi + (16*4)] ; ST = d*data[sample+16] d + fadd dword [edi + (16*4)] ; ST = autoc[16]+d*data[sample+16] d + fstp dword [edi + (16*4)] ; autoc[16]+=d*data[sample+16] ST = d + fld st0 ; ST = d d + fmul dword [esi + (15*4)] ; ST = d*data[sample+15] d + fadd dword [edi + (15*4)] ; ST = autoc[15]+d*data[sample+15] d + fstp dword [edi + (15*4)] ; autoc[15]+=d*data[sample+15] ST = d + fld st0 ; ST = d d + fmul dword [esi + (14*4)] ; ST = d*data[sample+14] d + fadd dword [edi + (14*4)] ; ST = autoc[14]+d*data[sample+14] d + fstp dword [edi + (14*4)] ; autoc[14]+=d*data[sample+14] ST = d + fld st0 ; ST = d d + fmul dword [esi + (13*4)] ; ST = d*data[sample+13] d + fadd dword [edi + (13*4)] ; ST = autoc[13]+d*data[sample+13] d + fstp dword [edi + (13*4)] ; autoc[13]+=d*data[sample+13] ST = d + fld st0 ; ST = d d + fmul dword [esi + (12*4)] ; ST = d*data[sample+12] d + fadd dword [edi + (12*4)] ; ST = autoc[12]+d*data[sample+12] d + fstp dword [edi + (12*4)] ; autoc[12]+=d*data[sample+12] ST = d + fld st0 ; ST = d d + fmul dword [esi + (11*4)] ; ST = d*data[sample+11] d + fadd dword [edi + (11*4)] ; ST = autoc[11]+d*data[sample+11] d + fstp dword [edi + (11*4)] ; autoc[11]+=d*data[sample+11] ST = d + fld st0 ; ST = d d + fmul dword [esi + (10*4)] ; ST = d*data[sample+10] d + fadd dword [edi + (10*4)] ; ST = autoc[10]+d*data[sample+10] d + fstp dword [edi + (10*4)] ; autoc[10]+=d*data[sample+10] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 9*4)] ; ST = d*data[sample+9] d + fadd dword [edi + ( 9*4)] ; ST = autoc[9]+d*data[sample+9] d + fstp dword [edi + ( 9*4)] ; autoc[9]+=d*data[sample+9] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 8*4)] ; ST = d*data[sample+8] d + fadd dword [edi + ( 8*4)] ; ST = autoc[8]+d*data[sample+8] d + fstp dword [edi + ( 8*4)] ; autoc[8]+=d*data[sample+8] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 7*4)] ; ST = d*data[sample+7] d + fadd dword [edi + ( 7*4)] ; ST = autoc[7]+d*data[sample+7] d + fstp dword [edi + ( 7*4)] ; autoc[7]+=d*data[sample+7] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 6*4)] ; ST = d*data[sample+6] d + fadd dword [edi + ( 6*4)] ; ST = autoc[6]+d*data[sample+6] d + fstp dword [edi + ( 6*4)] ; autoc[6]+=d*data[sample+6] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 5*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 5*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 5*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 4*4)] ; ST = d*data[sample+4] d + fadd dword [edi + ( 4*4)] ; ST = autoc[4]+d*data[sample+4] d + fstp dword [edi + ( 4*4)] ; autoc[4]+=d*data[sample+4] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 3*4)] ; ST = d*data[sample+3] d + fadd dword [edi + ( 3*4)] ; ST = autoc[3]+d*data[sample+3] d + fstp dword [edi + ( 3*4)] ; autoc[3]+=d*data[sample+3] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 2*4)] ; ST = d*data[sample+2] d + fadd dword [edi + ( 2*4)] ; ST = autoc[2]+d*data[sample+2] d + fstp dword [edi + ( 2*4)] ; autoc[2]+=d*data[sample+2] ST = d + fld st0 ; ST = d d + fmul dword [esi + ( 1*4)] ; ST = d*data[sample+1] d + fadd dword [edi + ( 1*4)] ; ST = autoc[1]+d*data[sample+1] d + fstp dword [edi + ( 1*4)] ; autoc[1]+=d*data[sample+1] ST = d + fld st0 ; ST = d d + fmul dword [esi] ; ST = d*data[sample] d WATCHOUT: no displacement byte here! + fadd dword [edi] ; ST = autoc[0]+d*data[sample] d WATCHOUT: no displacement byte here! 
+ fstp dword [edi] ; autoc[0]+=d*data[sample] ST = d WATCHOUT: no displacement byte here! +.jumper2_0: + + fstp st0 ; pop d, ST = empty + add esi, byte 4 ; sample++ + dec ecx + jz .loop2_end + add edx, byte 11 ; adjust our inner loop counter by adjusting the jump target + fld dword [esi] ; ST = d <- data[sample] + jmp edx +.loop2_end: + +.end: + pop edi + pop esi + ret + + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_4 + ;[esp + 16] == autoc[] + ;[esp + 12] == lag + ;[esp + 8] == data_len + ;[esp + 4] == data[] + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 4) + ;ASSERT(lag <= data_len) + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + xorps xmm5, xmm5 + + mov edx, [esp + 8] ; edx == data_len + mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] + + movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] + add eax, 4 + movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] + shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] +.warmup: ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample] + mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 + addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 + dec edx + jz .loop_end + ALIGN 16 +.loop_start: + ; start by reading the next sample + movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] + add eax, 4 + shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample] + shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float + movss xmm2, xmm0 + mulps xmm0, xmm2 ; xmm0 = xmm0 * xmm2 + addps xmm5, xmm0 ; xmm5 += xmm0 * xmm2 + dec edx + jnz .loop_start +.loop_end: + ; store autoc + mov edx, [esp + 16] ; edx == autoc + movups [edx], xmm5 + +.end: + ret + + ALIGN 16 +cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_8 + ;[esp + 16] == autoc[] + ;[esp + 12] == lag + ;[esp + 8] == data_len + ;[esp + 4] == data[] + + ;ASSERT(lag > 0) + ;ASSERT(lag <= 8) + ;ASSERT(lag <= data_len) + + ; for(coeff = 0; coeff < lag; coeff++) + ; autoc[coeff] = 0.0; + xorps xmm5, xmm5 + xorps xmm6, xmm6 + + mov edx, [esp + 8] ; edx == data_len + mov eax, [esp + 4] ; eax == &data[sample] <- &data[0] + + movss xmm0, [eax] ; xmm0 = 0,0,0,data[0] + add eax, 4 + movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0] + shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] + movaps xmm1, xmm0 ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0] + xorps xmm3, xmm3 ; xmm3 = 0,0,0,0 +.warmup: ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample] + mulps xmm0, xmm2 + mulps xmm1, xmm3 ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 + addps xmm5, xmm0 + addps xmm6, xmm1 ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 + dec edx + jz .loop_end + ALIGN 16 +.loop_start: + ; start by reading the next sample + movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample] + ; here we reorder the instructions; see the (#) indexes for a logical order + shufps xmm2, xmm2, 93h ; (3) 93h=2-1-0-3 => xmm2 gets rotated left by one float + add eax, 4 ; (0) + shufps xmm3, xmm3, 93h ; (4) 93h=2-1-0-3 => xmm3 gets rotated left by one float + shufps xmm0, xmm0, 0 ; (1) xmm0 = data[sample],data[sample],data[sample],data[sample] + movss xmm3, xmm2 ; (5) + movaps xmm1, xmm0 ; (2) xmm1 = data[sample],data[sample],data[sample],data[sample] + movss xmm2, xmm0 ; (6) + mulps xmm1, xmm3 ; (8) + mulps xmm0, xmm2 ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2 + addps xmm6, xmm1 ; (10) + addps xmm5, xmm0 ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2 + dec edx + jnz 
.loop_start
+.loop_end:
+ ; store autoc
+ mov edx, [esp + 16] ; edx == autoc
+ movups [edx], xmm5 ; autoc[0..3]
+ movups [edx + 16], xmm6 ; autoc[4..7]
+
+.end:
+ ret
+
+ ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_12
+ ;[esp + 16] == autoc[]
+ ;[esp + 12] == lag
+ ;[esp + 8] == data_len
+ ;[esp + 4] == data[]
+
+ ;ASSERT(lag > 0)
+ ;ASSERT(lag <= 12)
+ ;ASSERT(lag <= data_len)
+
+ ; for(coeff = 0; coeff < lag; coeff++)
+ ; autoc[coeff] = 0.0;
+ xorps xmm5, xmm5
+ xorps xmm6, xmm6
+ xorps xmm7, xmm7
+
+ mov edx, [esp + 8] ; edx == data_len
+ mov eax, [esp + 4] ; eax == &data[sample] <- &data[0]
+
+ movss xmm0, [eax] ; xmm0 = 0,0,0,data[0]
+ add eax, 4
+ movaps xmm2, xmm0 ; xmm2 = 0,0,0,data[0]
+ shufps xmm0, xmm0, 0 ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+ xorps xmm3, xmm3 ; xmm3 = 0,0,0,0
+ xorps xmm4, xmm4 ; xmm4 = 0,0,0,0
+.warmup: ; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
+ movaps xmm1, xmm0
+ mulps xmm1, xmm2
+ addps xmm5, xmm1
+ movaps xmm1, xmm0
+ mulps xmm1, xmm3
+ addps xmm6, xmm1
+ mulps xmm0, xmm4
+ addps xmm7, xmm0 ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
+ dec edx
+ jz .loop_end
+ ALIGN 16
+.loop_start:
+ ; start by reading the next sample
+ movss xmm0, [eax] ; xmm0 = 0,0,0,data[sample]
+ add eax, 4
+ shufps xmm0, xmm0, 0 ; xmm0 = data[sample],data[sample],data[sample],data[sample]
+
+ ; shift xmm4:xmm3:xmm2 left by one float
+ shufps xmm2, xmm2, 93h ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
+ shufps xmm3, xmm3, 93h ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
+ shufps xmm4, xmm4, 93h ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
+ movss xmm4, xmm3
+ movss xmm3, xmm2
+ movss xmm2, xmm0
+
+ ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
+ movaps xmm1, xmm0
+ mulps xmm1, xmm2
+ addps xmm5, xmm1
+ movaps xmm1, xmm0
+ mulps xmm1, xmm3
+ addps xmm6, xmm1
+ mulps xmm0, xmm4
+ addps xmm7, xmm0
+
+ dec edx
+ jnz .loop_start
+.loop_end:
+ ; store autoc
+ mov edx, [esp + 16] ; edx == autoc
+ movups [edx], xmm5 ; autoc[0..3]
+ movups [edx + 16], xmm6 ; autoc[4..7]
+ movups [edx + 32], xmm7 ; autoc[8..11]
+
+.end:
+ ret
+
+;void FLAC__lpc_compute_residual_from_qlp_coefficients(const int32 data[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 residual[])
+;
+; for(i = 0; i < data_len; i++) {
+; sum = 0;
+; for(j = 0; j < order; j++)
+; sum += qlp_coeff[j] * data[i-j-1];
+; residual[i] = data[i] - (sum >> lp_quantization);
+; }
+;
+ ALIGN 16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
+ ;[esp + 40] residual[]
+ ;[esp + 36] lp_quantization
+ ;[esp + 32] order
+ ;[esp + 28] qlp_coeff[]
+ ;[esp + 24] data_len
+ ;[esp + 20] data[]
+
+ ;ASSERT(order > 0)
+
+ push ebp
+ push ebx
+ push esi
+ push edi
+
+ mov esi, [esp + 20] ; esi = data[]
+ mov edi, [esp + 40] ; edi = residual[]
+ mov eax, [esp + 32] ; eax = order
+ mov ebx, [esp + 24] ; ebx = data_len
+
+ test ebx, ebx
+ jz near .end ; do nothing if data_len == 0
+.begin:
+ cmp eax, byte 1
+ jg short .i_1more
+
+ mov ecx, [esp + 28]
+ mov edx, [ecx] ; edx = qlp_coeff[0]
+ mov eax, [esi - 4] ; eax = data[-1]
+ mov cl, [esp + 36] ; cl = lp_quantization
+ ALIGN 16
+.i_1_loop_i:
+ imul eax, edx
+ sar eax, cl
+ neg eax
+ add eax, [esi]
+ mov [edi], eax
+ mov eax, [esi]
+ add edi, byte 4
+ add esi, byte 4
+ dec ebx
+ jnz .i_1_loop_i
+
+ jmp .end
+
+.i_1more:
+ cmp eax, byte 32 ; for order <= 32 there is a faster routine
+ jbe short .i_32
+
+ ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
+ 
ALIGN 16 +.i_32more_loop_i: + xor ebp, ebp + mov ecx, [esp + 32] + mov edx, ecx + shl edx, 2 + add edx, [esp + 28] + neg ecx + ALIGN 16 +.i_32more_loop_j: + sub edx, byte 4 + mov eax, [edx] + imul eax, [esi + 4 * ecx] + add ebp, eax + inc ecx + jnz short .i_32more_loop_j + + mov cl, [esp + 36] + sar ebp, cl + neg ebp + add ebp, [esi] + mov [edi], ebp + add esi, byte 4 + add edi, byte 4 + + dec ebx + jnz .i_32more_loop_i + + jmp .end + +.i_32: + sub edi, esi + neg eax + lea edx, [eax + eax * 8 + .jumper_0] + inc edx + mov eax, [esp + 28] ; eax = qlp_coeff[] + xor ebp, ebp + jmp edx + + mov ecx, [eax + 124] + imul ecx, [esi - 128] + add ebp, ecx + mov ecx, [eax + 120] + imul ecx, [esi - 124] + add ebp, ecx + mov ecx, [eax + 116] + imul ecx, [esi - 120] + add ebp, ecx + mov ecx, [eax + 112] + imul ecx, [esi - 116] + add ebp, ecx + mov ecx, [eax + 108] + imul ecx, [esi - 112] + add ebp, ecx + mov ecx, [eax + 104] + imul ecx, [esi - 108] + add ebp, ecx + mov ecx, [eax + 100] + imul ecx, [esi - 104] + add ebp, ecx + mov ecx, [eax + 96] + imul ecx, [esi - 100] + add ebp, ecx + mov ecx, [eax + 92] + imul ecx, [esi - 96] + add ebp, ecx + mov ecx, [eax + 88] + imul ecx, [esi - 92] + add ebp, ecx + mov ecx, [eax + 84] + imul ecx, [esi - 88] + add ebp, ecx + mov ecx, [eax + 80] + imul ecx, [esi - 84] + add ebp, ecx + mov ecx, [eax + 76] + imul ecx, [esi - 80] + add ebp, ecx + mov ecx, [eax + 72] + imul ecx, [esi - 76] + add ebp, ecx + mov ecx, [eax + 68] + imul ecx, [esi - 72] + add ebp, ecx + mov ecx, [eax + 64] + imul ecx, [esi - 68] + add ebp, ecx + mov ecx, [eax + 60] + imul ecx, [esi - 64] + add ebp, ecx + mov ecx, [eax + 56] + imul ecx, [esi - 60] + add ebp, ecx + mov ecx, [eax + 52] + imul ecx, [esi - 56] + add ebp, ecx + mov ecx, [eax + 48] + imul ecx, [esi - 52] + add ebp, ecx + mov ecx, [eax + 44] + imul ecx, [esi - 48] + add ebp, ecx + mov ecx, [eax + 40] + imul ecx, [esi - 44] + add ebp, ecx + mov ecx, [eax + 36] + imul ecx, [esi - 40] + add ebp, ecx + mov ecx, [eax + 32] + imul ecx, [esi - 36] + add ebp, ecx + mov ecx, [eax + 28] + imul ecx, [esi - 32] + add ebp, ecx + mov ecx, [eax + 24] + imul ecx, [esi - 28] + add ebp, ecx + mov ecx, [eax + 20] + imul ecx, [esi - 24] + add ebp, ecx + mov ecx, [eax + 16] + imul ecx, [esi - 20] + add ebp, ecx + mov ecx, [eax + 12] + imul ecx, [esi - 16] + add ebp, ecx + mov ecx, [eax + 8] + imul ecx, [esi - 12] + add ebp, ecx + mov ecx, [eax + 4] + imul ecx, [esi - 8] + add ebp, ecx + mov ecx, [eax] ; there is one byte missing + imul ecx, [esi - 4] + add ebp, ecx +.jumper_0: + + mov cl, [esp + 36] + sar ebp, cl + neg ebp + add ebp, [esi] + mov [edi + esi], ebp + add esi, byte 4 + + dec ebx + jz short .end + xor ebp, ebp + jmp edx + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for +; the channel must be <= 16. Especially note that this routine cannot be used +; for side-channel coded 16bps channels since the effective bps is 17. 
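+;
+; A caller is expected to check the channel's effective bits-per-sample
+; before dispatching here. A minimal C sketch of such a guard follows
+; (illustrative only -- the wrapper name and its effective_bps/cpu_has_mmx
+; parameters are hypothetical, not part of this patch):
+;
+;   void lpc_compute_residual_dispatch(const int32 data[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 residual[], unsigned effective_bps, int cpu_has_mmx)
+;   {
+;       if(cpu_has_mmx && effective_bps <= 16) /* the pmaddwd below needs 16-bit operands */
+;           FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx(data, data_len, qlp_coeff, order, lp_quantization, residual);
+;       else /* e.g. side-channel coded 16bps input, where the effective bps is 17 */
+;           FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386(data, data_len, qlp_coeff, order, lp_quantization, residual);
+;   }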
+ ALIGN 16 +cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx + ;[esp + 40] residual[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] data[] + + ;ASSERT(order > 0) + + push ebp + push ebx + push esi + push edi + + mov esi, [esp + 20] ; esi = data[] + mov edi, [esp + 40] ; edi = residual[] + mov eax, [esp + 32] ; eax = order + mov ebx, [esp + 24] ; ebx = data_len + + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + dec ebx + test ebx, ebx + jz near .last_one + + mov edx, [esp + 28] ; edx = qlp_coeff[] + movd mm6, [esp + 36] ; mm6 = 0:lp_quantization + mov ebp, esp + + and esp, 0xfffffff8 + + xor ecx, ecx +.copy_qlp_loop: + push word [edx + 4 * ecx] + inc ecx + cmp ecx, eax + jnz short .copy_qlp_loop + + and ecx, 0x3 + test ecx, ecx + je short .za_end + sub ecx, byte 4 +.za_loop: + push word 0 + inc eax + inc ecx + jnz short .za_loop +.za_end: + + movq mm5, [esp + 2 * eax - 8] + movd mm4, [esi - 16] + punpckldq mm4, [esi - 12] + movd mm0, [esi - 8] + punpckldq mm0, [esi - 4] + packssdw mm4, mm0 + + cmp eax, byte 4 + jnbe short .mmx_4more + + align 16 +.mmx_4_loop_i: + movd mm1, [esi] + movq mm3, mm4 + punpckldq mm1, [esi + 4] + psrlq mm4, 16 + movq mm0, mm1 + psllq mm0, 48 + por mm4, mm0 + movq mm2, mm4 + psrlq mm4, 16 + pxor mm0, mm0 + punpckhdq mm0, mm1 + pmaddwd mm3, mm5 + pmaddwd mm2, mm5 + psllq mm0, 16 + por mm4, mm0 + movq mm0, mm3 + punpckldq mm3, mm2 + punpckhdq mm0, mm2 + paddd mm3, mm0 + psrad mm3, mm6 + psubd mm1, mm3 + movd [edi], mm1 + punpckhdq mm1, mm1 + movd [edi + 4], mm1 + + add edi, byte 8 + add esi, byte 8 + + sub ebx, 2 + jg .mmx_4_loop_i + jmp .mmx_end + +.mmx_4more: + shl eax, 2 + neg eax + add eax, byte 16 + + align 16 +.mmx_4more_loop_i: + movd mm1, [esi] + punpckldq mm1, [esi + 4] + movq mm3, mm4 + psrlq mm4, 16 + movq mm0, mm1 + psllq mm0, 48 + por mm4, mm0 + movq mm2, mm4 + psrlq mm4, 16 + pxor mm0, mm0 + punpckhdq mm0, mm1 + pmaddwd mm3, mm5 + pmaddwd mm2, mm5 + psllq mm0, 16 + por mm4, mm0 + + mov ecx, esi + add ecx, eax + mov edx, esp + + align 16 +.mmx_4more_loop_j: + movd mm0, [ecx - 16] + movd mm7, [ecx - 8] + punpckldq mm0, [ecx - 12] + punpckldq mm7, [ecx - 4] + packssdw mm0, mm7 + pmaddwd mm0, [edx] + punpckhdq mm7, mm7 + paddd mm3, mm0 + movd mm0, [ecx - 12] + punpckldq mm0, [ecx - 8] + punpckldq mm7, [ecx] + packssdw mm0, mm7 + pmaddwd mm0, [edx] + paddd mm2, mm0 + + add edx, byte 8 + add ecx, byte 16 + cmp ecx, esi + jnz .mmx_4more_loop_j + + movq mm0, mm3 + punpckldq mm3, mm2 + punpckhdq mm0, mm2 + paddd mm3, mm0 + psrad mm3, mm6 + psubd mm1, mm3 + movd [edi], mm1 + punpckhdq mm1, mm1 + movd [edi + 4], mm1 + + add edi, byte 8 + add esi, byte 8 + + sub ebx, 2 + jg near .mmx_4more_loop_i + +.mmx_end: + emms + mov esp, ebp +.last_one: + mov eax, [esp + 32] + inc ebx + jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386.begin + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; ********************************************************************** +; +; void FLAC__lpc_restore_signal(const int32 residual[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 data[]) +; { +; unsigned i, j; +; int32 sum; +; +; FLAC__ASSERT(order > 0); +; +; for(i = 0; i < data_len; i++) { +; sum = 0; +; for(j = 0; j < order; j++) +; sum += qlp_coeff[j] * data[i-j-1]; +; data[i] = residual[i] + (sum >> lp_quantization); +; } +; } + ALIGN 16 +cident FLAC__lpc_restore_signal_asm_i386 + ;[esp + 40] data[] + ;[esp + 36] 
lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] residual[] + + ;ASSERT(order > 0) + + push ebp + push ebx + push esi + push edi + + mov esi, [esp + 20] ; esi = residual[] + mov edi, [esp + 40] ; edi = data[] + mov eax, [esp + 32] ; eax = order + mov ebx, [esp + 24] ; ebx = data_len + + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + +.begin: + cmp eax, byte 1 + jg short .x87_1more + + mov ecx, [esp + 28] + mov edx, [ecx] + mov eax, [edi - 4] + mov cl, [esp + 36] + ALIGN 16 +.x87_1_loop_i: + imul eax, edx + sar eax, cl + add eax, [esi] + mov [edi], eax + add esi, byte 4 + add edi, byte 4 + dec ebx + jnz .x87_1_loop_i + + jmp .end + +.x87_1more: + cmp eax, byte 32 ; for order <= 32 there is a faster routine + jbe short .x87_32 + + ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32 + ALIGN 16 +.x87_32more_loop_i: + xor ebp, ebp + mov ecx, [esp + 32] + mov edx, ecx + shl edx, 2 + add edx, [esp + 28] + neg ecx + ALIGN 16 +.x87_32more_loop_j: + sub edx, byte 4 + mov eax, [edx] + imul eax, [edi + 4 * ecx] + add ebp, eax + inc ecx + jnz short .x87_32more_loop_j + + mov cl, [esp + 36] + sar ebp, cl + add ebp, [esi] + mov [edi], ebp + add edi, byte 4 + add esi, byte 4 + + dec ebx + jnz .x87_32more_loop_i + + jmp .end + +.x87_32: + sub esi, edi + neg eax + lea edx, [eax + eax * 8 + .jumper_0] + inc edx ; compensate for the shorter opcode on the last iteration + mov eax, [esp + 28] ; eax = qlp_coeff[] + xor ebp, ebp + jmp edx + + mov ecx, [eax + 124] ; ecx = qlp_coeff[31] + imul ecx, [edi - 128] ; ecx = qlp_coeff[31] * data[i-32] + add ebp, ecx ; sum += qlp_coeff[31] * data[i-32] + mov ecx, [eax + 120] ; ecx = qlp_coeff[30] + imul ecx, [edi - 124] ; ecx = qlp_coeff[30] * data[i-31] + add ebp, ecx ; sum += qlp_coeff[30] * data[i-31] + mov ecx, [eax + 116] ; ecx = qlp_coeff[29] + imul ecx, [edi - 120] ; ecx = qlp_coeff[29] * data[i-30] + add ebp, ecx ; sum += qlp_coeff[29] * data[i-30] + mov ecx, [eax + 112] ; ecx = qlp_coeff[28] + imul ecx, [edi - 116] ; ecx = qlp_coeff[28] * data[i-29] + add ebp, ecx ; sum += qlp_coeff[28] * data[i-29] + mov ecx, [eax + 108] ; ecx = qlp_coeff[27] + imul ecx, [edi - 112] ; ecx = qlp_coeff[27] * data[i-28] + add ebp, ecx ; sum += qlp_coeff[27] * data[i-28] + mov ecx, [eax + 104] ; ecx = qlp_coeff[26] + imul ecx, [edi - 108] ; ecx = qlp_coeff[26] * data[i-27] + add ebp, ecx ; sum += qlp_coeff[26] * data[i-27] + mov ecx, [eax + 100] ; ecx = qlp_coeff[25] + imul ecx, [edi - 104] ; ecx = qlp_coeff[25] * data[i-26] + add ebp, ecx ; sum += qlp_coeff[25] * data[i-26] + mov ecx, [eax + 96] ; ecx = qlp_coeff[24] + imul ecx, [edi - 100] ; ecx = qlp_coeff[24] * data[i-25] + add ebp, ecx ; sum += qlp_coeff[24] * data[i-25] + mov ecx, [eax + 92] ; ecx = qlp_coeff[23] + imul ecx, [edi - 96] ; ecx = qlp_coeff[23] * data[i-24] + add ebp, ecx ; sum += qlp_coeff[23] * data[i-24] + mov ecx, [eax + 88] ; ecx = qlp_coeff[22] + imul ecx, [edi - 92] ; ecx = qlp_coeff[22] * data[i-23] + add ebp, ecx ; sum += qlp_coeff[22] * data[i-23] + mov ecx, [eax + 84] ; ecx = qlp_coeff[21] + imul ecx, [edi - 88] ; ecx = qlp_coeff[21] * data[i-22] + add ebp, ecx ; sum += qlp_coeff[21] * data[i-22] + mov ecx, [eax + 80] ; ecx = qlp_coeff[20] + imul ecx, [edi - 84] ; ecx = qlp_coeff[20] * data[i-21] + add ebp, ecx ; sum += qlp_coeff[20] * data[i-21] + mov ecx, [eax + 76] ; ecx = qlp_coeff[19] + imul ecx, [edi - 80] ; ecx = qlp_coeff[19] * data[i-20] + add ebp, ecx ; sum += qlp_coeff[19] * data[i-20] + mov ecx, [eax + 
72] ; ecx = qlp_coeff[18] + imul ecx, [edi - 76] ; ecx = qlp_coeff[18] * data[i-19] + add ebp, ecx ; sum += qlp_coeff[18] * data[i-19] + mov ecx, [eax + 68] ; ecx = qlp_coeff[17] + imul ecx, [edi - 72] ; ecx = qlp_coeff[17] * data[i-18] + add ebp, ecx ; sum += qlp_coeff[17] * data[i-18] + mov ecx, [eax + 64] ; ecx = qlp_coeff[16] + imul ecx, [edi - 68] ; ecx = qlp_coeff[16] * data[i-17] + add ebp, ecx ; sum += qlp_coeff[16] * data[i-17] + mov ecx, [eax + 60] ; ecx = qlp_coeff[15] + imul ecx, [edi - 64] ; ecx = qlp_coeff[15] * data[i-16] + add ebp, ecx ; sum += qlp_coeff[15] * data[i-16] + mov ecx, [eax + 56] ; ecx = qlp_coeff[14] + imul ecx, [edi - 60] ; ecx = qlp_coeff[14] * data[i-15] + add ebp, ecx ; sum += qlp_coeff[14] * data[i-15] + mov ecx, [eax + 52] ; ecx = qlp_coeff[13] + imul ecx, [edi - 56] ; ecx = qlp_coeff[13] * data[i-14] + add ebp, ecx ; sum += qlp_coeff[13] * data[i-14] + mov ecx, [eax + 48] ; ecx = qlp_coeff[12] + imul ecx, [edi - 52] ; ecx = qlp_coeff[12] * data[i-13] + add ebp, ecx ; sum += qlp_coeff[12] * data[i-13] + mov ecx, [eax + 44] ; ecx = qlp_coeff[11] + imul ecx, [edi - 48] ; ecx = qlp_coeff[11] * data[i-12] + add ebp, ecx ; sum += qlp_coeff[11] * data[i-12] + mov ecx, [eax + 40] ; ecx = qlp_coeff[10] + imul ecx, [edi - 44] ; ecx = qlp_coeff[10] * data[i-11] + add ebp, ecx ; sum += qlp_coeff[10] * data[i-11] + mov ecx, [eax + 36] ; ecx = qlp_coeff[ 9] + imul ecx, [edi - 40] ; ecx = qlp_coeff[ 9] * data[i-10] + add ebp, ecx ; sum += qlp_coeff[ 9] * data[i-10] + mov ecx, [eax + 32] ; ecx = qlp_coeff[ 8] + imul ecx, [edi - 36] ; ecx = qlp_coeff[ 8] * data[i- 9] + add ebp, ecx ; sum += qlp_coeff[ 8] * data[i- 9] + mov ecx, [eax + 28] ; ecx = qlp_coeff[ 7] + imul ecx, [edi - 32] ; ecx = qlp_coeff[ 7] * data[i- 8] + add ebp, ecx ; sum += qlp_coeff[ 7] * data[i- 8] + mov ecx, [eax + 24] ; ecx = qlp_coeff[ 6] + imul ecx, [edi - 28] ; ecx = qlp_coeff[ 6] * data[i- 7] + add ebp, ecx ; sum += qlp_coeff[ 6] * data[i- 7] + mov ecx, [eax + 20] ; ecx = qlp_coeff[ 5] + imul ecx, [edi - 24] ; ecx = qlp_coeff[ 5] * data[i- 6] + add ebp, ecx ; sum += qlp_coeff[ 5] * data[i- 6] + mov ecx, [eax + 16] ; ecx = qlp_coeff[ 4] + imul ecx, [edi - 20] ; ecx = qlp_coeff[ 4] * data[i- 5] + add ebp, ecx ; sum += qlp_coeff[ 4] * data[i- 5] + mov ecx, [eax + 12] ; ecx = qlp_coeff[ 3] + imul ecx, [edi - 16] ; ecx = qlp_coeff[ 3] * data[i- 4] + add ebp, ecx ; sum += qlp_coeff[ 3] * data[i- 4] + mov ecx, [eax + 8] ; ecx = qlp_coeff[ 2] + imul ecx, [edi - 12] ; ecx = qlp_coeff[ 2] * data[i- 3] + add ebp, ecx ; sum += qlp_coeff[ 2] * data[i- 3] + mov ecx, [eax + 4] ; ecx = qlp_coeff[ 1] + imul ecx, [edi - 8] ; ecx = qlp_coeff[ 1] * data[i- 2] + add ebp, ecx ; sum += qlp_coeff[ 1] * data[i- 2] + mov ecx, [eax] ; ecx = qlp_coeff[ 0] (NOTE: one byte missing from instruction) + imul ecx, [edi - 4] ; ecx = qlp_coeff[ 0] * data[i- 1] + add ebp, ecx ; sum += qlp_coeff[ 0] * data[i- 1] +.jumper_0: + + mov cl, [esp + 36] + sar ebp, cl ; ebp = (sum >> lp_quantization) + add ebp, [esi + edi] ; ebp = residual[i] + (sum >> lp_quantization) + mov [edi], ebp ; data[i] = residual[i] + (sum >> lp_quantization) + add edi, byte 4 + + dec ebx + jz short .end + xor ebp, ebp + jmp edx + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for +; the channel must be <= 16. Especially note that this routine cannot be used +; for side-channel coded 16bps channels since the effective bps is 17. 
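+;
+; The 16-bit limit comes from the packing scheme shared by both MMX
+; routines: four int32 history samples are packed to four int16 words with
+; packssdw so that a single pmaddwd performs four multiply-accumulates at
+; once, and packssdw saturates anything outside the int16 range. A C sketch
+; of what one packed step computes for order == 4 (illustrative only, not
+; part of this patch):
+;
+;   static int16 sat16(int32 x) /* the saturation packssdw applies */
+;   {
+;       return (int16)(x > 32767 ? 32767 : (x < -32768 ? -32768 : x));
+;   }
+;   /* one pmaddwd yields two halves of the order-4 dot product; they are
+;      combined with paddd, then shifted right by lp_quantization: */
+;   sum  = (int32)sat16(qlp_coeff[3]) * sat16(data[i-4]) + (int32)sat16(qlp_coeff[2]) * sat16(data[i-3]);
+;   sum += (int32)sat16(qlp_coeff[1]) * sat16(data[i-2]) + (int32)sat16(qlp_coeff[0]) * sat16(data[i-1]);
+;   data[i] = residual[i] + (sum >> lp_quantization);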
+ ALIGN 16 +cident FLAC__lpc_restore_signal_asm_i386_mmx + ;[esp + 40] data[] + ;[esp + 36] lp_quantization + ;[esp + 32] order + ;[esp + 28] qlp_coeff[] + ;[esp + 24] data_len + ;[esp + 20] residual[] + + ;ASSERT(order > 0) + + push ebp + push ebx + push esi + push edi + + mov esi, [esp + 20] + mov edi, [esp + 40] + mov eax, [esp + 32] + mov ebx, [esp + 24] + + test ebx, ebx + jz near .end ; do nothing if data_len == 0 + cmp eax, byte 4 + jb near FLAC__lpc_restore_signal_asm_i386.begin + + mov edx, [esp + 28] + movd mm6, [esp + 36] + mov ebp, esp + + and esp, 0xfffffff8 + + xor ecx, ecx +.copy_qlp_loop: + push word [edx + 4 * ecx] + inc ecx + cmp ecx, eax + jnz short .copy_qlp_loop + + and ecx, 0x3 + test ecx, ecx + je short .za_end + sub ecx, byte 4 +.za_loop: + push word 0 + inc eax + inc ecx + jnz short .za_loop +.za_end: + + movq mm5, [esp + 2 * eax - 8] + movd mm4, [edi - 16] + punpckldq mm4, [edi - 12] + movd mm0, [edi - 8] + punpckldq mm0, [edi - 4] + packssdw mm4, mm0 + + cmp eax, byte 4 + jnbe short .mmx_4more + + align 16 +.mmx_4_loop_i: + movq mm7, mm4 + pmaddwd mm7, mm5 + movq mm0, mm7 + punpckhdq mm7, mm7 + paddd mm7, mm0 + psrad mm7, mm6 + movd mm1, [esi] + paddd mm7, mm1 + movd [edi], mm7 + psllq mm7, 48 + psrlq mm4, 16 + por mm4, mm7 + + add esi, byte 4 + add edi, byte 4 + + dec ebx + jnz .mmx_4_loop_i + jmp .mmx_end +.mmx_4more: + shl eax, 2 + neg eax + add eax, byte 16 + align 16 +.mmx_4more_loop_i: + mov ecx, edi + add ecx, eax + mov edx, esp + + movq mm7, mm4 + pmaddwd mm7, mm5 + + align 16 +.mmx_4more_loop_j: + movd mm0, [ecx - 16] + punpckldq mm0, [ecx - 12] + movd mm1, [ecx - 8] + punpckldq mm1, [ecx - 4] + packssdw mm0, mm1 + pmaddwd mm0, [edx] + paddd mm7, mm0 + + add edx, byte 8 + add ecx, byte 16 + cmp ecx, edi + jnz .mmx_4more_loop_j + + movq mm0, mm7 + punpckhdq mm7, mm7 + paddd mm7, mm0 + psrad mm7, mm6 + movd mm1, [esi] + paddd mm7, mm1 + movd [edi], mm7 + psllq mm7, 48 + psrlq mm4, 16 + por mm4, mm7 + + add esi, byte 4 + add edi, byte 4 + + dec ebx + jnz short .mmx_4more_loop_i +.mmx_end: + emms + mov esp, ebp + +.end: + pop edi + pop esi + pop ebx + pop ebp + ret + +end diff --git a/src/libFLAC/ia32/nasm.h b/src/libFLAC/ia32/nasm.h new file mode 100644 index 0000000..316955f --- /dev/null +++ b/src/libFLAC/ia32/nasm.h @@ -0,0 +1,58 @@ +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001 Josh Coalson +; +; This library is free software; you can redistribute it and/or +; modify it under the terms of the GNU Library General Public +; License as published by the Free Software Foundation; either +; version 2 of the License, or (at your option) any later version. +; +; This library is distributed in the hope that it will be useful, +; but WITHOUT ANY WARRANTY; without even the implied warranty of +; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +; Library General Public License for more details. +; +; You should have received a copy of the GNU Library General Public +; License along with this library; if not, write to the +; Free Software Foundation, Inc., 59 Temple Place - Suite 330, +; Boston, MA 02111-1307, USA. 
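+;
+; Usage sketch (illustrative; "my_routine" is a placeholder, not a symbol
+; in this patch): a public symbol is exported with cglobal and defined with
+; cident, which emits both the plain and the underscore-prefixed label so
+; the same source assembles for C ABIs that do or do not prepend a leading
+; underscore:
+;
+;   cglobal my_routine ; expands to "global my_routine" or "global _my_routine"
+;   cident my_routine ; defines both "_my_routine:" and "my_routine:"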
+ + bits 32 + +%ifdef WIN32 + %define FLAC__PUBLIC_NEEDS_UNDERSCORE + %idefine code_section section .text align=16 class=CODE use32 + %idefine data_section section .data align=32 class=DATA use32 + %idefine bss_section section .bss align=32 class=DATA use32 +%elifdef AOUT + %define FLAC__PUBLIC_NEEDS_UNDERSCORE + %idefine code_section section .text + %idefine data_section section .data + %idefine bss_section section .bss +%elifdef ELF + %idefine code_section section .text align=16 + %idefine data_section section .data align=32 + %idefine bss_section section .bss align=32 +%else + %error unsupported object format! +%endif + +%imacro cglobal 1 + %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE + global _%1 + %else + global %1 + %endif +%endmacro + +%imacro cextern 1 + %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE + extern _%1 + %else + extern %1 + %endif +%endmacro + +%imacro cident 1 +_%1: +%1: +%endmacro -- 2.7.4