-; libFLAC - Free Lossless Audio Codec library
-; Copyright (C) 2001 Josh Coalson
+; libFLAC - Free Lossless Audio Codec library
+; Copyright (C) 2001,2002,2003,2004,2005 Josh Coalson
;
-; This library is free software; you can redistribute it and/or
-; modify it under the terms of the GNU Library General Public
-; License as published by the Free Software Foundation; either
-; version 2 of the License, or (at your option) any later version.
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
;
-; This library is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-; Library General Public License for more details.
+; - Redistributions of source code must retain the above copyright
+; notice, this list of conditions and the following disclaimer.
;
-; You should have received a copy of the GNU Library General Public
-; License along with this library; if not, write to the
-; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-; Boston, MA 02111-1307, USA.
+; - Redistributions in binary form must reproduce the above copyright
+; notice, this list of conditions and the following disclaimer in the
+; documentation and/or other materials provided with the distribution.
+;
+; - Neither the name of the Xiph.org Foundation nor the names of its
+; contributors may be used to endorse or promote products derived from
+; this software without specific prior written permission.
+;
+; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
+; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "nasm.h"
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_asm_ia32
;
ALIGN 16
cident FLAC__lpc_compute_autocorrelation_asm_ia32
- ;[esp + 24] == autoc[]
- ;[esp + 20] == lag
- ;[esp + 16] == data_len
- ;[esp + 12] == data[]
+ ;[esp + 28] == autoc[] (offsets hold after the three pushes below: 12 bytes of saved registers + 4 bytes of return address)
+ ;[esp + 24] == lag
+ ;[esp + 20] == data_len
+ ;[esp + 16] == data[]
;ASSERT(lag > 0)
;ASSERT(lag <= 33)
.begin:
push esi
push edi
+ push ebx ; preserved for the caller; ebx is used below to receive the runtime EIP
; for(coeff = 0; coeff < lag; coeff++)
; autoc[coeff] = 0.0;
- mov edi, [esp + 24] ; edi == autoc
- mov ecx, [esp + 20] ; ecx = # of dwords (=lag) of 0 to write
+ mov edi, [esp + 28] ; edi == autoc
+ mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write
xor eax, eax
rep stosd
; const unsigned limit = data_len - lag;
- mov eax, [esp + 20] ; eax == lag
- mov ecx, [esp + 16]
+ mov eax, [esp + 24] ; eax == lag
+ mov ecx, [esp + 20] ; ecx == data_len
sub ecx, eax ; ecx == limit
- mov edi, [esp + 24] ; edi == autoc
- mov esi, [esp + 12] ; esi == data
+ mov edi, [esp + 28] ; edi == autoc
+ mov esi, [esp + 16] ; esi == data
inc ecx ; we are looping <= limit so we add one to the counter
; for(sample = 0; sample <= limit; sample++) {
; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
lea edx, [eax + eax*2]
neg edx
- lea edx, [eax + edx*4 + .jumper1_0]
+ lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] ; edx = -11*lag + (.jumper1_0 - .get_eip1): a label difference, so no absolute address is assembled in
+ call .get_eip1 ; pushes the address of the next instruction, which is .get_eip1 itself
+.get_eip1:
+ pop ebx ; ebx = runtime address of .get_eip1; esp is back where it was before the call
+ add edx, ebx ; edx = runtime address of .jumper1_0 minus 11*lag
inc edx ; compensate for the shorter opcode on the last iteration
inc edx ; compensate for the shorter opcode on the last iteration
inc edx ; compensate for the shorter opcode on the last iteration
; for(coeff = 0; coeff < data_len - sample; coeff++)
; autoc[coeff] += d * data[sample+coeff];
; }
- mov ecx, [esp + 20] ; ecx <- lag
+ mov ecx, [esp + 24] ; ecx <- lag
dec ecx ; ecx <- lag - 1
jz near .end ; skip loop if 0 (i.e. lag == 1)
; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
lea edx, [eax + eax*2]
neg edx
- lea edx, [eax + edx*4 + .jumper2_0]
+ lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] ; edx = -11*eax + (.jumper2_0 - .get_eip2), again only a label difference
+ call .get_eip2 ; same idiom as .get_eip1: push the address of .get_eip2
+.get_eip2:
+ pop ebx ; ebx = runtime address of .get_eip2
+ add edx, ebx ; edx = runtime address of .jumper2_0 minus 11*eax
inc edx ; compensate for the shorter opcode on the last iteration
inc edx ; compensate for the shorter opcode on the last iteration
inc edx ; compensate for the shorter opcode on the last iteration
.loop2_end:
.end:
+ pop ebx ; restore in reverse push order: ebx was pushed last
pop edi
pop esi
ret
; store autoc
mov edx, [esp + 16] ; edx == autoc
movups [edx], xmm5
- movups [edx + 4], xmm6
+ movups [edx + 16], xmm6
.end:
ret
; store autoc
mov edx, [esp + 16] ; edx == autoc
movups [edx], xmm5
- movups [edx + 4], xmm6
- movups [edx + 8], xmm7
+ movups [edx + 16], xmm6
+ movups [edx + 32], xmm7
.end:
ret
-;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+ align 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow ; autoc[l] = sum over i of data[i] * data[i + l] for 0 <= l < lag, computed with 3DNow! packed single-precision ops
+ ;[ebp + 32] autoc -- destination; exactly lag dwords are written (see .loop3)
+ ;[ebp + 28] lag -- presumably > 0: every loop below tests its counter only after the first pass -- TODO confirm with callers
+ ;[ebp + 24] data_len -- number of dwords in data[]; presumably > 0 for the same reason -- TODO confirm
+ ;[ebp + 20] data -- source samples, loaded with movd and multiplied with pfmul, i.e. treated as 32-bit floats
+
+ push ebp ; save the four registers this routine overwrites
+ push ebx
+ push esi
+ push edi
+ mov ebp, esp ; frame anchor: arguments start at [ebp + 20] (4 pushes + return address)
+
+ mov esi, [ebp + 20] ; esi = data
+ mov edi, [ebp + 24] ; edi = data_len
+ mov edx, [ebp + 28] ; edx = lag
+ inc edx
+ and edx, byte -2 ; edx = lag rounded up to an even count, so lags can be handled in pairs
+ mov eax, edx
+ neg eax ; eax = -edx
+ and esp, byte -8 ; round esp down to a multiple of 8; the buffer below is accessed with 8-byte movq
+ lea esp, [esp + 4 * eax] ; reserve edx dwords on the stack as a scratch accumulator (edx even, so esp stays 8-aligned)
+ mov ecx, edx
+ xor eax, eax
+.loop0: ; zero the scratch accumulator, highest index first
+ dec ecx
+ mov [esp + 4 * ecx], eax
+ jnz short .loop0 ; flags are still those of the dec (mov leaves them unchanged)
+
+ mov eax, edi
+ sub eax, edx ; eax = data_len - edx
+ mov ebx, edx
+ and ebx, byte 1 ; edx was made even above, so ebx is 0 here
+ sub eax, ebx
+ lea ecx, [esi + 4 * eax - 12] ; ecx = &data[data_len - edx - 3]: bound for the four-samples-per-pass loop
+ cmp esi, ecx
+ mov eax, esi ; eax = current sample pointer (mov keeps the cmp flags)
+ ja short .loop2_pre ; data already beyond the bound (unsigned compare): go straight to the one-sample-per-pass loop
+ align 16 ;4 nops
+.loop1_i: ; per pass: four consecutive samples d0..d3 at [eax]
+ movd mm0, [eax]
+ movd mm2, [eax + 4]
+ movd mm4, [eax + 8]
+ movd mm6, [eax + 12]
+ mov ebx, edx ; ebx = lag index, counted down by 2 from edx
+ punpckldq mm0, mm0 ; mm0 = (d0, d0)
+ punpckldq mm2, mm2 ; mm2 = (d1, d1)
+ punpckldq mm4, mm4 ; mm4 = (d2, d2)
+ punpckldq mm6, mm6 ; mm6 = (d3, d3)
+ align 16 ;3 nops
+.loop1_j: ; per pass: update accumulator slots ebx and ebx + 1; xk below = dword at [eax + 4 * ebx + 4 * k]
+ sub ebx, byte 2 ; sets the flags consumed by the jg at the bottom
+ movd mm1, [eax + 4 * ebx] ; x0
+ movd mm3, [eax + 4 * ebx + 4] ; x1
+ movd mm5, [eax + 4 * ebx + 8] ; x2
+ movd mm7, [eax + 4 * ebx + 12] ; x3
+ punpckldq mm1, mm3 ; mm1 = (x0, x1)
+ punpckldq mm3, mm5 ; mm3 = (x1, x2)
+ pfmul mm1, mm0 ; d0 * (x0, x1)
+ punpckldq mm5, mm7 ; mm5 = (x2, x3)
+ pfmul mm3, mm2 ; d1 * (x1, x2)
+ punpckldq mm7, [eax + 4 * ebx + 16] ; mm7 = (x3, x4)
+ pfmul mm5, mm4 ; d2 * (x2, x3)
+ pfmul mm7, mm6 ; d3 * (x3, x4)
+ pfadd mm1, mm3
+ movq mm3, [esp + 4 * ebx] ; current accumulator pair
+ pfadd mm5, mm7
+ pfadd mm1, mm5 ; mm1 = sum of the four products for each of the two lags
+ pfadd mm3, mm1
+ movq [esp + 4 * ebx], mm3 ; store the updated pair
+ jg short .loop1_j ; repeat while ebx > 0 (signed); the MMX/3DNow! ops in between do not modify EFLAGS
+
+ add eax, byte 16 ; advance four samples
+ cmp eax, ecx
+ jb short .loop1_i
+
+.loop2_pre: ; the remaining samples are handled one at a time
+ mov ebx, eax
+ sub eax, esi
+ shr eax, 2 ; eax = index of the next unprocessed sample
+ lea ecx, [esi + 4 * edi] ; ecx = data + 4 * data_len (one past the last sample)
+ mov esi, ebx ; esi = pointer to the next unprocessed sample
+.loop2_i:
+ movd mm0, [esi] ; mm0 = current sample d
+ mov ebx, edi
+ sub ebx, eax ; ebx = data_len - index = samples left, including this one
+ cmp ebx, edx
+ jbe short .loop2_j ; keep ebx when it is <= edx (unsigned)
+ mov ebx, edx ; otherwise clamp it to edx
+.loop2_j:
+ dec ebx ; sets the flags consumed by the jnz below
+ movd mm1, [esi + 4 * ebx]
+ pfmul mm1, mm0 ; d * (sample ebx positions ahead of esi)
+ movd mm2, [esp + 4 * ebx]
+ pfadd mm1, mm2
+ movd [esp + 4 * ebx], mm1 ; accumulator[ebx] += product
+
+ jnz short .loop2_j
+
+ add esi, byte 4
+ inc eax
+ cmp esi, ecx
+ jnz short .loop2_i ; until esi reaches the end of data
+
+ mov edi, [ebp + 32] ; edi = autoc
+ mov edx, [ebp + 28] ; edx = lag (the original value, not the rounded-up one)
+.loop3: ; copy lag dwords from the scratch accumulator to autoc[]
+ dec edx
+ mov eax, [esp + 4 * edx]
+ mov [edi + 4 * edx], eax
+ jnz short .loop3
+
+ femms ; leave MMX/3DNow! state so the x87 register file is usable again
+
+ mov esp, ebp ; drop the scratch buffer and undo the alignment adjustment
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
; for(i = 0; i < data_len; i++) {
; sum = 0;
.i_32:
sub edi, esi
neg eax
- lea edx, [eax + eax * 8 + .jumper_0]
+ lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
+ call .get_eip0
+.get_eip0:
+ pop eax
+ add edx, eax
inc edx
mov eax, [esp + 28] ; eax = qlp_coeff[]
xor ebp, ebp
.x87_32:
sub esi, edi
neg eax
- lea edx, [eax + eax * 8 + .jumper_0]
+ lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
+ call .get_eip0
+.get_eip0:
+ pop eax
+ add edx, eax
inc edx ; compensate for the shorter opcode on the last iteration
mov eax, [esp + 28] ; eax = qlp_coeff[]
xor ebp, ebp