X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=src%2FlibFLAC%2Fia32%2Flpc_asm.nasm;h=81a37b456f9bda77836c714161f083017d4a7137;hb=7446e18663fb9bfde402742cb77bd4d3c69966c7;hp=a7dec5fadaa7dd5f0c609e1a6e8b46669e756bea;hpb=f01dadd3865c18d7e54fd828ebb5a5c6e31c677b;p=platform%2Fupstream%2Fflac.git

diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm
index a7dec5f..81a37b4 100644
--- a/src/libFLAC/ia32/lpc_asm.nasm
+++ b/src/libFLAC/ia32/lpc_asm.nasm
@@ -1,20 +1,32 @@
-; libFLAC - Free Lossless Audio Codec library
-; Copyright (C) 2001  Josh Coalson
+;  libFLAC - Free Lossless Audio Codec library
+;  Copyright (C) 2001,2002,2003,2004,2005  Josh Coalson
 ;
-; This library is free software; you can redistribute it and/or
-; modify it under the terms of the GNU Library General Public
-; License as published by the Free Software Foundation; either
-; version 2 of the License, or (at your option) any later version.
+;  Redistribution and use in source and binary forms, with or without
+;  modification, are permitted provided that the following conditions
+;  are met:
 ;
-; This library is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-; Library General Public License for more details.
+;  - Redistributions of source code must retain the above copyright
+;  notice, this list of conditions and the following disclaimer.
 ;
-; You should have received a copy of the GNU Library General Public
-; License along with this library; if not, write to the
-; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-; Boston, MA  02111-1307, USA.
+;  - Redistributions in binary form must reproduce the above copyright
+;  notice, this list of conditions and the following disclaimer in the
+;  documentation and/or other materials provided with the distribution.
+;
+;  - Neither the name of the Xiph.org Foundation nor the names of its
+;  contributors may be used to endorse or promote products derived from
+;  this software without specific prior written permission.
+;
+;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 %include "nasm.h"
 
@@ -24,6 +36,7 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32
 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
 cglobal FLAC__lpc_restore_signal_asm_ia32
@@ -58,10 +71,10 @@ cglobal FLAC__lpc_restore_signal_asm_ia32_mmx
 ;
 	ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_ia32
-	;[esp + 24] == autoc[]
-	;[esp + 20] == lag
-	;[esp + 16] == data_len
-	;[esp + 12] == data[]
+	;[esp + 28] == autoc[]
+	;[esp + 24] == lag
+	;[esp + 20] == data_len
+	;[esp + 16] == data[]
 
 	;ASSERT(lag > 0)
 	;ASSERT(lag <= 33)
@@ -70,21 +83,22 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32
 .begin:
 	push	esi
 	push	edi
+	push	ebx
 
 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
-	mov	edi, [esp + 24]			; edi == autoc
-	mov	ecx, [esp + 20]			; ecx = # of dwords (=lag) of 0 to write
+	mov	edi, [esp + 28]			; edi == autoc
+	mov	ecx, [esp + 24]			; ecx = # of dwords (=lag) of 0 to write
 	xor	eax, eax
 	rep	stosd
 
 	;	const unsigned limit = data_len - lag;
-	mov	eax, [esp + 20]			; eax == lag
-	mov	ecx, [esp + 16]
+	mov	eax, [esp + 24]			; eax == lag
+	mov	ecx, [esp + 20]
 	sub	ecx, eax			; ecx == limit
 
-	mov	edi, [esp + 24]			; edi == autoc
-	mov	esi, [esp + 12]			; esi == data
+	mov	edi, [esp + 28]			; edi == autoc
+	mov	esi, [esp + 16]			; esi == data
 	inc	ecx				; we are looping <= limit so we add one to the counter
 
 	;	for(sample = 0; sample <= limit; sample++) {
@@ -96,7 +110,11 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32
 	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
 	lea	edx, [eax + eax*2]
 	neg	edx
-	lea	edx, [eax + edx*4 + .jumper1_0]
+	lea	edx, [eax + edx*4 + .jumper1_0 - .get_eip1]
+	call	.get_eip1
+.get_eip1:
+	pop	ebx
+	add	edx, ebx
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
@@ -253,7 +271,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32
 	;		for(coeff = 0; coeff < data_len - sample; coeff++)
 	;			autoc[coeff] += d * data[sample+coeff];
 	;	}
-	mov	ecx, [esp + 20]			; ecx <- lag
+	mov	ecx, [esp + 24]			; ecx <- lag
 	dec	ecx				; ecx <- lag - 1
 	jz	near .end			; skip loop if 0 (i.e. lag == 1)
 
@@ -262,7 +280,11 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32
 	; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax)
 	lea	edx, [eax + eax*2]
 	neg	edx
-	lea	edx, [eax + edx*4 + .jumper2_0]
+	lea	edx, [eax + edx*4 + .jumper2_0 - .get_eip2]
+	call	.get_eip2
+.get_eip2:
+	pop	ebx
+	add	edx, ebx
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	inc	edx				; compensate for the shorter opcode on the last iteration
@@ -408,6 +430,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32
 .loop2_end:
 
 .end:
+	pop	ebx
 	pop	edi
 	pop	esi
 	ret
@@ -513,7 +536,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
 	; store autoc
 	mov	edx, [esp + 16]			; edx == autoc
 	movups	[edx], xmm5
-	movups	[edx + 4], xmm6
+	movups	[edx + 16], xmm6
 
 .end:
 	ret
@@ -586,13 +609,134 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
 	; store autoc
 	mov	edx, [esp + 16]			; edx == autoc
 	movups	[edx], xmm5
-	movups	[edx + 4], xmm6
-	movups	[edx + 8], xmm7
+	movups	[edx + 16], xmm6
+	movups	[edx + 32], xmm7
 
 .end:
 	ret
 
-;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
+	align 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
+	; Sums products data[s] * data[s + k] per offset k in a zeroed stack scratch array, then copies the first `lag` sums to autoc[].
+	;[ebp + 32] autoc	(destination; .loop3 stores `lag` dwords here)
+	;[ebp + 28] lag		(presumably > 0, as asserted for the plain ia32 routine; TODO confirm against callers)
+	;[ebp + 24] data_len	(element count: data + 4 * data_len is used as the end pointer)
+	;[ebp + 20] data	(dwords consumed by pfmul/pfadd, i.e. treated as single-precision floats)
+	push	ebp
+	push	ebx
+	push	esi
+	push	edi
+	mov	ebp, esp			; ebp = esp after the four pushes, so the arguments start at [ebp + 20]
+
+	mov	esi, [ebp + 20]			; esi = data
+	mov	edi, [ebp + 24]			; edi = data_len
+	mov	edx, [ebp + 28]			; edx = lag
+	inc	edx
+	and	edx, byte -2			; edx = lag rounded up to an even count
+	mov	eax, edx
+	neg	eax				; eax = -edx
+	and	esp, byte -8			; round esp down to a multiple of 8 (presumably for the movq accesses in .loop1_j)
+	lea	esp, [esp + 4 * eax]		; reserve edx dwords of scratch at [esp]; lea does not touch flags
+	mov	ecx, edx
+	xor	eax, eax
+.loop0:					; zero the scratch array, highest index first
+	dec	ecx
+	mov	[esp + 4 * ecx], eax
+	jnz	short .loop0			; tests ZF from dec ecx (mov does not change flags)
+
+	mov	eax, edi
+	sub	eax, edx			; eax = data_len - edx
+	mov	ebx, edx
+	and	ebx, byte 1			; NOTE(review): bit 0 of edx was cleared above, so this looks like it always yields 0
+	sub	eax, ebx
+	lea	ecx, [esi + 4 * eax - 12]	; ecx = address of data[eax] minus 12 bytes: bound for the 4-sample loop
+	cmp	esi, ecx
+	mov	eax, esi			; eax = pointer to the current sample (mov keeps the cmp flags)
+	ja	short .loop2_pre		; data is above the bound (unsigned compare): skip the 4-sample loop
+	align	16		;4 nops
+.loop1_i:					; outer loop: four consecutive samples d0..d3 per pass
+	movd	mm0, [eax]			; d0
+	movd	mm2, [eax + 4]			; d1
+	movd	mm4, [eax + 8]			; d2
+	movd	mm6, [eax + 12]			; d3
+	mov	ebx, edx			; ebx = offset index, counted down by 2 in .loop1_j
+	punpckldq	mm0, mm0		; mm0 = {d0, d0}
+	punpckldq	mm2, mm2		; mm2 = {d1, d1}
+	punpckldq	mm4, mm4		; mm4 = {d2, d2}
+	punpckldq	mm6, mm6		; mm6 = {d3, d3}
+	align	16		;3 nops
+.loop1_j:					; inner loop: update scratch[ebx] and scratch[ebx + 1] together
+	sub	ebx, byte 2			; next lower pair of offsets; these flags feed the jg at the bottom
+	movd	mm1, [eax + 4 * ebx]		; x0  (xN = dword at [eax + 4 * ebx + 4 * N])
+	movd	mm3, [eax + 4 * ebx + 4]	; x1
+	movd	mm5, [eax + 4 * ebx + 8]	; x2
+	movd	mm7, [eax + 4 * ebx + 12]	; x3
+	punpckldq	mm1, mm3		; mm1 = {x0, x1}
+	punpckldq	mm3, mm5		; mm3 = {x1, x2}
+	pfmul	mm1, mm0			; mm1 = {d0 * x0, d0 * x1}
+	punpckldq	mm5, mm7		; mm5 = {x2, x3}
+	pfmul	mm3, mm2			; mm3 = {d1 * x1, d1 * x2}
+	punpckldq	mm7, [eax + 4 * ebx + 16]	; mm7 = {x3, x4}
+	pfmul	mm5, mm4			; mm5 = {d2 * x2, d2 * x3}
+	pfmul	mm7, mm6			; mm7 = {d3 * x3, d3 * x4}
+	pfadd	mm1, mm3
+	movq	mm3, [esp + 4 * ebx]		; mm3 = {scratch[ebx], scratch[ebx + 1]}
+	pfadd	mm5, mm7
+	pfadd	mm1, mm5			; mm1 = sum of the four product pairs
+	pfadd	mm3, mm1
+	movq	[esp + 4 * ebx], mm3		; write both updated sums back
+	jg	short .loop1_j			; loop while ebx > 0; relies on the flags from sub ebx surviving the MMX/3DNow! ops
+
+	add	eax, byte 16			; advance four samples
+	cmp	eax, ecx
+	jb	short .loop1_i			; continue while the pointer is below the bound (unsigned)
+
+.loop2_pre:					; set up the one-sample-at-a-time loop for the remaining samples
+	mov	ebx, eax			; ebx = pointer to the first sample not handled above
+	sub	eax, esi
+	shr	eax, 2				; eax = index of that sample (byte offset / 4)
+	lea	ecx, [esi + 4 * edi]		; ecx = data + 4 * data_len (end pointer)
+	mov	esi, ebx			; esi = current sample pointer
+.loop2_i:
+	movd	mm0, [esi]			; low dword of mm0 = current sample
+	mov	ebx, edi
+	sub	ebx, eax			; ebx = data_len - current sample index
+	cmp	ebx, edx
+	jbe	short .loop2_j			; keep ebx when it is <= edx (unsigned)
+	mov	ebx, edx			; otherwise cap the offset count at edx
+.loop2_j:
+	dec	ebx				; these flags feed the jnz below
+	movd	mm1, [esi + 4 * ebx]
+	pfmul	mm1, mm0
+	movd	mm2, [esp + 4 * ebx]
+	pfadd	mm1, mm2
+	movd	[esp + 4 * ebx], mm1		; scratch[ebx] += sample * dword at [esi + 4 * ebx] (only the low dword is stored)
+
+	jnz	short .loop2_j			; repeat until ebx reaches 0
+
+	add	esi, byte 4			; next sample
+	inc	eax
+	cmp	esi, ecx
+	jnz	short .loop2_i			; until esi reaches the end pointer
+
+	mov	edi, [ebp + 32]			; edi = autoc
+	mov	edx, [ebp + 28]			; edx = lag (the caller's value, not the rounded-up count)
+.loop3:					; copy scratch[lag - 1 .. 0] out to autoc[]
+	dec	edx
+	mov	eax, [esp + 4 * edx]
+	mov	[edi + 4 * edx], eax		; autoc[edx] = scratch[edx]
+	jnz	short .loop3
+
+	femms					; leave MMX/3DNow! state before returning
+
+	mov	esp, ebp			; release the scratch array and undo the alignment adjustment
+	pop	edi
+	pop	esi
+	pop	ebx
+	pop	ebp
+	ret
+
+;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
 ;
 ;	for(i = 0; i < data_len; i++) {
 ;		sum = 0;
@@ -685,7 +829,11 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
 .i_32:
 	sub	edi, esi
 	neg	eax
-	lea	edx, [eax + eax * 8 + .jumper_0]
+	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
+	call	.get_eip0
+.get_eip0:
+	pop	eax
+	add	edx, eax
 	inc	edx
 	mov	eax, [esp + 28]			; eax = qlp_coeff[]
 	xor	ebp, ebp
@@ -1084,7 +1232,11 @@ cident FLAC__lpc_restore_signal_asm_ia32
 .x87_32:
 	sub	esi, edi
 	neg	eax
-	lea	edx, [eax + eax * 8 + .jumper_0]
+	lea	edx, [eax + eax * 8 + .jumper_0 - .get_eip0]
+	call	.get_eip0
+.get_eip0:
+	pop	eax
+	add	edx, eax
 	inc	edx				; compensate for the shorter opcode on the last iteration
 	mov	eax, [esp + 28]			; eax = qlp_coeff[]
 	xor	ebp, ebp