X-Git-Url: http://review.tizen.org/git/?a=blobdiff_plain;f=src%2FlibFLAC%2Fia32%2Flpc_asm.nasm;h=81a37b456f9bda77836c714161f083017d4a7137;hb=7446e18663fb9bfde402742cb77bd4d3c69966c7;hp=a7dec5fadaa7dd5f0c609e1a6e8b46669e756bea;hpb=f01dadd3865c18d7e54fd828ebb5a5c6e31c677b;p=platform%2Fupstream%2Fflac.git diff --git a/src/libFLAC/ia32/lpc_asm.nasm b/src/libFLAC/ia32/lpc_asm.nasm index a7dec5f..81a37b4 100644 --- a/src/libFLAC/ia32/lpc_asm.nasm +++ b/src/libFLAC/ia32/lpc_asm.nasm @@ -1,20 +1,32 @@ -; libFLAC - Free Lossless Audio Codec library -; Copyright (C) 2001 Josh Coalson +; libFLAC - Free Lossless Audio Codec library +; Copyright (C) 2001,2002,2003,2004,2005 Josh Coalson ; -; This library is free software; you can redistribute it and/or -; modify it under the terms of the GNU Library General Public -; License as published by the Free Software Foundation; either -; version 2 of the License, or (at your option) any later version. +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: ; -; This library is distributed in the hope that it will be useful, -; but WITHOUT ANY WARRANTY; without even the implied warranty of -; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -; Library General Public License for more details. +; - Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. ; -; You should have received a copy of the GNU Library General Public -; License along with this library; if not, write to the -; Free Software Foundation, Inc., 59 Temple Place - Suite 330, -; Boston, MA 02111-1307, USA. +; - Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in the +; documentation and/or other materials provided with the distribution. +; +; - Neither the name of the Xiph.org Foundation nor the names of its +; contributors may be used to endorse or promote products derived from +; this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR +; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. %include "nasm.h" @@ -24,6 +36,7 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_ia32 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 +cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx cglobal FLAC__lpc_restore_signal_asm_ia32 @@ -58,10 +71,10 @@ cglobal FLAC__lpc_restore_signal_asm_ia32_mmx ; ALIGN 16 cident FLAC__lpc_compute_autocorrelation_asm_ia32 - ;[esp + 24] == autoc[] - ;[esp + 20] == lag - ;[esp + 16] == data_len - ;[esp + 12] == data[] + ;[esp + 28] == autoc[] + ;[esp + 24] == lag + ;[esp + 20] == data_len + ;[esp + 16] == data[] ;ASSERT(lag > 0) ;ASSERT(lag <= 33) @@ -70,21 +83,22 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 .begin: push esi push edi + push ebx ; for(coeff = 0; coeff < lag; coeff++) ; autoc[coeff] = 0.0; - mov edi, [esp + 24] ; edi == autoc - mov ecx, [esp + 20] ; ecx = # of dwords (=lag) of 0 to write + mov edi, [esp + 28] ; edi == autoc + mov ecx, [esp + 24] ; ecx = # of dwords (=lag) of 0 to write xor eax, eax rep stosd ; const unsigned limit = data_len - lag; - mov eax, [esp + 20] ; eax == lag - mov ecx, [esp + 16] + mov eax, [esp + 24] ; eax == lag + mov ecx, [esp + 20] sub ecx, eax ; ecx == limit - mov edi, [esp + 24] ; edi == autoc - mov esi, [esp + 12] ; esi == data + mov edi, [esp + 28] ; edi == autoc + mov esi, [esp + 16] ; esi == data inc ecx ; we are looping <= limit so we add one to the counter ; for(sample = 0; sample <= limit; sample++) { @@ -96,7 +110,11 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) lea edx, [eax + eax*2] neg edx - lea edx, [eax + edx*4 + .jumper1_0] + lea edx, [eax + edx*4 + .jumper1_0 - .get_eip1] + call .get_eip1 +.get_eip1: + pop ebx + add edx, ebx inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration @@ -253,7 +271,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 ; for(coeff = 0; coeff < data_len - sample; coeff++) ; autoc[coeff] += d * data[sample+coeff]; ; } - mov ecx, [esp + 20] ; ecx <- lag + mov ecx, [esp + 24] ; ecx <- lag dec ecx ; ecx <- lag - 1 jz near .end ; skip loop if 0 (i.e. lag == 1) @@ -262,7 +280,11 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 ; each iteration is 11 bytes so we need (-eax)*11, so we do (-12*eax + eax) lea edx, [eax + eax*2] neg edx - lea edx, [eax + edx*4 + .jumper2_0] + lea edx, [eax + edx*4 + .jumper2_0 - .get_eip2] + call .get_eip2 +.get_eip2: + pop ebx + add edx, ebx inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration inc edx ; compensate for the shorter opcode on the last iteration @@ -408,6 +430,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32 .loop2_end: .end: + pop ebx pop edi pop esi ret @@ -513,7 +536,7 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8 ; store autoc mov edx, [esp + 16] ; edx == autoc movups [edx], xmm5 - movups [edx + 4], xmm6 + movups [edx + 16], xmm6 .end: ret @@ -586,13 +609,134 @@ cident FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12 ; store autoc mov edx, [esp + 16] ; edx == autoc movups [edx], xmm5 - movups [edx + 4], xmm6 - movups [edx + 8], xmm7 + movups [edx + 16], xmm6 + movups [edx + 32], xmm7 .end: ret -;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) + align 16 +cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow + ;[ebp + 32] autoc + ;[ebp + 28] lag + ;[ebp + 24] data_len + ;[ebp + 20] data + + push ebp + push ebx + push esi + push edi + mov ebp, esp + + mov esi, [ebp + 20] + mov edi, [ebp + 24] + mov edx, [ebp + 28] + inc edx + and edx, byte -2 + mov eax, edx + neg eax + and esp, byte -8 + lea esp, [esp + 4 * eax] + mov ecx, edx + xor eax, eax +.loop0: + dec ecx + mov [esp + 4 * ecx], eax + jnz short .loop0 + + mov eax, edi + sub eax, edx + mov ebx, edx + and ebx, byte 1 + sub eax, ebx + lea ecx, [esi + 4 * eax - 12] + cmp esi, ecx + mov eax, esi + ja short .loop2_pre + align 16 ;4 nops +.loop1_i: + movd mm0, [eax] + movd mm2, [eax + 4] + movd mm4, [eax + 8] + movd mm6, [eax + 12] + mov ebx, edx + punpckldq mm0, mm0 + punpckldq mm2, mm2 + punpckldq mm4, mm4 + punpckldq mm6, mm6 + align 16 ;3 nops +.loop1_j: + sub ebx, byte 2 + movd mm1, [eax + 4 * ebx] + movd mm3, [eax + 4 * ebx + 4] + movd mm5, [eax + 4 * ebx + 8] + movd mm7, [eax + 4 * ebx + 12] + punpckldq mm1, mm3 + punpckldq mm3, mm5 + pfmul mm1, mm0 + punpckldq mm5, mm7 + pfmul mm3, mm2 + punpckldq mm7, [eax + 4 * ebx + 16] + pfmul mm5, mm4 + pfmul mm7, mm6 + pfadd mm1, mm3 + movq mm3, [esp + 4 * ebx] + pfadd mm5, mm7 + pfadd mm1, mm5 + pfadd mm3, mm1 + movq [esp + 4 * ebx], mm3 + jg short .loop1_j + + add eax, byte 16 + cmp eax, ecx + jb short .loop1_i + +.loop2_pre: + mov ebx, eax + sub eax, esi + shr eax, 2 + lea ecx, [esi + 4 * edi] + mov esi, ebx +.loop2_i: + movd mm0, [esi] + mov ebx, edi + sub ebx, eax + cmp ebx, edx + jbe short .loop2_j + mov ebx, edx +.loop2_j: + dec ebx + movd mm1, [esi + 4 * ebx] + pfmul mm1, mm0 + movd mm2, [esp + 4 * ebx] + pfadd mm1, mm2 + movd [esp + 4 * ebx], mm1 + + jnz short .loop2_j + + add esi, byte 4 + inc eax + cmp esi, ecx + jnz short .loop2_i + + mov edi, [ebp + 32] + mov edx, [ebp + 28] +.loop3: + dec edx + mov eax, [esp + 4 * edx] + mov [edi + 4 * edx], eax + jnz short .loop3 + + femms + + mov esp, ebp + pop edi + pop esi + pop ebx + pop ebp + ret + +;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) ; ; for(i = 0; i < data_len; i++) { ; sum = 0; @@ -685,7 +829,11 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32 .i_32: sub edi, esi neg eax - lea edx, [eax + eax * 8 + .jumper_0] + lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] + call .get_eip0 +.get_eip0: + pop eax + add edx, eax inc edx mov eax, [esp + 28] ; eax = qlp_coeff[] xor ebp, ebp @@ -1084,7 +1232,11 @@ cident FLAC__lpc_restore_signal_asm_ia32 .x87_32: sub esi, edi neg eax - lea edx, [eax + eax * 8 + .jumper_0] + lea edx, [eax + eax * 8 + .jumper_0 - .get_eip0] + call .get_eip0 +.get_eip0: + pop eax + add edx, eax inc edx ; compensate for the shorter opcode on the last iteration mov eax, [esp + 28] ; eax = qlp_coeff[] xor ebp, ebp