From 2a389f41ca5f9a06a48d2415ea088cdecede2f88 Mon Sep 17 00:00:00 2001
From: Josh Coalson
Date: Thu, 31 May 2001 16:17:32 +0000
Subject: [PATCH] add several SSE versions of FLAC__lpc_compute_autocorrelation for different lags

Replace the single lag == 8 SSE routine with three routines covering
lag <= 4, lag <= 8 and lag <= 12, and document the argument assertions
of the existing i386/MMX routines.

---
 src/libFLAC/i386/lpc_asm.s        | 170 +++++++++++++++++++++++++++++++++-----
 src/libFLAC/include/private/lpc.h |   4 +-
 2 files changed, 153 insertions(+), 21 deletions(-)

diff --git a/src/libFLAC/i386/lpc_asm.s b/src/libFLAC/i386/lpc_asm.s
index 99dcd13..6dbec51 100644
--- a/src/libFLAC/i386/lpc_asm.s
+++ b/src/libFLAC/i386/lpc_asm.s
@@ -21,7 +21,9 @@
 	data_section
 
 cglobal FLAC__lpc_compute_autocorrelation_asm_i386
-cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse
+cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_4
+cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_8
+cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_12
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx
 cglobal FLAC__lpc_restore_signal_asm_i386
@@ -56,12 +58,14 @@ cglobal FLAC__lpc_restore_signal_asm_i386_mmx
 ;
 	ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_i386
-
 	;[esp + 32] == autoc[]
 	;[esp + 28] == lag
 	;[esp + 24] == data_len
 	;[esp + 20] == data[]
 
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= data_len)
+
 .begin:
 	push	ebp
 	push	ebx
@@ -205,22 +209,68 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386
 	pop	ebp
 	ret
 
-;@@@ NOTE: this SSE version is not even tested yet and only works for lag == 8
 	ALIGN 16
-cident FLAC__lpc_compute_autocorrelation_asm_i386_sse
+cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_4
+	;[esp + 16] == autoc[]
+	;[esp + 12] == lag
+	;[esp + 8] == data_len
+	;[esp + 4] == data[]
+
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 4)	; NOTE: all 4 floats of autoc[] are stored regardless of lag
+	;ASSERT(lag <= data_len)
+
+	;	for(coeff = 0; coeff < lag; coeff++)
+	;		autoc[coeff] = 0.0;
+	xorps	xmm5, xmm5	; xmm5 accumulates autoc[3],autoc[2],autoc[1],autoc[0]
+	mov	edx, [esp + 8]	; edx == data_len
+	mov	eax, [esp + 4]	; eax == &data[sample] <- &data[0]
+
+	movss	xmm0, [eax]	; xmm0 = 0,0,0,data[0]
+	add	eax, 4
+	movaps	xmm2, xmm0	; xmm2 = 0,0,0,data[0]
+	shufps	xmm0, xmm0, 0	; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+.warmup:	; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
+	mulps	xmm0, xmm2	; xmm0 = xmm0 * xmm2
+	addps	xmm5, xmm0	; xmm5 += xmm0 * xmm2
+	dec	edx
+	jz	.loop_end	; data_len == 1: the warmup sample was the only one
+	ALIGN 16
+.loop_start:
+	; start by reading the next sample
+	movss	xmm0, [eax]	; xmm0 = 0,0,0,data[sample]
+	add	eax, 4
+	shufps	xmm0, xmm0, 0	; xmm0 = data[sample],data[sample],data[sample],data[sample]
+	shufps	xmm2, xmm2, 93h	; 93h=2-1-0-3 => xmm2 gets rotated left by one float
+	movss	xmm2, xmm0	; low float of the history window <- data[sample]
+	mulps	xmm0, xmm2	; xmm0 = xmm0 * xmm2
+	addps	xmm5, xmm0	; xmm5 += xmm0 * xmm2
+	dec	edx
+	jnz	.loop_start
+.loop_end:
+	; store autoc
+	mov	edx, [esp + 16]	; edx == autoc
+	movups	[edx], xmm5	; autoc[0..3]
+
+.end:
+	ret
+
+	ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_8
 
 	;[esp + 16] == autoc[]
 	;[esp + 12] == lag
 	;[esp + 8] == data_len
 	;[esp + 4] == data[]
 
-	cmp	[esp + 12], 8
-	jne	near FLAC__lpc_compute_autocorrelation_asm_i386.begin
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 8)	; NOTE: all 8 floats of autoc[] are stored regardless of lag
+	;ASSERT(lag <= data_len)
 
 	;	for(coeff = 0; coeff < lag; coeff++)
 	;		autoc[coeff] = 0.0;
+	xorps	xmm5, xmm5
 	xorps	xmm6, xmm6
-	xorps	xmm7, xmm7
 
 	mov	edx, [esp + 8]	; edx == data_len
 	mov	eax, [esp + 4]	; eax == &data[sample] <- &data[0]
@@ -228,20 +278,18 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386_sse
 	movss	xmm0, [eax]	; xmm0 = 0,0,0,data[0]
 	add	eax, 4
 	movaps	xmm2, xmm0	; xmm2 = 0,0,0,data[0]
-	shufps	xmm0, xmm0, 0	; xmm0 = data[0],data[0],data[0],data[0]
-	movaps	xmm1, xmm0	; xmm1 = data[0],data[0],data[0],data[0]
+	shufps	xmm0, xmm0, 0	; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+	movaps	xmm1, xmm0	; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
 	xorps	xmm3, xmm3	; xmm3 = 0,0,0,0
 .warmup:	; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
 	mulps	xmm0, xmm2
 	mulps	xmm1, xmm3	; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
-	addps	xmm6, xmm0
-	addps	xmm7, xmm1	; xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2
+	addps	xmm5, xmm0
+	addps	xmm6, xmm1	; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 	dec	edx
-	;* there's no need to even check for this because we know that lag == 8
-	;* and data_len >= lag, so our 1-sample warmup cannot finish the loop
-	; jz .loop_end
+	jz	.loop_end	; lag may now be < 8, so data_len == 1 is possible
 	ALIGN 16
-.loop_8:
+.loop_start:
 	; start by reading the next sample
 	movss	xmm0, [eax]	; xmm0 = 0,0,0,data[sample]
 	; here we reorder the instructions; see the (#) indexes for a logical order
@@ -254,15 +302,89 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386_sse
 	movss	xmm2, xmm0	; (6)
 	mulps	xmm1, xmm3	; (8)
 	mulps	xmm0, xmm2	; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
-	addps	xmm7, xmm1	; (10)
-	addps	xmm6, xmm0	; (9) xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2
+	addps	xmm6, xmm1	; (10)
+	addps	xmm5, xmm0	; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
 	dec	edx
-	jnz	.loop_8
+	jnz	.loop_start
 .loop_end:
 	; store autoc
 	mov	edx, [esp + 16]	; edx == autoc
-	movups	[edx], xmm6
-	movups	[edx + 4], xmm7
+	movups	[edx], xmm5	; autoc[0..3]
+	movups	[edx + 16], xmm6	; autoc[4..7]: movups stores 16 bytes, so the byte offset is 16
+
+.end:
+	ret
+
+	ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_12
+	;[esp + 16] == autoc[]
+	;[esp + 12] == lag
+	;[esp + 8] == data_len
+	;[esp + 4] == data[]
+
+	;ASSERT(lag > 0)
+	;ASSERT(lag <= 12)	; NOTE: all 12 floats of autoc[] are stored regardless of lag
+	;ASSERT(lag <= data_len)
+
+	;	for(coeff = 0; coeff < lag; coeff++)
+	;		autoc[coeff] = 0.0;
+	xorps	xmm5, xmm5
+	xorps	xmm6, xmm6
+	xorps	xmm7, xmm7
+
+	mov	edx, [esp + 8]	; edx == data_len
+	mov	eax, [esp + 4]	; eax == &data[sample] <- &data[0]
+
+	movss	xmm0, [eax]	; xmm0 = 0,0,0,data[0]
+	add	eax, 4
+	movaps	xmm2, xmm0	; xmm2 = 0,0,0,data[0]
+	shufps	xmm0, xmm0, 0	; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+	xorps	xmm3, xmm3	; xmm3 = 0,0,0,0
+	xorps	xmm4, xmm4	; xmm4 = 0,0,0,0
+.warmup:	; xmm4:xmm3:xmm2 == data[sample-11],data[sample-10],...,data[sample]
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm2
+	addps	xmm5, xmm1
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm3
+	addps	xmm6, xmm1
+	mulps	xmm0, xmm4
+	addps	xmm7, xmm0	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
+	dec	edx
+	jz	.loop_end	; data_len == 1: the warmup sample was the only one
+	ALIGN 16
+.loop_start:
+	; start by reading the next sample
+	movss	xmm0, [eax]	; xmm0 = 0,0,0,data[sample]
+	add	eax, 4
+	shufps	xmm0, xmm0, 0	; xmm0 = data[sample],data[sample],data[sample],data[sample]
+
+	; shift xmm4:xmm3:xmm2 left by one float
+	shufps	xmm2, xmm2, 93h	; 93h=2-1-0-3 => xmm2 gets rotated left by one float
+	shufps	xmm3, xmm3, 93h	; 93h=2-1-0-3 => xmm3 gets rotated left by one float
+	shufps	xmm4, xmm4, 93h	; 93h=2-1-0-3 => xmm4 gets rotated left by one float
+	movss	xmm4, xmm3	; carry the float rotated out of xmm3 into xmm4
+	movss	xmm3, xmm2	; carry the float rotated out of xmm2 into xmm3
+	movss	xmm2, xmm0	; low float of the history window <- data[sample]
+
+	; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm2
+	addps	xmm5, xmm1
+	movaps	xmm1, xmm0
+	mulps	xmm1, xmm3
+	addps	xmm6, xmm1
+	mulps	xmm0, xmm4
+	addps	xmm7, xmm0
+
+	dec	edx
+	jnz	.loop_start
+.loop_end:
+	; store autoc
+	mov	edx, [esp + 16]	; edx == autoc
+	movups	[edx], xmm5	; autoc[0..3]
+	movups	[edx + 16], xmm6	; autoc[4..7]: movups stores 16 bytes, so the byte offset is 16
+	movups	[edx + 32], xmm7	; autoc[8..11]: byte offset 32
 
 .end:
 	ret
@@ -285,6 +407,8 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
 	;[esp + 24] data_len
 	;[esp + 20] data[]
 
+	;ASSERT(order > 0)
+
 	push	ebp
 	push	ebx
 	push	esi
@@ -493,6 +617,8 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx
 	;[esp + 24] data_len
 	;[esp + 20] data[]
 
+	;ASSERT(order > 0)
+
 	push	ebp
 	push	ebx
 	push	esi
@@ -682,6 +808,8 @@ cident FLAC__lpc_restore_signal_asm_i386
 	;[esp + 24] data_len
 	;[esp + 20] residual[]
 
+	;ASSERT(order > 0)
+
 	push	ebp
 	push	ebx
 	push	esi
@@ -887,6 +1015,8 @@ cident FLAC__lpc_restore_signal_asm_i386_mmx
 	;[esp + 24] data_len
 	;[esp + 20] residual[]
 
+	;ASSERT(order > 0)
+
 	push	ebp
 	push	ebx
 	push	esi
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index c65d00a..4838b32 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -41,7 +41,9 @@ void FLAC__lpc_compute_autocorrelation(const real data[], unsigned data_len, uns
 #ifdef FLAC__CPU_IA32
 #ifdef FLAC__HAS_NASM
 void FLAC__lpc_compute_autocorrelation_asm_i386(const real data[], unsigned data_len, unsigned lag, real autoc[]);
-void FLAC__lpc_compute_autocorrelation_asm_i386_sse(const real data[], unsigned data_len, unsigned lag, real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_i386_sse_4(const real data[], unsigned data_len, unsigned lag, real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_i386_sse_8(const real data[], unsigned data_len, unsigned lag, real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_i386_sse_12(const real data[], unsigned data_len, unsigned lag, real autoc[]);
 #endif
 #endif
 #endif
-- 
2.7.4