add several SSE versions of FLAC__lpc_compute_autocorrelation for different lags
authorJosh Coalson <jcoalson@users.sourceforce.net>
Thu, 31 May 2001 16:17:32 +0000 (16:17 +0000)
committerJosh Coalson <jcoalson@users.sourceforce.net>
Thu, 31 May 2001 16:17:32 +0000 (16:17 +0000)
src/libFLAC/i386/lpc_asm.s
src/libFLAC/include/private/lpc.h

index 99dcd13..6dbec51 100644 (file)
@@ -21,7 +21,9 @@
        data_section
 
 cglobal FLAC__lpc_compute_autocorrelation_asm_i386
-cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse
+cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_4
+cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_8
+cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse_12
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
 cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx
 cglobal FLAC__lpc_restore_signal_asm_i386
@@ -56,12 +58,14 @@ cglobal FLAC__lpc_restore_signal_asm_i386_mmx
 ;
        ALIGN 16
 cident FLAC__lpc_compute_autocorrelation_asm_i386
-
        ;[esp + 32] == autoc[]
        ;[esp + 28] == lag
        ;[esp + 24] == data_len
        ;[esp + 20] == data[]
 
+       ;ASSERT(lag > 0)
+       ;ASSERT(lag <= data_len)
+
 .begin:
        push    ebp
        push    ebx
@@ -205,22 +209,68 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386
        pop     ebp
        ret
 
-;@@@ NOTE: this SSE version is not even tested yet and only works for lag == 8
        ALIGN 16
-cident FLAC__lpc_compute_autocorrelation_asm_i386_sse
+cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_4
+       ;[esp + 16] == autoc[]
+       ;[esp + 12] == lag
+       ;[esp + 8] == data_len
+       ;[esp + 4] == data[]
+
+       ;ASSERT(lag > 0)
+       ;ASSERT(lag <= 4)
+       ;ASSERT(lag <= data_len)
+
+       ;       for(coeff = 0; coeff < lag; coeff++)
+       ;               autoc[coeff] = 0.0;
+       xorps   xmm5, xmm5
 
+       mov     edx, [esp + 8]                  ; edx == data_len
+       mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
+
+       movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
+       add     eax, 4
+       movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
+       shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+.warmup:                                       ; xmm2 == data[sample-3],data[sample-2],data[sample-1],data[sample]
+       mulps   xmm0, xmm2                      ; xmm0 = xmm0 * xmm2
+       addps   xmm5, xmm0                      ; xmm5 += xmm0 * xmm2
+       dec     edx
+       jz      .loop_end
+       ALIGN 16
+.loop_start:
+       ; start by reading the next sample
+       movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
+       add     eax, 4
+       shufps  xmm0, xmm0, 0                   ; xmm0 = data[sample],data[sample],data[sample],data[sample]
+       shufps  xmm2, xmm2, 93h                 ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
+       movss   xmm2, xmm0
+       mulps   xmm0, xmm2                      ; xmm0 = xmm0 * xmm2
+       addps   xmm5, xmm0                      ; xmm5 += xmm0 * xmm2
+       dec     edx
+       jnz     .loop_start
+.loop_end:
+       ; store autoc
+       mov     edx, [esp + 16]                 ; edx == autoc
+       movups  [edx], xmm5
+
+.end:
+       ret
+
+       ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_8
        ;[esp + 16] == autoc[]
        ;[esp + 12] == lag
        ;[esp + 8] == data_len
        ;[esp + 4] == data[]
 
-       cmp     [esp + 12], 8
-       jne     near FLAC__lpc_compute_autocorrelation_asm_i386.begin
+       ;ASSERT(lag > 0)
+       ;ASSERT(lag <= 8)
+       ;ASSERT(lag <= data_len)
 
        ;       for(coeff = 0; coeff < lag; coeff++)
        ;               autoc[coeff] = 0.0;
+       xorps   xmm5, xmm5
        xorps   xmm6, xmm6
-       xorps   xmm7, xmm7
 
        mov     edx, [esp + 8]                  ; edx == data_len
        mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
@@ -228,20 +278,18 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386_sse
        movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
        add     eax, 4
        movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
-       shufps  xmm0, xmm0, 0                   ; xmm0 = data[0],data[0],data[0],data[0]
-       movaps  xmm1, xmm0                      ; xmm1 = data[0],data[0],data[0],data[0]
+       shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+       movaps  xmm1, xmm0                      ; xmm1 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
        xorps   xmm3, xmm3                      ; xmm3 = 0,0,0,0
 .warmup:                                       ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
        mulps   xmm0, xmm2
        mulps   xmm1, xmm3                      ; xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
-       addps   xmm6, xmm0
-       addps   xmm7, xmm1                      ; xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2
+       addps   xmm5, xmm0
+       addps   xmm6, xmm1                      ; xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
        dec     edx
-       ;* there's no need to even check for this because we know that lag == 8
-       ;* and data_len >= lag, so our 1-sample warmup cannot finish the loop
-       ; jz    .loop_end
+       jz      .loop_end
        ALIGN 16
-.loop_8:
+.loop_start:
        ; start by reading the next sample
        movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
        ; here we reorder the instructions; see the (#) indexes for a logical order
@@ -254,15 +302,89 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386_sse
        movss   xmm2, xmm0                      ; (6)
        mulps   xmm1, xmm3                      ; (8)
        mulps   xmm0, xmm2                      ; (7) xmm1:xmm0 = xmm1:xmm0 * xmm3:xmm2
-       addps   xmm7, xmm1                      ; (10)
-       addps   xmm6, xmm0                      ; (9) xmm7:xmm6 += xmm1:xmm0 * xmm3:xmm2
+       addps   xmm6, xmm1                      ; (10)
+       addps   xmm5, xmm0                      ; (9) xmm6:xmm5 += xmm1:xmm0 * xmm3:xmm2
        dec     edx
-       jnz     .loop_8
+       jnz     .loop_start
 .loop_end:
        ; store autoc
        mov     edx, [esp + 16]                 ; edx == autoc
-       movups  [edx], xmm6
-       movups  [edx + 4], xmm7
+       movups  [edx], xmm5
+       movups  [edx + 4], xmm6
+
+.end:
+       ret
+
+       ALIGN 16
+cident FLAC__lpc_compute_autocorrelation_asm_i386_sse_lag_12
+       ;[esp + 16] == autoc[]
+       ;[esp + 12] == lag
+       ;[esp + 8] == data_len
+       ;[esp + 4] == data[]
+
+       ;ASSERT(lag > 0)
+       ;ASSERT(lag <= 12)
+       ;ASSERT(lag <= data_len)
+
+       ;       for(coeff = 0; coeff < lag; coeff++)
+       ;               autoc[coeff] = 0.0;
+       xorps   xmm5, xmm5
+       xorps   xmm6, xmm6
+       xorps   xmm7, xmm7
+
+       mov     edx, [esp + 8]                  ; edx == data_len
+       mov     eax, [esp + 4]                  ; eax == &data[sample] <- &data[0]
+
+       movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[0]
+       add     eax, 4
+       movaps  xmm2, xmm0                      ; xmm2 = 0,0,0,data[0]
+       shufps  xmm0, xmm0, 0                   ; xmm0 == data[sample],data[sample],data[sample],data[sample] = data[0],data[0],data[0],data[0]
+       xorps   xmm3, xmm3                      ; xmm3 = 0,0,0,0
+       xorps   xmm4, xmm4                      ; xmm4 = 0,0,0,0
+.warmup:                                       ; xmm3:xmm2 == data[sample-7],data[sample-6],...,data[sample]
+       movaps  xmm1, xmm0
+       mulps   xmm1, xmm2
+       addps   xmm5, xmm1
+       movaps  xmm1, xmm0
+       mulps   xmm1, xmm3
+       addps   xmm6, xmm1
+       mulps   xmm0, xmm4
+       addps   xmm7, xmm0                      ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm4:xmm3:xmm2
+       dec     edx
+       jz      .loop_end
+       ALIGN 16
+.loop_start:
+       ; start by reading the next sample
+       movss   xmm0, [eax]                     ; xmm0 = 0,0,0,data[sample]
+       add     eax, 4
+       shufps  xmm0, xmm0, 0                   ; xmm0 = data[sample],data[sample],data[sample],data[sample]
+
+       ; shift xmm4:xmm3:xmm2 left by one float
+       shufps  xmm2, xmm2, 93h                 ; 93h=2-1-0-3 => xmm2 gets rotated left by one float
+       shufps  xmm3, xmm3, 93h                 ; 93h=2-1-0-3 => xmm3 gets rotated left by one float
+       shufps  xmm4, xmm4, 93h                 ; 93h=2-1-0-3 => xmm4 gets rotated left by one float
+       movss   xmm4, xmm3
+       movss   xmm3, xmm2
+       movss   xmm2, xmm0
+
+       ; xmm7:xmm6:xmm5 += xmm0:xmm0:xmm0 * xmm3:xmm3:xmm2
+       movaps  xmm1, xmm0
+       mulps   xmm1, xmm2
+       addps   xmm5, xmm1
+       movaps  xmm1, xmm0
+       mulps   xmm1, xmm3
+       addps   xmm6, xmm1
+       mulps   xmm0, xmm4
+       addps   xmm7, xmm0
+
+       dec     edx
+       jnz     .loop_start
+.loop_end:
+       ; store autoc
+       mov     edx, [esp + 16]                 ; edx == autoc
+       movups  [edx], xmm5
+       movups  [edx + 4], xmm6
+       movups  [edx + 8], xmm7
 
 .end:
        ret
@@ -285,6 +407,8 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
        ;[esp + 24]     data_len
        ;[esp + 20]     data[]
 
+       ;ASSERT(order > 0)
+
        push    ebp
        push    ebx
        push    esi
@@ -493,6 +617,8 @@ cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx
        ;[esp + 24]     data_len
        ;[esp + 20]     data[]
 
+       ;ASSERT(order > 0)
+
        push    ebp
        push    ebx
        push    esi
@@ -682,6 +808,8 @@ cident FLAC__lpc_restore_signal_asm_i386
        ;[esp + 24]     data_len
        ;[esp + 20]     residual[]
 
+       ;ASSERT(order > 0)
+
        push    ebp
        push    ebx
        push    esi
@@ -887,6 +1015,8 @@ cident FLAC__lpc_restore_signal_asm_i386_mmx
        ;[esp + 24]     data_len
        ;[esp + 20]     residual[]
 
+       ;ASSERT(order > 0)
+
        push    ebp
        push    ebx
        push    esi
index c65d00a..4838b32 100644 (file)
@@ -41,7 +41,9 @@ void FLAC__lpc_compute_autocorrelation(const real data[], unsigned data_len, uns
 #ifdef FLAC__CPU_IA32
 #ifdef FLAC__HAS_NASM
 void FLAC__lpc_compute_autocorrelation_asm_i386(const real data[], unsigned data_len, unsigned lag, real autoc[]);
-void FLAC__lpc_compute_autocorrelation_asm_i386_sse(const real data[], unsigned data_len, unsigned lag, real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_i386_sse_4(const real data[], unsigned data_len, unsigned lag, real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_i386_sse_8(const real data[], unsigned data_len, unsigned lag, real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_i386_sse_12(const real data[], unsigned data_len, unsigned lag, real autoc[]);
 #endif
 #endif
 #endif