cglobal FLAC__lpc_compute_autocorrelation_asm_i386
cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse
cglobal FLAC__lpc_restore_signal_asm_i386
+cglobal FLAC__lpc_restore_signal_asm_i386_mmx
code_section
; data[i] = residual[i] + (sum >> lp_quantization);
; }
; }
+ ALIGN 16
FLAC__lpc_restore_signal_asm_i386:
- ; [esp + 20] == residual[]
- ; [esp + 24] == data_len
- ; [esp + 28] == qlp_coeff[]
- ; [esp + 32] == order
- ; [esp + 36] == lp_quantization
- ; [esp + 40] == data[]
+ ;[esp + 40] data[]
+ ;[esp + 36] lp_quantization
+ ;[esp + 32] order
+ ;[esp + 28] qlp_coeff[]
+ ;[esp + 24] data_len
+ ;[esp + 20] residual[]
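+	; (arguments start at [esp + 20]: the return address plus the four
+	; registers pushed below occupy the first 20 bytes of the frame)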
push ebp
push ebx
push esi
push edi
-
+
mov esi, [esp + 20]
- mov ebx, [esp + 24]
- mov eax, [esp + 32]
mov edi, [esp + 40]
-
+	mov eax, [esp + 32]		; eax = order
+	mov ebx, [esp + 24]		; ebx = data_len
+
+ cmp eax, byte 1
+ jg short .x87_1more
+
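+	; order == 1 fast path; in effect:
+	;	for(i = 0; i < data_len; i++)
+	;		data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
+	; eax carries data[i-1] across iterations, so the loop reloads nothing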
+	mov ecx, [esp + 28]
+	mov edx, [ecx]			; edx = qlp_coeff[0]
+	mov eax, [edi - 4]		; eax = data[-1]
+	mov cl, [esp + 36]		; cl = lp_quantization
+ ALIGN 16
+.x87_1_loop_i:
+ imul eax, edx
+ sar eax, cl
+ add eax, [esi]
+ mov [edi], eax
+ add esi, byte 4
+ add edi, byte 4
+ dec ebx
+ jnz .x87_1_loop_i
+
+ jmp .end
+
+.x87_1more:
cmp eax, byte 32 ; for order <= 32 there is a faster routine
- jbe short .b
+ jbe short .x87_32
; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
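+	; ecx counts up from -order to 0, so [edi + 4 * ecx] walks
+	; data[i - order] .. data[i - 1] while edx walks qlp_coeff[] from the top down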
ALIGN 16
-.loop_i_a
+.x87_32more_loop_i:
xor ebp, ebp
mov ecx, [esp + 32]
mov edx, ecx
shl edx, 2
add edx, [esp + 28]
neg ecx
- ALIGN 16
-.loop_j_a
- sub edx, 4
+ ALIGN 16
+.x87_32more_loop_j:
+ sub edx, byte 4
mov eax, [edx]
imul eax, [edi + 4 * ecx]
add ebp, eax
inc ecx
- jnz .loop_j_a
-
+ jnz short .x87_32more_loop_j
+
mov cl, [esp + 36]
sar ebp, cl
add ebp, [esi]
mov [edi], ebp
add edi, byte 4
add esi, byte 4
-
+
dec ebx
- jnz .loop_i_a
-
+ jnz .x87_32more_loop_i
+
jmp .end
-.b
+.x87_32:
sub esi, edi
neg eax
	lea edx, [eax + eax * 8 + .jumper_0]
	inc edx				; compensate for the shorter opcode on the last iteration
mov eax, [esp + 28]
xor ebp, ebp
jmp edx
-;.jumper_32
- mov ecx, [eax + 124] ;32
+
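+	; computed jump into a fully unrolled sum: each mov/imul/add group below
+	; assembles to 9 bytes (the last to 8, hence the inc edx above), so
+	; entering at .jumper_0 - 9 * order + 1 executes exactly `order` taps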
+ mov ecx, [eax + 124]
imul ecx, [edi - 128]
add ebp, ecx
mov ecx, [eax + 120]
imul ecx, [edi - 124]
add ebp, ecx
- mov ecx, [eax + 116] ;30
+ mov ecx, [eax + 116]
imul ecx, [edi - 120]
add ebp, ecx
	mov ecx, [eax + 112]
	imul ecx, [edi - 116]
	add ebp, ecx
	mov ecx, [eax + 108]
	imul ecx, [edi - 112]
	add ebp, ecx
	mov ecx, [eax + 104]
	imul ecx, [edi - 108]
	add ebp, ecx
	mov ecx, [eax + 100]
imul ecx, [edi - 104]
add ebp, ecx
- mov ecx, [eax + 96] ;25
+ mov ecx, [eax + 96]
imul ecx, [edi - 100]
add ebp, ecx
	mov ecx, [eax + 92]
	imul ecx, [edi - 96]
	add ebp, ecx
	mov ecx, [eax + 88]
	imul ecx, [edi - 92]
	add ebp, ecx
	mov ecx, [eax + 84]
	imul ecx, [edi - 88]
	add ebp, ecx
	mov ecx, [eax + 80]
imul ecx, [edi - 84]
add ebp, ecx
- mov ecx, [eax + 76] ;20
+ mov ecx, [eax + 76]
imul ecx, [edi - 80]
add ebp, ecx
	mov ecx, [eax + 72]
	imul ecx, [edi - 76]
	add ebp, ecx
	mov ecx, [eax + 68]
	imul ecx, [edi - 72]
	add ebp, ecx
	mov ecx, [eax + 64]
	imul ecx, [edi - 68]
	add ebp, ecx
	mov ecx, [eax + 60]
imul ecx, [edi - 64]
add ebp, ecx
- mov ecx, [eax + 56] ;15
+ mov ecx, [eax + 56]
imul ecx, [edi - 60]
add ebp, ecx
	mov ecx, [eax + 52]
	imul ecx, [edi - 56]
	add ebp, ecx
	mov ecx, [eax + 48]
	imul ecx, [edi - 52]
	add ebp, ecx
	mov ecx, [eax + 44]
	imul ecx, [edi - 48]
	add ebp, ecx
	mov ecx, [eax + 40]
imul ecx, [edi - 44]
add ebp, ecx
- mov ecx, [eax + 36] ;10
+ mov ecx, [eax + 36]
imul ecx, [edi - 40]
add ebp, ecx
	mov ecx, [eax + 32]
	imul ecx, [edi - 36]
	add ebp, ecx
	mov ecx, [eax + 28]
	imul ecx, [edi - 32]
	add ebp, ecx
	mov ecx, [eax + 24]
	imul ecx, [edi - 28]
	add ebp, ecx
	mov ecx, [eax + 20]
imul ecx, [edi - 24]
add ebp, ecx
- mov ecx, [eax + 16] ;5
+ mov ecx, [eax + 16]
imul ecx, [edi - 20]
add ebp, ecx
	mov ecx, [eax + 12]
	imul ecx, [edi - 16]
	add ebp, ecx
	mov ecx, [eax + 8]
	imul ecx, [edi - 12]
	add ebp, ecx
	mov ecx, [eax + 4]
	imul ecx, [edi - 8]
	add ebp, ecx
	mov ecx, [eax]			; one byte missing: [eax] encodes one byte shorter than [eax + disp8] (hence the inc edx above)
imul ecx, [edi - 4]
add ebp, ecx
-.jumper_0
+.jumper_0:
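+	; esi was rebased with `sub esi, edi` above, so [esi + edi] is &residual[i]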
mov cl, [esp + 36]
sar ebp, cl
add ebp, [esi + edi]
mov [edi], ebp
add edi, byte 4
-
+
dec ebx
jz short .end
xor ebp, ebp
jmp edx
-
-.end
+
+.end:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
+; the channel must be <= 16. Especially note that this routine cannot be used
+; for side-channel coded 16bps channels since the effective bps is 17.
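+; (a side channel holds the difference of two 16-bit channels, so its
+; effective precision is 17 bits)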
+ ALIGN 16
+FLAC__lpc_restore_signal_asm_i386_mmx:
+ ;[esp + 40] data[]
+ ;[esp + 36] lp_quantization
+ ;[esp + 32] order
+ ;[esp + 28] qlp_coeff[]
+ ;[esp + 24] data_len
+ ;[esp + 20] residual[]
+
+ push ebp
+ push ebx
+ push esi
+ push edi
+
+	mov esi, [esp + 20]		; esi = residual[]
+	mov edi, [esp + 40]		; edi = data[]
+	mov eax, [esp + 32]		; eax = order
+	mov ebx, [esp + 24]		; ebx = data_len
+
+	mov edx, [esp + 28]		; edx = qlp_coeff[]
+	movd mm6, [esp + 36]		; mm6 = lp_quantization
+	mov ebp, esp			; save esp before aligning it
+
+ and esp, 0xfffffff8
+
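+	; copy qlp_coeff[] onto the 8-byte-aligned stack narrowed to 16-bit
+	; words, so one pmaddwd can multiply-accumulate four taps at once;
+	; `push word` of each coefficient's low half performs the truncation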
+ xor ecx, ecx
+.copy_qlp_loop:
+ push word [edx + 4 * ecx]
+ inc ecx
+ cmp ecx, eax
+ jnz short .copy_qlp_loop
+
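+	; pad the copied taps with zeros up to a multiple of four, growing eax
+	; to the padded order, so the vector loops never need a scalar tail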
+ and ecx, 0x3
+ test ecx, ecx
+ je short .za_end
+ sub ecx, byte 4
+.za_loop:
+ push word 0
+ inc eax
+ inc ecx
+ jnz short .za_loop
+.za_end:
+
+ movq mm5, [esp + 2 * eax - 8]
+ movd mm4, [edi - 16]
+ punpckldq mm4, [edi - 12]
+ movd mm0, [edi - 8]
+ punpckldq mm0, [edi - 4]
+ packssdw mm4, mm0
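+	; mm5 = q[3]..q[0] and mm4 = data[i-4]..data[i-1], both packed to words
+	; low-to-high, so pmaddwd pairs every tap with its history sample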
+
+ cmp eax, byte 4
+ jnbe short .mmx_4more
+
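+	; order <= 4: the whole filter state stays in mm4/mm5; after each output
+	; sample the psllq/psrlq/por sequence shifts the new value into the
+	; window instead of reloading it from memory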
+	ALIGN 16
+.mmx_4_loop_i:
+ movq mm7, mm4
+ pmaddwd mm7, mm5
+ movq mm0, mm7
+ punpckhdq mm7, mm7
+ paddd mm7, mm0
+ psrad mm7, mm6
+ movd mm1, [esi]
+ paddd mm7, mm1
+ movd [edi], mm7
+ psllq mm7, 48
+ psrlq mm4, 16
+ por mm4, mm7
+
+ add esi, byte 4
+ add edi, byte 4
+
+ dec ebx
+ jnz .mmx_4_loop_i
+ jmp .mmx_end
+.mmx_4more:
+ shl eax, 2
+ neg eax
+ add eax, byte 16
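+	; order > 4: eax = 16 - 4 * (padded order), so edi + eax points just past
+	; the oldest four history samples; the preloaded mm7 covers the newest
+	; four taps and the j loop accumulates the rest, oldest block first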
+	ALIGN 16
+.mmx_4more_loop_i:
+ mov ecx, edi
+ add ecx, eax
+ mov edx, esp
+
+ movq mm7, mm4
+ pmaddwd mm7, mm5
+
+	ALIGN 16
+.mmx_4more_loop_j:
+ movd mm0, [ecx - 16]
+ punpckldq mm0, [ecx - 12]
+ movd mm1, [ecx - 8]
+ punpckldq mm1, [ecx - 4]
+ packssdw mm0, mm1
+ pmaddwd mm0, [edx]
+ paddd mm7, mm0
+
+ add edx, byte 8
+ add ecx, byte 16
+ cmp ecx, edi
+ jnz .mmx_4more_loop_j
+
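+	; fold the two partial dword sums in mm7, scale by lp_quantization and
+	; add residual[i], exactly as in the order <= 4 loop above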
+ movq mm0, mm7
+ punpckhdq mm7, mm7
+ paddd mm7, mm0
+ psrad mm7, mm6
+ movd mm1, [esi]
+ paddd mm7, mm1
+ movd [edi], mm7
+ psllq mm7, 48
+ psrlq mm4, 16
+ por mm4, mm7
+
+ add esi, byte 4
+ add edi, byte 4
+
+ dec ebx
+ jnz short .mmx_4more_loop_i
+.mmx_end:
+ emms
+	mov esp, ebp			; restore esp (undo the alignment and pushes)
+
pop edi
pop esi
pop ebx