cglobal FLAC__lpc_compute_autocorrelation_asm_i386
cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse
+cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
+cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx
cglobal FLAC__lpc_restore_signal_asm_i386
cglobal FLAC__lpc_restore_signal_asm_i386_mmx
.end:
ret
+;void FLAC__lpc_compute_residual_from_qlp_coefficients(const int32 data[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 residual[])
+;
+; for(i = 0; i < data_len; i++) {
+; sum = 0;
+; for(j = 0; j < order; j++)
+; sum += qlp_coeff[j] * data[i-j-1];
+; residual[i] = data[i] - (sum >> lp_quantization);
+; }
+;
+ ALIGN 16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386:
+ ;[esp + 40] residual[]
+ ;[esp + 36] lp_quantization
+ ;[esp + 32] order
+ ;[esp + 28] qlp_coeff[]
+ ;[esp + 24] data_len
+ ;[esp + 20] data[]
+
+ push ebp
+ push ebx
+ push esi
+ push edi
+
+ mov esi, [esp + 20]
+ mov edi, [esp + 40]
+ mov eax, [esp + 32]
+ mov ebx, [esp + 24]
+
+.begin:
+ cmp eax, byte 1
+ jg short .i_1more
+
+ mov ecx, [esp + 28]
+ mov edx, [ecx]
+ mov eax, [esi - 4]
+ mov cl, [esp + 36]
+ ALIGN 16
+.i_1_loop_i:
+ imul eax, edx
+ sar eax, cl
+ neg eax
+ add eax, [esi]
+ mov [edi], eax
+ mov eax, [esi]
+ add edi, byte 4
+ add esi, byte 4
+ dec ebx
+ jnz .i_1_loop_i
+
+ jmp .end
+
+.i_1more:
+ cmp eax, byte 32 ; for order <= 32 there is a faster routine
+ jbe short .i_32
+
+ ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
+ ALIGN 16
+.i_32more_loop_i:
+ xor ebp, ebp
+ mov ecx, [esp + 32]
+ mov edx, ecx
+ shl edx, 2
+ add edx, [esp + 28]
+ neg ecx
+ ALIGN 16
+.i_32more_loop_j:
+ sub edx, byte 4
+ mov eax, [edx]
+ imul eax, [esi + 4 * ecx]
+ add ebp, eax
+ inc ecx
+ jnz short .i_32more_loop_j
+
+ mov cl, [esp + 36]
+ sar ebp, cl
+ neg ebp
+ add ebp, [esi]
+ mov [edi], ebp
+ add esi, byte 4
+ add edi, byte 4
+
+ dec ebx
+ jnz .i_32more_loop_i
+
+ jmp .end
+
+.i_32:
+ sub edi, esi
+ neg eax
+ lea edx, [eax + eax * 8 + .jumper_0]
+ inc edx
+ mov eax, [esp + 28]
+ xor ebp, ebp
+ jmp edx
+
+ mov ecx, [eax + 124]
+ imul ecx, [esi - 128]
+ add ebp, ecx
+ mov ecx, [eax + 120]
+ imul ecx, [esi - 124]
+ add ebp, ecx
+ mov ecx, [eax + 116]
+ imul ecx, [esi - 120]
+ add ebp, ecx
+ mov ecx, [eax + 112]
+ imul ecx, [esi - 116]
+ add ebp, ecx
+ mov ecx, [eax + 108]
+ imul ecx, [esi - 112]
+ add ebp, ecx
+ mov ecx, [eax + 104]
+ imul ecx, [esi - 108]
+ add ebp, ecx
+ mov ecx, [eax + 100]
+ imul ecx, [esi - 104]
+ add ebp, ecx
+ mov ecx, [eax + 96]
+ imul ecx, [esi - 100]
+ add ebp, ecx
+ mov ecx, [eax + 92]
+ imul ecx, [esi - 96]
+ add ebp, ecx
+ mov ecx, [eax + 88]
+ imul ecx, [esi - 92]
+ add ebp, ecx
+ mov ecx, [eax + 84]
+ imul ecx, [esi - 88]
+ add ebp, ecx
+ mov ecx, [eax + 80]
+ imul ecx, [esi - 84]
+ add ebp, ecx
+ mov ecx, [eax + 76]
+ imul ecx, [esi - 80]
+ add ebp, ecx
+ mov ecx, [eax + 72]
+ imul ecx, [esi - 76]
+ add ebp, ecx
+ mov ecx, [eax + 68]
+ imul ecx, [esi - 72]
+ add ebp, ecx
+ mov ecx, [eax + 64]
+ imul ecx, [esi - 68]
+ add ebp, ecx
+ mov ecx, [eax + 60]
+ imul ecx, [esi - 64]
+ add ebp, ecx
+ mov ecx, [eax + 56]
+ imul ecx, [esi - 60]
+ add ebp, ecx
+ mov ecx, [eax + 52]
+ imul ecx, [esi - 56]
+ add ebp, ecx
+ mov ecx, [eax + 48]
+ imul ecx, [esi - 52]
+ add ebp, ecx
+ mov ecx, [eax + 44]
+ imul ecx, [esi - 48]
+ add ebp, ecx
+ mov ecx, [eax + 40]
+ imul ecx, [esi - 44]
+ add ebp, ecx
+ mov ecx, [eax + 36]
+ imul ecx, [esi - 40]
+ add ebp, ecx
+ mov ecx, [eax + 32]
+ imul ecx, [esi - 36]
+ add ebp, ecx
+ mov ecx, [eax + 28]
+ imul ecx, [esi - 32]
+ add ebp, ecx
+ mov ecx, [eax + 24]
+ imul ecx, [esi - 28]
+ add ebp, ecx
+ mov ecx, [eax + 20]
+ imul ecx, [esi - 24]
+ add ebp, ecx
+ mov ecx, [eax + 16]
+ imul ecx, [esi - 20]
+ add ebp, ecx
+ mov ecx, [eax + 12]
+ imul ecx, [esi - 16]
+ add ebp, ecx
+ mov ecx, [eax + 8]
+ imul ecx, [esi - 12]
+ add ebp, ecx
+ mov ecx, [eax + 4]
+ imul ecx, [esi - 8]
+ add ebp, ecx
+ mov ecx, [eax] ;there is one byte missing
+ imul ecx, [esi - 4]
+ add ebp, ecx
+.jumper_0:
+
+ mov cl, [esp + 36]
+ sar ebp, cl
+ neg ebp
+ add ebp, [esi]
+ mov [edi + esi], ebp
+ add esi, byte 4
+
+ dec ebx
+ jz short .end
+ xor ebp, ebp
+ jmp edx
+
+.end:
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
+; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
+; the channel must be <= 16. Especially note that this routine cannot be used
+; for side-channel coded 16bps channels since the effective bps is 17.
+ ALIGN 16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx:
+ ;[esp + 40] residual[]
+ ;[esp + 36] lp_quantization
+ ;[esp + 32] order
+ ;[esp + 28] qlp_coeff[]
+ ;[esp + 24] data_len
+ ;[esp + 20] data[]
+
+ push ebp
+ push ebx
+ push esi
+ push edi
+
+ mov esi, [esp + 20]
+ mov edi, [esp + 40]
+ mov eax, [esp + 32]
+ mov ebx, [esp + 24]
+
+ dec ebx
+ test ebx, ebx
+ jz near .last_one
+
+ mov edx, [esp + 28]
+ movd mm6, [esp + 36]
+ mov ebp, esp
+
+ and esp, 0xfffffff8
+
+ xor ecx, ecx
+.copy_qlp_loop:
+ push word [edx + 4 * ecx]
+ inc ecx
+ cmp ecx, eax
+ jnz short .copy_qlp_loop
+
+ and ecx, 0x3
+ test ecx, ecx
+ je short .za_end
+ sub ecx, byte 4
+.za_loop:
+ push word 0
+ inc eax
+ inc ecx
+ jnz short .za_loop
+.za_end:
+
+ movq mm5, [esp + 2 * eax - 8]
+ movd mm4, [esi - 16]
+ punpckldq mm4, [esi - 12]
+ movd mm0, [esi - 8]
+ punpckldq mm0, [esi - 4]
+ packssdw mm4, mm0
+
+ cmp eax, byte 4
+ jnbe short .mmx_4more
+
+ align 16
+.mmx_4_loop_i:
+ movd mm1, [esi]
+ movq mm3, mm4
+ punpckldq mm1, [esi + 4]
+ psrlq mm4, 16
+ movq mm0, mm1
+ psllq mm0, 48
+ por mm4, mm0
+ movq mm2, mm4
+ psrlq mm4, 16
+ pxor mm0, mm0
+ punpckhdq mm0, mm1
+ pmaddwd mm3, mm5
+ pmaddwd mm2, mm5
+ psllq mm0, 16
+ por mm4, mm0
+ movq mm0, mm3
+ punpckldq mm3, mm2
+ punpckhdq mm0, mm2
+ paddd mm3, mm0
+ psrad mm3, mm6
+ psubd mm1, mm3
+ movd [edi], mm1
+ punpckhdq mm1, mm1
+ movd [edi + 4], mm1
+
+ add edi, byte 8
+ add esi, byte 8
+
+ sub ebx, 2
+ jg .mmx_4_loop_i
+ jmp .mmx_end
+
+.mmx_4more:
+ shl eax, 2
+ neg eax
+ add eax, byte 16
+
+ align 16
+.mmx_4more_loop_i:
+ movd mm1, [esi]
+ punpckldq mm1, [esi + 4]
+ movq mm3, mm4
+ psrlq mm4, 16
+ movq mm0, mm1
+ psllq mm0, 48
+ por mm4, mm0
+ movq mm2, mm4
+ psrlq mm4, 16
+ pxor mm0, mm0
+ punpckhdq mm0, mm1
+ pmaddwd mm3, mm5
+ pmaddwd mm2, mm5
+ psllq mm0, 16
+ por mm4, mm0
+
+ mov ecx, esi
+ add ecx, eax
+ mov edx, esp
+
+ align 16
+.mmx_4more_loop_j:
+ movd mm0, [ecx - 16]
+ movd mm7, [ecx - 8]
+ punpckldq mm0, [ecx - 12]
+ punpckldq mm7, [ecx - 4]
+ packssdw mm0, mm7
+ pmaddwd mm0, [edx]
+ punpckhdq mm7, mm7
+ paddd mm3, mm0
+ movd mm0, [ecx - 12]
+ punpckldq mm0, [ecx - 8]
+ punpckldq mm7, [ecx]
+ packssdw mm0, mm7
+ pmaddwd mm0, [edx]
+ paddd mm2, mm0
+
+ add edx, byte 8
+ add ecx, byte 16
+ cmp ecx, esi
+ jnz .mmx_4more_loop_j
+
+ movq mm0, mm3
+ punpckldq mm3, mm2
+ punpckhdq mm0, mm2
+ paddd mm3, mm0
+ psrad mm3, mm6
+ psubd mm1, mm3
+ movd [edi], mm1
+ punpckhdq mm1, mm1
+ movd [edi + 4], mm1
+
+ add edi, byte 8
+ add esi, byte 8
+
+ sub ebx, 2
+ jg near .mmx_4more_loop_i
+
+.mmx_end:
+ emms
+ mov esp, ebp
+.last_one:
+ mov eax, [esp + 32]
+ inc ebx
+ jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386.begin
+
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
; **********************************************************************
;
; void FLAC__lpc_restore_signal(const int32 residual[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 data[])