From: Josh Coalson
Date: Tue, 29 May 2001 19:40:57 +0000 (+0000)
Subject: add miroslav's versions of FLAC__lpc_compute_residual_from_qlp_coefficients
X-Git-Tag: 1.2.0~2335
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0d624c7648d2564c467881874eda7483ef328080;p=platform%2Fupstream%2Fflac.git

add miroslav's versions of FLAC__lpc_compute_residual_from_qlp_coefficients
---

diff --git a/src/libFLAC/i386/lpc_asm.s b/src/libFLAC/i386/lpc_asm.s
index bbe7561..9a47ea1 100644
--- a/src/libFLAC/i386/lpc_asm.s
+++ b/src/libFLAC/i386/lpc_asm.s
@@ -22,6 +22,8 @@ cglobal FLAC__lpc_compute_autocorrelation_asm_i386
 cglobal FLAC__lpc_compute_autocorrelation_asm_i386_sse
+cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386
+cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx
 cglobal FLAC__lpc_restore_signal_asm_i386
 cglobal FLAC__lpc_restore_signal_asm_i386_mmx
@@ -268,6 +270,391 @@ cident FLAC__lpc_compute_autocorrelation_asm_i386_sse:
 .end:
         ret
+
+;void FLAC__lpc_compute_residual_from_qlp_coefficients(const int32 data[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 residual[])
+;
+; for(i = 0; i < data_len; i++) {
+;        sum = 0;
+;        for(j = 0; j < order; j++)
+;                sum += qlp_coeff[j] * data[i-j-1];
+;        residual[i] = data[i] - (sum >> lp_quantization);
+; }
+;
+        ALIGN 16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386:
+        ;[esp + 40]        residual[]
+        ;[esp + 36]        lp_quantization
+        ;[esp + 32]        order
+        ;[esp + 28]        qlp_coeff[]
+        ;[esp + 24]        data_len
+        ;[esp + 20]        data[]
+
+        push ebp
+        push ebx
+        push esi
+        push edi
+
+        mov esi, [esp + 20]
+        mov edi, [esp + 40]
+        mov eax, [esp + 32]
+        mov ebx, [esp + 24]
+
+.begin:
+        cmp eax, byte 1
+        jg short .i_1more
+
+        mov ecx, [esp + 28]
+        mov edx, [ecx]
+        mov eax, [esi - 4]
+        mov cl, [esp + 36]
+        ALIGN 16
+.i_1_loop_i:
+        imul eax, edx
+        sar eax, cl
+        neg eax
+        add eax, [esi]
+        mov [edi], eax
+        mov eax, [esi]
+        add edi, byte 4
+        add esi, byte 4
+        dec ebx
+        jnz .i_1_loop_i
+
+        jmp .end
+
+.i_1more:
+        cmp eax, byte 32        ; for order <= 32 there is a faster routine
+        jbe short .i_32
+
+        ; This version is here just for completeness, since FLAC__MAX_LPC_ORDER == 32
+        ALIGN 16
+.i_32more_loop_i:
+        xor ebp, ebp
+        mov ecx, [esp + 32]
+        mov edx, ecx
+        shl edx, 2
+        add edx, [esp + 28]
+        neg ecx
+        ALIGN 16
+.i_32more_loop_j:
+        sub edx, byte 4
+        mov eax, [edx]
+        imul eax, [esi + 4 * ecx]
+        add ebp, eax
+        inc ecx
+        jnz short .i_32more_loop_j
+
+        mov cl, [esp + 36]
+        sar ebp, cl
+        neg ebp
+        add ebp, [esi]
+        mov [edi], ebp
+        add esi, byte 4
+        add edi, byte 4
+
+        dec ebx
+        jnz .i_32more_loop_i
+
+        jmp .end
+
+.i_32:
+        sub edi, esi
+        neg eax
+        lea edx, [eax + eax * 8 + .jumper_0]
+        inc edx
+        mov eax, [esp + 28]
+        xor ebp, ebp
+        jmp edx
+
+        mov ecx, [eax + 124]
+        imul ecx, [esi - 128]
+        add ebp, ecx
+        mov ecx, [eax + 120]
+        imul ecx, [esi - 124]
+        add ebp, ecx
+        mov ecx, [eax + 116]
+        imul ecx, [esi - 120]
+        add ebp, ecx
+        mov ecx, [eax + 112]
+        imul ecx, [esi - 116]
+        add ebp, ecx
+        mov ecx, [eax + 108]
+        imul ecx, [esi - 112]
+        add ebp, ecx
+        mov ecx, [eax + 104]
+        imul ecx, [esi - 108]
+        add ebp, ecx
+        mov ecx, [eax + 100]
+        imul ecx, [esi - 104]
+        add ebp, ecx
+        mov ecx, [eax + 96]
+        imul ecx, [esi - 100]
+        add ebp, ecx
+        mov ecx, [eax + 92]
+        imul ecx, [esi - 96]
+        add ebp, ecx
+        mov ecx, [eax + 88]
+        imul ecx, [esi - 92]
+        add ebp, ecx
+        mov ecx, [eax + 84]
+        imul ecx, [esi - 88]
+        add ebp, ecx
+        mov ecx, [eax + 80]
+        imul ecx, [esi - 84]
+        add ebp, ecx
+        mov ecx, [eax + 76]
+        imul ecx, [esi - 80]
+        add ebp, ecx
+        mov ecx, [eax + 72]
+        imul ecx, [esi - 76]
+        add ebp, ecx
+        mov ecx, [eax + 68]
+        imul ecx, [esi - 72]
+        add ebp, ecx
+        mov ecx, [eax + 64]
+        imul ecx, [esi - 68]
+        add ebp, ecx
+        mov ecx, [eax + 60]
+        imul ecx, [esi - 64]
+        add ebp, ecx
+        mov ecx, [eax + 56]
+        imul ecx, [esi - 60]
+        add ebp, ecx
+        mov ecx, [eax + 52]
+        imul ecx, [esi - 56]
+        add ebp, ecx
+        mov ecx, [eax + 48]
+        imul ecx, [esi - 52]
+        add ebp, ecx
+        mov ecx, [eax + 44]
+        imul ecx, [esi - 48]
+        add ebp, ecx
+        mov ecx, [eax + 40]
+        imul ecx, [esi - 44]
+        add ebp, ecx
+        mov ecx, [eax + 36]
+        imul ecx, [esi - 40]
+        add ebp, ecx
+        mov ecx, [eax + 32]
+        imul ecx, [esi - 36]
+        add ebp, ecx
+        mov ecx, [eax + 28]
+        imul ecx, [esi - 32]
+        add ebp, ecx
+        mov ecx, [eax + 24]
+        imul ecx, [esi - 28]
+        add ebp, ecx
+        mov ecx, [eax + 20]
+        imul ecx, [esi - 24]
+        add ebp, ecx
+        mov ecx, [eax + 16]
+        imul ecx, [esi - 20]
+        add ebp, ecx
+        mov ecx, [eax + 12]
+        imul ecx, [esi - 16]
+        add ebp, ecx
+        mov ecx, [eax + 8]
+        imul ecx, [esi - 12]
+        add ebp, ecx
+        mov ecx, [eax + 4]
+        imul ecx, [esi - 8]
+        add ebp, ecx
+        mov ecx, [eax]        ; there is one byte missing: [eax] needs no displacement byte, hence the 'inc edx' above
+        imul ecx, [esi - 4]
+        add ebp, ecx
+.jumper_0:
+
+        mov cl, [esp + 36]
+        sar ebp, cl
+        neg ebp
+        add ebp, [esi]
+        mov [edi + esi], ebp
+        add esi, byte 4
+
+        dec ebx
+        jz short .end
+        xor ebp, ebp
+        jmp edx
+
+.end:
+        pop edi
+        pop esi
+        pop ebx
+        pop ebp
+        ret
+
+; WATCHOUT: this routine works on 16 bit data which means bits-per-sample for
+; the channel must be <= 16.  Especially note that this routine cannot be used
+; for side-channel coded 16bps channels since the effective bps is 17.
+        ALIGN 16
+cident FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386_mmx:
+        ;[esp + 40]        residual[]
+        ;[esp + 36]        lp_quantization
+        ;[esp + 32]        order
+        ;[esp + 28]        qlp_coeff[]
+        ;[esp + 24]        data_len
+        ;[esp + 20]        data[]
+
+        push ebp
+        push ebx
+        push esi
+        push edi
+
+        mov esi, [esp + 20]
+        mov edi, [esp + 40]
+        mov eax, [esp + 32]
+        mov ebx, [esp + 24]
+
+        dec ebx
+        test ebx, ebx
+        jz near .last_one
+
+        mov edx, [esp + 28]
+        movd mm6, [esp + 36]
+        mov ebp, esp
+
+        and esp, 0xfffffff8
+
+        xor ecx, ecx
+.copy_qlp_loop:
+        push word [edx + 4 * ecx]
+        inc ecx
+        cmp ecx, eax
+        jnz short .copy_qlp_loop
+
+        and ecx, 0x3
+        test ecx, ecx
+        je short .za_end
+        sub ecx, byte 4
+.za_loop:
+        push word 0
+        inc eax
+        inc ecx
+        jnz short .za_loop
+.za_end:
+
+        movq mm5, [esp + 2 * eax - 8]
+        movd mm4, [esi - 16]
+        punpckldq mm4, [esi - 12]
+        movd mm0, [esi - 8]
+        punpckldq mm0, [esi - 4]
+        packssdw mm4, mm0
+
+        cmp eax, byte 4
+        jnbe short .mmx_4more
+
+        align 16
+.mmx_4_loop_i:
+        movd mm1, [esi]
+        movq mm3, mm4
+        punpckldq mm1, [esi + 4]
+        psrlq mm4, 16
+        movq mm0, mm1
+        psllq mm0, 48
+        por mm4, mm0
+        movq mm2, mm4
+        psrlq mm4, 16
+        pxor mm0, mm0
+        punpckhdq mm0, mm1
+        pmaddwd mm3, mm5
+        pmaddwd mm2, mm5
+        psllq mm0, 16
+        por mm4, mm0
+        movq mm0, mm3
+        punpckldq mm3, mm2
+        punpckhdq mm0, mm2
+        paddd mm3, mm0
+        psrad mm3, mm6
+        psubd mm1, mm3
+        movd [edi], mm1
+        punpckhdq mm1, mm1
+        movd [edi + 4], mm1
+
+        add edi, byte 8
+        add esi, byte 8
+
+        sub ebx, 2
+        jg .mmx_4_loop_i
+        jmp .mmx_end
+
+.mmx_4more:
+        shl eax, 2
+        neg eax
+        add eax, byte 16
+
+        align 16
+.mmx_4more_loop_i:
+        movd mm1, [esi]
+        punpckldq mm1, [esi + 4]
+        movq mm3, mm4
+        psrlq mm4, 16
+        movq mm0, mm1
+        psllq mm0, 48
+        por mm4, mm0
+        movq mm2, mm4
+        psrlq mm4, 16
+        pxor mm0, mm0
+        punpckhdq mm0, mm1
+        pmaddwd mm3, mm5
+        pmaddwd mm2, mm5
+        psllq mm0, 16
+        por mm4, mm0
+
+        mov ecx, esi
+        add ecx, eax
+        mov edx, esp
+
+        align 16
+.mmx_4more_loop_j:
+        movd mm0, [ecx - 16]
+        movd mm7, [ecx - 8]
+        punpckldq mm0, [ecx - 12]
+        punpckldq mm7, [ecx - 4]
+        packssdw mm0, mm7
+        pmaddwd mm0, [edx]
+        punpckhdq mm7, mm7
+        paddd mm3, mm0
+        movd mm0, [ecx - 12]
+        punpckldq mm0, [ecx - 8]
+        punpckldq mm7, [ecx]
+        packssdw mm0, mm7
+        pmaddwd mm0, [edx]
+        paddd mm2, mm0
+
+        add edx, byte 8
+        add ecx, byte 16
+        cmp ecx, esi
+        jnz .mmx_4more_loop_j
+
+        movq mm0, mm3
+        punpckldq mm3, mm2
+        punpckhdq mm0, mm2
+        paddd mm3, mm0
+        psrad mm3, mm6
+        psubd mm1, mm3
+        movd [edi], mm1
+        punpckhdq mm1, mm1
+        movd [edi + 4], mm1
+
+        add edi, byte 8
+        add esi, byte 8
+
+        sub ebx, 2
+        jg near .mmx_4more_loop_i
+
+.mmx_end:
+        emms
+        mov esp, ebp
+.last_one:
+        mov eax, [esp + 32]
+        inc ebx
+        jnz near FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386.begin
+
+        pop edi
+        pop esi
+        pop ebx
+        pop ebp
+        ret
+
 ; **********************************************************************
 ;
 ; void FLAC__lpc_restore_signal(const int32 residual[], unsigned data_len, const int32 qlp_coeff[], unsigned order, int lp_quantization, int32 data[])
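
For cross-checking the assembly, the loop in the header comment of the i386 routine above expands into the following portable C. This is a minimal sketch, not code from the patch: the int32 typedef and the function name are stand-ins, signed loop indices are used deliberately because the inner loop reads up to 'order' samples before data[0] (the caller must supply that warm-up history), and the right shift of a negative sum is assumed to be arithmetic, matching the routine's use of sar.

        #include <stdint.h>

        typedef int32_t int32;  /* stand-in for the int32 type in the prototype above */

        /* Reference version of the residual computation, following the header
         * comment of FLAC__lpc_compute_residual_from_qlp_coefficients_asm_i386.
         * data[] must be preceded by at least 'order' warm-up samples, since
         * data[i - j - 1] reaches before data[0] for the first iterations. */
        static void lpc_compute_residual_ref(const int32 data[], unsigned data_len,
                                             const int32 qlp_coeff[], unsigned order,
                                             int lp_quantization, int32 residual[])
        {
                int i, j;
                int32 sum;

                for(i = 0; i < (int)data_len; i++) {
                        sum = 0;
                        for(j = 0; j < (int)order; j++)
                                sum += qlp_coeff[j] * data[i - j - 1];
                        /* arithmetic shift, as performed by 'sar' in the assembly */
                        residual[i] = data[i] - (sum >> lp_quantization);
                }
        }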
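
The WATCHOUT restriction on the MMX routine follows from its use of pmaddwd, which multiplies packed signed 16-bit words: the coefficients (pushed onto the stack as 16-bit words) and the samples (narrowed with packssdw, which saturates to [-32768, 32767]) must already fit in 16 bits, or the products stop matching the 32-bit reference computation. A side channel of a 16-bps stereo stream needs 17 bits of range, which is why the comment rules that case out. As a hedged illustration of the property involved (the helper name is hypothetical, not from libFLAC), a caller-side guard could look like this:

        #include <stdint.h>

        /* Hypothetical guard, not from the patch: nonzero if every value would
         * survive the 16-bit narrowing the MMX routine applies via packssdw. */
        static int fits_in_16_bits(const int32_t v[], unsigned n)
        {
                unsigned i;
                for(i = 0; i < n; i++)
                        if(v[i] < -32768 || v[i] > 32767)
                                return 0;
                return 1;
        }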