From: Josh Coalson Date: Thu, 19 Sep 2002 05:19:14 +0000 (+0000) Subject: check in Miroslav's speedup patch X-Git-Tag: 1.2.0~1502 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=9945f8e44a8626f9d27d58d0ea55a11ed3e07e30;p=platform%2Fupstream%2Fflac.git check in Miroslav's speedup patch --- diff --git a/src/libFLAC/ia32/fixed_asm.nasm b/src/libFLAC/ia32/fixed_asm.nasm index 93a8413..a6e9b4c 100644 --- a/src/libFLAC/ia32/fixed_asm.nasm +++ b/src/libFLAC/ia32/fixed_asm.nasm @@ -76,107 +76,73 @@ cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov push edi sub esp, byte 16 ; qword [esp] == temp space for loading FLAC__uint64s to FPU regs - ; dword [esp] == last_error_0 - ; dword [esp + 4] == last_error_1 - ; dword [esp + 8] == last_error_2 - ; dword [esp + 12] == last_error_3 - ; eax == error ; ebx == &data[i] ; ecx == loop counter (i) - ; edx == temp - ; edi == save ; ebp == order ; mm0 == total_error_1:total_error_0 - ; mm1 == total_error_3:total_error_2 - ; mm2 == 0:total_error_4 - ; mm3/4 == 0:unpackarea - ; mm5 == abs(error_1):abs(error_0) - ; mm5 == abs(error_3):abs(error_2) + ; mm1 == total_error_2:total_error_3 + ; mm2 == :total_error_4 + ; mm3 == last_error_1:last_error_0 + ; mm4 == last_error_2:last_error_3 - pxor mm0, mm0 ; total_error_1 = total_error_0 = 0 - pxor mm1, mm1 ; total_error_3 = total_error_2 = 0 - pxor mm2, mm2 ; total_error_4 = 0 - mov ebx, [esp + 36] ; ebx = data[] - mov ecx, [ebx - 4] ; ecx == data[-1] last_error_0 = data[-1] - mov eax, [ebx - 8] ; eax == data[-2] - mov ebp, [ebx - 16] ; ebp == data[-4] - mov ebx, [ebx - 12] ; ebx == data[-3] - mov edx, ecx - sub edx, eax ; last_error_1 = data[-1] - data[-2] - mov esi, edx - sub esi, eax - add esi, ebx ; last_error_2 = last_error_1 - (data[-2] - data[-3]) - shl ebx, 1 - mov edi, esi - sub edi, eax - add edi, ebx - sub edi, ebp ; last_error_3 = last_error_2 - (data[-2] - 2*data[-3] + data[-4]); - mov ebx, [esp + 36] ; ebx = data[] - mov [esp], ecx ; [esp] = last_error_0 - mov [esp + 4], edx ; [esp + 4] = last_error_1 - mov [esp + 8], esi ; [esp + 8] = last_error_2 - mov [esp + 12], edi ; [esp + 12] = last_error_3 mov ecx, [esp + 40] ; ecx = data_len + test ecx, ecx + jz near .data_len_is_0 + + mov ebx, [esp + 36] ; ebx = data[] + movd mm3, [ebx - 4] ; mm3 = 0:last_error_0 + movd mm2, [ebx - 8] ; mm2 = 0:data[-2] + movd mm1, [ebx - 12] ; mm1 = 0:data[-3] + movd mm0, [ebx - 16] ; mm0 = 0:data[-4] + movq mm5, mm3 ; mm5 = 0:last_error_0 + psubd mm5, mm2 ; mm5 = 0:last_error_1 + punpckldq mm3, mm5 ; mm3 = last_error_1:last_error_0 + psubd mm2, mm1 ; mm2 = 0:data[-2] - data[-3] + psubd mm5, mm2 ; mm5 = 0:last_error_2 + movq mm4, mm5 ; mm4 = 0:last_error_2 + psubd mm4, mm2 ; mm4 = 0:last_error_2 - (data[-2] - data[-3]) + paddd mm4, mm1 ; mm4 = 0:last_error_2 - (data[-2] - 2 * data[-3]) + psubd mm4, mm0 ; mm4 = 0:last_error_3 + punpckldq mm4, mm5 ; mm4 = last_error_2:last_error_3 + pxor mm0, mm0 ; mm0 = total_error_1:total_error_0 + pxor mm1, mm1 ; mm1 = total_error_2:total_error_3 + pxor mm2, mm2 ; mm2 = 0:total_error_4 - ; for(i = 0; i < data_len; i++) { - ; error_0 = data[i] ; save = error_0; total_error_0 += local_abs(error_0); - ; error_1 -= last_error_0; last_error_0 = save; save = error_1; total_error_1 += local_abs(error_1); - ; error_2 -= last_error_1; last_error_1 = save; save = error_2; total_error_2 += local_abs(error_2); - ; error_3 -= last_error_2; last_error_2 = save; save = error_3; total_error_3 += local_abs(error_3); - ; error_4 -= last_error_3; last_error_3 = save; total_error_4 += local_abs(error_4); - ; } ALIGN 16 .loop: - mov eax, [ebx] ; eax = error_0 = data[i] - add ebx, 4 - mov edi, eax ; edi == save = error_0 - mov edx, eax ; edx = error_0 - neg edx ; edx = -error_0 - cmovns eax, edx ; eax = abs(error_0) - movd mm5, eax ; mm5 = 0:abs(error_0) - mov edx, [esp] ; edx = last_error_0 - mov eax, edi ; eax = error(error_0) - mov [esp], edi ; [esp] == last_error_0 = save - sub eax, edx ; error -= last_error_0 - mov edi, eax ; edi == save = error_1 - mov edx, eax ; edx = error_1 - neg edx ; edx = -error_1 - cmovns eax, edx ; eax = abs(error_1) - movd mm4, eax ; mm4 = 0:abs(error_1) - punpckldq mm5, mm4 ; mm5 = abs(error_1):abs(error_0) - mov edx, [esp + 4] ; edx = last_error_1 - mov eax, edi ; eax = error(error_1) - mov [esp + 4], edi ; [esp + 4] == last_error_1 = save - sub eax, edx ; error -= last_error_1 - mov edi, eax ; edi == save = error_2 - mov edx, eax ; edx = error_2 - paddd mm0, mm5 ; [CR] total_error_1 += abs(error_1) ; total_error_0 += abs(error_0) - neg edx ; edx = -error_2 - cmovns eax, edx ; eax = abs(error_2) - movd mm5, eax ; mm5 = 0:abs(error_2) - mov edx, [esp + 8] ; edx = last_error_2 - mov eax, edi ; eax = error(error_2) - mov [esp + 8], edi ; [esp + 8] == last_error_2 = save - sub eax, edx ; error -= last_error_2 - mov edi, eax ; edi == save = error_3 - mov edx, eax ; edx = error_3 - neg edx ; edx = -error_3 - cmovns eax, edx ; eax = abs(error_3) - movd mm4, eax ; mm4 = 0:abs(error_3) - punpckldq mm5, mm4 ; mm5 = abs(error_3):abs(error_2) - mov edx, [esp + 12] ; edx = last_error_3 - mov eax, edi ; eax = error(error_3) - mov [esp + 12], edi ; [esp + 12] == last_error_3 = save - sub eax, edx ; error -= last_error_3 - mov edx, eax ; edx = error_4 - paddd mm1, mm5 ; [CR] total_error_3 += abs(error_3) ; total_error_2 += abs(error_2) - neg edx ; edx = -error_4 - cmovns eax, edx ; eax = abs(error_4) - movd mm5, eax ; mm5 = 0:abs(error_4) - paddd mm2, mm5 ; total_error_4 += abs(error_4) + movd mm7, [ebx] ; mm7 = 0:error_0 + add ebx, byte 4 + movq mm6, mm7 ; mm6 = 0:error_0 + psubd mm7, mm3 ; mm7 = :error_1 + punpckldq mm6, mm7 ; mm6 = error_1:error_0 + movq mm5, mm6 ; mm5 = error_1:error_0 + movq mm7, mm6 ; mm7 = error_1:error_0 + psubd mm5, mm3 ; mm5 = error_2: + movq mm3, mm6 ; mm3 = error_1:error_0 + psrad mm6, 31 + pxor mm7, mm6 + psubd mm7, mm6 ; mm7 = abs(error_1):abs(error_0) + paddd mm0, mm7 ; mm0 = total_error_1:total_error_0 + movq mm6, mm5 ; mm6 = error_2: + psubd mm5, mm4 ; mm5 = error_3: + punpckhdq mm5, mm6 ; mm5 = error_2:error_3 + movq mm7, mm5 ; mm7 = error_2:error_3 + movq mm6, mm5 ; mm6 = error_2:error_3 + psubd mm5, mm4 ; mm5 = :error_4 + movq mm4, mm6 ; mm4 = error_2:error_3 + psrad mm6, 31 + pxor mm7, mm6 + psubd mm7, mm6 ; mm7 = abs(error_2):abs(error_3) + paddd mm1, mm7 ; mm1 = total_error_2:total_error_3 + movq mm6, mm5 ; mm6 = :error_4 + psrad mm5, 31 + pxor mm6, mm5 + psubd mm6, mm5 ; mm6 = :abs(error_4) + paddd mm2, mm6 ; mm2 = :total_error_4 + dec ecx - jnz near .loop + jnz short .loop ; if(total_error_0 < min(min(min(total_error_1, total_error_2), total_error_3), total_error_4)) ; order = 0; @@ -188,56 +154,42 @@ cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov ; order = 3; ; else ; order = 4; + movq mm3, mm0 ; mm3 = total_error_1:total_error_0 movd edi, mm2 ; edi = total_error_4 - movq mm4, mm1 ; mm4 = total_error_3:total_error_2 - psrlq mm4, 32 ; mm4 = 0:total_error_3 + movd esi, mm1 ; esi = total_error_3 + movd eax, mm0 ; eax = total_error_0 + punpckhdq mm1, mm1 ; mm1 = total_error_2:total_error_2 + punpckhdq mm3, mm3 ; mm3 = total_error_1:total_error_1 movd edx, mm1 ; edx = total_error_2 - movd esi, mm4 ; esi = total_error_3 - movq mm3, mm0 ; mm3 = total_error_1:total_error_0 - psrlq mm3, 32 ; mm3 = 0:total_error_1 - movd ebx, mm0 ; ebx = total_error_0 movd ecx, mm3 ; ecx = total_error_1 - emms - mov eax, ebx ; eax = total_error_0 - cmp ecx, ebx + + xor ebx, ebx + xor ebp, ebp + inc ebx + cmp ecx, eax cmovb eax, ecx ; eax = min(total_error_0, total_error_1) + cmovbe ebp, ebx + inc ebx cmp edx, eax cmovb eax, edx ; eax = min(total_error_0, total_error_1, total_error_2) + cmovbe ebp, ebx + inc ebx cmp esi, eax cmovb eax, esi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3) + cmovbe ebp, ebx + inc ebx cmp edi, eax cmovb eax, edi ; eax = min(total_error_0, total_error_1, total_error_2, total_error_3, total_error_4) + cmovbe ebp, ebx + movd ebx, mm0 ; ebx = total_error_0 + emms - cmp eax, ebx - jne .not_order_0 - xor ebp, ebp - jmp short .got_order -.not_order_0: - cmp eax, ecx - jne .not_order_1 - mov ebp, 1 - jmp short .got_order -.not_order_1: - cmp eax, edx - jne .not_order_2 - mov ebp, 2 - jmp short .got_order -.not_order_2: - cmp eax, esi - jne .not_order_3 - mov ebp, 3 - jmp short .got_order -.not_order_3: - mov ebp, 4 -.got_order: ; residual_bits_per_sample[0] = (FLAC__real)((data_len > 0 && total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[1] = (FLAC__real)((data_len > 0 && total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[2] = (FLAC__real)((data_len > 0 && total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[3] = (FLAC__real)((data_len > 0 && total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); ; residual_bits_per_sample[4] = (FLAC__real)((data_len > 0 && total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0); xor eax, eax - cmp eax, [esp + 40] - je near .data_len_is_0 fild dword [esp + 40] ; ST = data_len (NOTE: assumes data_len is <2gigs) .rbps_0: test ebx, ebx @@ -321,9 +273,14 @@ cident FLAC__fixed_compute_best_predictor_asm_ia32_mmx_cmov jmp short .end .data_len_is_0: ; data_len == 0, so residual_bits_per_sample[*] = 0.0 - mov ecx, 5 ; eax still == 0, ecx = # of dwords of 0 to store + xor ebp, ebp mov edi, [esp + 44] - rep stosd + mov [edi], ebp + mov [edi + 4], ebp + mov [edi + 8], ebp + mov [edi + 12], ebp + mov [edi + 16], ebp + add ebp, byte 4 ; order = 4 .end: mov eax, ebp ; return order