const unsigned FLAC__CPUINFO_IA32_CPUID_SSE = 0x02000000;
const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2 = 0x04000000;
+const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW = 0x80000000; /* bit 31 of the value returned by FLAC__cpu_info_extended_amd_asm_ia32() */
+const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW = 0x40000000; /* bit 30 of the same value */
+const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX = 0x00400000; /* bit 22 of the same value */
+
void FLAC__cpu_info(FLAC__CPUInfo *info)
{
info->data.ia32.fxsr = (cpuid & FLAC__CPUINFO_IA32_CPUID_FXSR)? true : false;
info->data.ia32.sse = (cpuid & FLAC__CPUINFO_IA32_CPUID_SSE)? true : false; /* @@@ also need to check for operating system support */
info->data.ia32.sse2 = (cpuid & FLAC__CPUINFO_IA32_CPUID_SSE2)? true : false; /* @@@ also need to check for operating system support */
+
+ cpuid = FLAC__cpu_info_extended_amd_asm_ia32(); /* reuse 'cpuid' for the extended flags: EDX of CPUID leaf 0x80000001, or 0 when that leaf or CPUID itself is unavailable */
+ info->data.ia32._3dnow = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW)? true : false; /* a zero return above leaves all three extended flags false */
+ info->data.ia32.ext3dnow = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW)? true : false;
+ info->data.ia32.extmmx = (cpuid & FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX)? true : false;
}
#else
info->use_asm = false;
data_section
cglobal FLAC__cpu_info_asm_ia32
+cglobal FLAC__cpu_info_extended_amd_asm_ia32
code_section
; **********************************************************************
;
- ALIGN 16
-cident FLAC__cpu_info_asm_ia32
-
- push ebx
+; have_cpuid: file-local helper that probes for the CPUID instruction by trying
+; to toggle the ID flag (bit 21, mask 0x00200000) in EFLAGS.
+; Returns eax = 1 if the bit could be changed (CPUID present), eax = 0 otherwise.
+; Clobbers edx. The stack is balanced on return.
+have_cpuid:
pushfd
pop eax ; eax = current EFLAGS
mov edx, eax ; edx = original EFLAGS, kept for the comparison below
- xor eax, 00200000h
+ xor eax, 0x00200000 ; flip the ID bit
push eax
popfd ; try to load EFLAGS with the ID bit flipped
pushfd
pop eax ; read EFLAGS back; this also rebalances the stack so 'ret' sees the return address
cmp eax, edx ; unchanged => the ID bit is not writable => no CPUID
jz .no_cpuid
mov eax, 1
+ jmp .end
+.no_cpuid:
+ xor eax, eax
+.end:
+ ret
+
+; unsigned FLAC__cpu_info_asm_ia32()
+; Returns in eax the EDX value produced by CPUID with eax = 1,
+; or 0 when have_cpuid reports that CPUID is not available.
+cident FLAC__cpu_info_asm_ia32
+ push ebx ; cpuid overwrites ebx, so preserve it for the caller
+ call have_cpuid
+ test eax, eax
+ jz .no_cpuid
+ mov eax, 1 ; CPUID leaf 1
cpuid
mov eax, edx ; return value = EDX
- jmp short .end
+ jmp .end
.no_cpuid:
- xor eax, eax ; return 0
-.end:
+ xor eax, eax ; return 0
+.end:
+ pop ebx
+ ret
+
+; unsigned FLAC__cpu_info_extended_amd_asm_ia32()
+; Returns in eax the EDX value produced by CPUID with eax = 0x80000001.
+; Returns 0 when CPUID is not available, or when the maximum extended leaf
+; reported by CPUID 0x80000000 is below 0x80000001.
+cident FLAC__cpu_info_extended_amd_asm_ia32
+ push ebx ; cpuid overwrites ebx, so preserve it for the caller
+ call have_cpuid
+ test eax, eax
+ jz .no_cpuid
+ mov eax, 0x80000000 ; query the highest supported extended leaf
+ cpuid
+ cmp eax, 0x80000001
+ jb .no_cpuid ; unsigned compare: leaf 0x80000001 is not implemented
+ mov eax, 0x80000001
+ cpuid
+ mov eax, edx ; return value = EDX
+ jmp .end
+.no_cpuid:
+ xor eax, eax ; return 0
+.end:
pop ebx
ret
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8
cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12
+cglobal FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32
cglobal FLAC__lpc_compute_residual_from_qlp_coefficients_asm_ia32_mmx
cglobal FLAC__lpc_restore_signal_asm_ia32
.end:
ret
+ align 16
+cident FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow ; accumulates sums of data[i] * data[i + j] for j = 0 .. lag-1 with 3DNow! packed floats, then copies them to autoc[]
+ ;[ebp + 32] autoc (output; lag dwords are stored by .loop3)
+ ;[ebp + 28] lag
+ ;[ebp + 24] data_len
+ ;[ebp + 20] data (offsets account for the return address plus the four registers pushed below)
+
+ push ebp
+ push ebx
+ push esi
+ push edi
+ mov ebp, esp ; ebp = frame base; esp is restored from it before returning
+
+ mov esi, [ebp + 20] ; esi = data
+ mov edi, [ebp + 24] ; edi = data_len
+ mov edx, [ebp + 28] ; edx = lag
+ mov eax, edx
+ neg eax
+ and esp, byte -8 ; round esp down to a multiple of 8
+ lea esp, [esp + 4 * eax] ; reserve lag dwords on the stack as the accumulator array
+ mov ecx, edx
+ xor eax, eax
+.loop0: ; zero the lag accumulators (NOTE(review): assumes lag > 0 - confirm with callers)
+ dec ecx
+ mov [esp + 4 * ecx], eax ; mov leaves the flags from dec intact
+ jnz short .loop0
+
+ mov eax, edi
+ sub eax, edx
+ mov ebx, edx
+ and ebx, byte 1
+ sub eax, ebx ; eax = data_len - lag - (lag & 1)
+ lea ecx, [esi + 4 * eax - 12] ; ecx = end bound for the 4-samples-per-pass loop
+ cmp esi, ecx
+ mov eax, esi ; eax = pointer to the current sample
+ ja short .loop2_pre ; data is already past the bound (unsigned): go straight to the scalar loop
+ align 16 ;8 nops
+.loop1_i:
+ movd mm0, [eax] ; load four consecutive samples ...
+ movd mm2, [eax + 4]
+ movd mm4, [eax + 8]
+ movd mm6, [eax + 12]
+ mov ebx, edx ; ebx = lag, counted down by 2 in .loop1_j
+ punpckldq mm0, mm0 ; ... and duplicate each into both halves of its register
+ punpckldq mm2, mm2
+ punpckldq mm4, mm4
+ punpckldq mm6, mm6
+ align 16 ;3 nops
+.loop1_j:
+ sub ebx, byte 2 ; sets the flags tested by jg at the bottom of this loop; NOTE(review): for odd lag ebx reaches -1, so [eax - 4] and [esp - 4] are accessed - confirm intended
+ movd mm1, [eax + 4 * ebx]
+ movd mm3, [eax + 4 * ebx + 4]
+ movd mm5, [eax + 4 * ebx + 8]
+ movd mm7, [eax + 4 * ebx + 12]
+ punpckldq mm1, mm3 ; build overlapping pairs of lagged samples
+ punpckldq mm3, mm5
+ pfmul mm1, mm0
+ punpckldq mm5, mm7
+ pfmul mm3, mm2
+ punpckldq mm7, [eax + 4 * ebx + 16]
+ pfmul mm5, mm4
+ pfmul mm7, mm6
+ pfadd mm1, mm3
+ movq mm3, [esp + 4 * ebx] ; load the two accumulators at indices ebx and ebx + 1
+ pfadd mm5, mm7
+ pfadd mm1, mm5 ; sum of the four products for each of the two lags
+ pfadd mm3, mm1
+ movq [esp + 4 * ebx], mm3 ; store both accumulators back
+ jg short .loop1_j ; continue while ebx > 0 (signed)
+
+ add eax, byte 16 ; advance by four samples
+ cmp eax, ecx
+ jb short .loop1_i
+
+.loop2_pre:
+ mov ebx, eax
+ sub eax, esi
+ shr eax, 2 ; eax = index of the first sample not handled above
+ lea ecx, [esi + 4 * edi] ; ecx = data + 4 * data_len (end of data)
+ mov esi, ebx ; esi = pointer to the current sample
+.loop2_i:
+ movd mm0, [esi]
+ mov ebx, edi
+ sub ebx, eax ; ebx = data_len - current index
+ cmp ebx, edx
+ jbe short .loop2_j ; keep ebx when it is <= lag ...
+ mov ebx, edx ; ... otherwise clamp it to lag
+.loop2_j:
+ dec ebx ; sets the flags tested by jnz below
+ movd mm1, [esi + 4 * ebx]
+ pfmul mm1, mm0
+ movd mm2, [esp + 4 * ebx]
+ pfadd mm1, mm2
+ movd [esp + 4 * ebx], mm1 ; accumulator[ebx] += current sample * sample ebx positions later
+
+ jnz short .loop2_j
+
+ add esi, byte 4
+ inc eax
+ cmp esi, ecx
+ jnz short .loop2_i ; until the end of data is reached
+
+ mov edi, [ebp + 32] ; edi = autoc
+.loop3: ; copy the lag accumulators from the stack to autoc[]
+ dec edx
+ mov eax, [esp + 4 * edx]
+ mov [edi + 4 * edx], eax
+ jnz short .loop3
+
+ femms ; leave MMX/3DNow! state before returning to the caller
+
+ mov esp, ebp ; discard the accumulator array
+ pop edi
+ pop esi
+ pop ebx
+ pop ebp
+ ret
+
;void FLAC__lpc_compute_residual_from_qlp_coefficients(const FLAC__int32 data[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
;
; for(i = 0; i < data_len; i++) {
FLAC__bool fxsr;
FLAC__bool sse;
FLAC__bool sse2;
+ FLAC__bool _3dnow;
+ FLAC__bool ext3dnow;
+ FLAC__bool extmmx;
} FLAC__CPUInfo_IA32;
+
extern const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_MMX;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_SSE;
extern const unsigned FLAC__CPUINFO_IA32_CPUID_SSE2;
+extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_3DNOW;
+extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXT3DNOW;
+extern const unsigned FLAC__CPUINFO_IA32_CPUID_EXTENDED_AMD_EXTMMX;
+
typedef struct {
FLAC__bool use_asm;
FLAC__CPUInfo_Type type;
#ifdef FLAC__CPU_IA32
#ifdef FLAC__HAS_NASM
unsigned FLAC__cpu_info_asm_ia32();
+unsigned FLAC__cpu_info_extended_amd_asm_ia32();
#endif
#endif
#endif
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_8(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
void FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_12(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
+void FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow(const FLAC__real data[], unsigned data_len, unsigned lag, FLAC__real autoc[]);
#endif
#endif
#endif
#ifdef FLAC__CPU_IA32
FLAC__ASSERT(encoder->private->cpuinfo.type == FLAC__CPUINFO_TYPE_IA32);
#ifdef FLAC__HAS_NASM
- if(0 && encoder->private->cpuinfo.data.ia32.sse) { /* SSE version lacks necessary resolution, plus SSE flag doesn't check for OS support */
+ if(0 && encoder->private->cpuinfo.data.ia32.sse) { /*@@@ SSE version lacks necessary resolution, plus SSE flag doesn't check for OS support */
if(encoder->protected->max_lpc_order < 4)
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_sse_lag_4;
else if(encoder->protected->max_lpc_order < 8)
else
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
}
+ else if(0 && encoder->private->cpuinfo.data.ia32._3dnow) /*@@@ turn back on in first beta after 1.0 */
+ encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32_3dnow;
else
encoder->private->local_lpc_compute_autocorrelation = FLAC__lpc_compute_autocorrelation_asm_ia32;
if(encoder->private->cpuinfo.data.ia32.mmx && encoder->private->cpuinfo.data.ia32.cmov)