#define FF_MM_MMX2 0x0002 ///< SSE integer functions or AMD MMX extensions
#define FF_MM_SSE 0x0008 ///< SSE functions
#define FF_MM_SSE2 0x0010 ///< PIV SSE2 functions
+#define FF_MM_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
+ ///< than regular MMX/SSE (e.g. Pentium M, Core1);
+ ///< reported in place of FF_MM_SSE2 on such CPUs
#define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
#define FF_MM_SSE3 0x0040 ///< Prescott SSE3 functions
+#define FF_MM_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
+ ///< than regular MMX/SSE (e.g. Core1);
+ ///< reported in place of FF_MM_SSE3 on such CPUs
#define FF_MM_SSSE3 0x0080 ///< Conroe SSSE3 functions
#define FF_MM_SSE4 0x0100 ///< Penryn SSE4.1 functions
#define FF_MM_SSE42 0x0200 ///< Nehalem SSE4.2 functions
int rval = 0;
int eax, ebx, ecx, edx;
int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
+ int family=0, model=0;
+ union { int i[3]; char c[12]; } vendor;
#if ARCH_X86_32
x86_reg a, c;
return 0; /* CPUID not supported */
#endif
- cpuid(0, max_std_level, ebx, ecx, edx);
+ cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);
if(max_std_level >= 1){
cpuid(1, eax, ebx, ecx, std_caps);
+ family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+ model = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
if (std_caps & (1<<23))
rval |= FF_MM_MMX;
if (std_caps & (1<<25))
rval |= FF_MM_MMX2;
}
+ if (!strncmp(vendor.c, "GenuineIntel", 12) &&
+ family == 6 && (model == 9 || model == 13 || model == 14)) {
+ /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+ * theoretically support sse2 (and sse3, where reported), but it's usually
+ * slower than mmx, so replace the regular flags with their "slow" variants. */
+ if (rval & FF_MM_SSE2) rval ^= FF_MM_SSE2SLOW|FF_MM_SSE2;
+ if (rval & FF_MM_SSE3) rval ^= FF_MM_SSE3SLOW|FF_MM_SSE3;
+ }
+
#if 0
- av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n",
+ av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s%s%s\n",
(rval&FF_MM_MMX) ? "MMX ":"",
(rval&FF_MM_MMX2) ? "MMX2 ":"",
(rval&FF_MM_SSE) ? "SSE ":"",
(rval&FF_MM_SSE2) ? "SSE2 ":"",
+ (rval&FF_MM_SSE2SLOW) ? "SSE2(slow) ":"",
(rval&FF_MM_SSE3) ? "SSE3 ":"",
+ (rval&FF_MM_SSE3SLOW) ? "SSE3(slow) ":"",
(rval&FF_MM_SSSE3) ? "SSSE3 ":"",
(rval&FF_MM_SSE4) ? "SSE4.1 ":"",
(rval&FF_MM_SSE42) ? "SSE4.2 ":"",
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
- if (mm_flags & FF_MM_SSE2) {
+ if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
- c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
+ }
+
+ if (mm_flags & FF_MM_SSE2) {
+ c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
}