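Remove FF_MM_SSE2/3 flags for CPUs where this is generally not faster than regular MMX/SSE (e.g. Core1), and set the new FF_MM_SSE2SLOW/FF_MM_SSE3SLOW flags on those CPUs instead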

author Ronald S. Bultje <rsbultje@gmail.com>

Mon, 19 Jul 2010 22:38:23 +0000 (22:38 +0000)

committer Ronald S. Bultje <rsbultje@gmail.com>

Mon, 19 Jul 2010 22:38:23 +0000 (22:38 +0000)
author Ronald S. Bultje <rsbultje@gmail.com>
Mon, 19 Jul 2010 22:38:23 +0000 (22:38 +0000)
committer Ronald S. Bultje <rsbultje@gmail.com>
Mon, 19 Jul 2010 22:38:23 +0000 (22:38 +0000)
diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h

index ff29576..560d575 100644 (file)
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -1656,8 +1656,12 @@ typedef struct AVCodecContext {
  #define FF_MM_MMX2     0x0002 ///< SSE integer functions or AMD MMX ext
  #define FF_MM_SSE      0x0008 ///< SSE functions
  #define FF_MM_SSE2     0x0010 ///< PIV SSE2 functions
+#define FF_MM_SSE2SLOW 0x40000000 ///< SSE2 supported, but usually not faster
+                                  ///< than regular MMX/SSE (e.g. Core1)
  #define FF_MM_3DNOWEXT 0x0020 ///< AMD 3DNowExt
  #define FF_MM_SSE3     0x0040 ///< Prescott SSE3 functions
+#define FF_MM_SSE3SLOW 0x20000000 ///< SSE3 supported, but usually not faster
+                                  ///< than regular MMX/SSE (e.g. Core1)
  #define FF_MM_SSSE3    0x0080 ///< Conroe SSSE3 functions
  #define FF_MM_SSE4     0x0100 ///< Penryn SSE4.1 functions
  #define FF_MM_SSE42    0x0200 ///< Nehalem SSE4.2 functions
diff --git a/libavcodec/x86/cpuid.c b/libavcodec/x86/cpuid.c

index 1ed4d2e..f9afd6e 100644 (file)
--- a/libavcodec/x86/cpuid.c
+++ b/libavcodec/x86/cpuid.c
@@ -42,6 +42,8 @@ int mm_support(void)
      int rval = 0;
      int eax, ebx, ecx, edx;
      int max_std_level, max_ext_level, std_caps=0, ext_caps=0;
+    int family=0, model=0;
+    union { int i[3]; char c[12]; } vendor;
  
  #if ARCH_X86_32
      x86_reg a, c;
@@ -70,10 +72,12 @@ int mm_support(void)
          return 0; /* CPUID not supported */
  #endif
  
-    cpuid(0, max_std_level, ebx, ecx, edx);
+    cpuid(0, max_std_level, vendor.i[0], vendor.i[2], vendor.i[1]);
  
      if(max_std_level >= 1){
          cpuid(1, eax, ebx, ecx, std_caps);
+        family = ((eax>>8)&0xf) + ((eax>>20)&0xff);
+        model  = ((eax>>4)&0xf) + ((eax>>12)&0xf0);
          if (std_caps & (1<<23))
              rval |= FF_MM_MMX;
          if (std_caps & (1<<25))
@@ -108,13 +112,24 @@ int mm_support(void)
              rval |= FF_MM_MMX2;
      }
  
+    if (!strncmp(vendor.c, "GenuineIntel", 12) &&
+        family == 6 && (model == 9 || model == 13 || model == 14)) {
+        /* 6/9 (pentium-m "banias"), 6/13 (pentium-m "dothan"), and 6/14 (core1 "yonah")
+         * theoretically support sse2, but it's usually slower than mmx,
+         * so let's just pretend they don't. */
+        if (rval & FF_MM_SSE2) rval ^= FF_MM_SSE2SLOW|FF_MM_SSE2;
+        if (rval & FF_MM_SSE3) rval ^= FF_MM_SSE3SLOW|FF_MM_SSE3;
+    }
+
  #if 0
-    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s\n",
+    av_log(NULL, AV_LOG_DEBUG, "%s%s%s%s%s%s%s%s%s%s%s%s\n",
          (rval&FF_MM_MMX) ? "MMX ":"",
          (rval&FF_MM_MMX2) ? "MMX2 ":"",
          (rval&FF_MM_SSE) ? "SSE ":"",
          (rval&FF_MM_SSE2) ? "SSE2 ":"",
+        (rval&FF_MM_SSE2SLOW) ? "SSE2(slow) ":"",
          (rval&FF_MM_SSE3) ? "SSE3 ":"",
+        (rval&FF_MM_SSE3SLOW) ? "SSE3(slow) ":"",
          (rval&FF_MM_SSSE3) ? "SSSE3 ":"",
          (rval&FF_MM_SSE4) ? "SSE4.1 ":"",
          (rval&FF_MM_SSE42) ? "SSE4.2 ":"",
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c

index f491111..d3e412a 100644 (file)
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -1409,9 +1409,10 @@ void dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
              c->sum_abs_dctelem= sum_abs_dctelem_sse2;
              c->hadamard8_diff[0]= hadamard8_diff16_sse2;
              c->hadamard8_diff[1]= hadamard8_diff_sse2;
-#if CONFIG_LPC
+        }
+
+        if (CONFIG_LPC && mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
              c->lpc_compute_autocorr = ff_lpc_compute_autocorr_sse2;
-#endif
          }
  
  #if HAVE_SSSE3
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c

index c7b02d1..f8de2d2 100644 (file)
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -328,7 +328,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
          c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
      }
  
-    if (mm_flags & FF_MM_SSE2) {
+    if (mm_flags & (FF_MM_SSE2|FF_MM_SSE2SLOW)) {
          VP8_LUMA_MC_FUNC(0, 16, sse2);
          VP8_MC_FUNC(1, 8, sse2);
          VP8_BILINEAR_MC_FUNC(0, 16, sse2);
@@ -338,8 +338,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
          c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
  
          c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
-        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
          c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
+    }
+
+    if (mm_flags & FF_MM_SSE2) {
+        c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
          c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
      }
author	Ronald S. Bultje <rsbultje@gmail.com>
	Mon, 19 Jul 2010 22:38:23 +0000 (22:38 +0000)
committer	Ronald S. Bultje <rsbultje@gmail.com>
	Mon, 19 Jul 2010 22:38:23 +0000 (22:38 +0000)
libavcodec/avcodec.h		patch | blob | history
libavcodec/x86/cpuid.c		patch | blob | history
libavcodec/x86/dsputilenc_mmx.c		patch | blob | history
libavcodec/x86/vp8dsp-init.c		patch | blob | history