[x86] Add a feature bit: Fast_Unaligned_Copy
author    H.J. Lu <hjl.tools@gmail.com>
          Mon, 28 Mar 2016 11:39:48 +0000 (04:39 -0700)
committer H.J. Lu <hjl.tools@gmail.com>
          Mon, 28 Mar 2016 11:40:03 +0000 (04:40 -0700)
On AMD processors, memcpy optimized with unaligned SSE load is
slower than memcpy optimized with aligned SSSE3, while other string
functions are faster with unaligned SSE load.  A feature bit,
Fast_Unaligned_Copy, is added to select memcpy optimized with
unaligned SSE load.
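
As a reading aid, here is a minimal C sketch of the selection order that
the __new_memcpy IFUNC resolver in sysdeps/x86_64/multiarch/memcpy.S
follows after this change.  It is not glibc code: the
has_arch_feature()/has_cpu_feature() helpers are hypothetical stand-ins
for the HAS_ARCH_FEATURE/HAS_CPU_FEATURE macros, and the AVX variant
name is assumed from context outside the hunk shown below.

    /* Conceptual sketch only; the real resolver is assembly.  */
    #include <stdbool.h>
    #include <stddef.h>

    extern bool has_arch_feature (const char *name);  /* hypothetical */
    extern bool has_cpu_feature (const char *name);   /* hypothetical */

    extern void *__memcpy_avx_unaligned (void *, const void *, size_t);
    extern void *__memcpy_sse2_unaligned (void *, const void *, size_t);
    extern void *__memcpy_sse2 (void *, const void *, size_t);
    extern void *__memcpy_ssse3 (void *, const void *, size_t);

    typedef void *(*memcpy_fn) (void *, const void *, size_t);

    static memcpy_fn
    select_memcpy (void)
    {
      if (has_arch_feature ("AVX_Fast_Unaligned_Load"))
        return __memcpy_avx_unaligned;
      /* Previously gated on Fast_Unaligned_Load; now gated on
         Fast_Unaligned_Copy, which is set for Intel but not for AMD,
         so AMD processors fall through to the aligned SSSE3/SSE2
         copies below.  */
      if (has_arch_feature ("Fast_Unaligned_Copy"))
        return __memcpy_sse2_unaligned;
      if (!has_cpu_feature ("SSSE3"))
        return __memcpy_sse2;
      /* The real resolver additionally checks Fast_Copy_Backward to
         choose between SSSE3 variants; omitted here.  */
      return __memcpy_ssse3;
    }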

[BZ #19583]
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
Fast_Unaligned_Copy with Fast_Unaligned_Load for Intel
processors.  Set Fast_Copy_Backward for AMD Excavator
processors.
* sysdeps/x86/cpu-features.h (bit_arch_Fast_Unaligned_Copy):
New.
(index_arch_Fast_Unaligned_Copy): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check
Fast_Unaligned_Copy instead of Fast_Unaligned_Load.

ChangeLog
sysdeps/x86/cpu-features.c
sysdeps/x86/cpu-features.h
sysdeps/x86_64/multiarch/memcpy.S

index 7f629ac..5375f3b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2016-03-28   H.J. Lu  <hongjiu.lu@intel.com>
+            Amit Pawar  <Amit.Pawar@amd.com>
+
+       [BZ #19583]
+       * sysdeps/x86/cpu-features.c (init_cpu_features): Set
+       Fast_Unaligned_Copy with Fast_Unaligned_Load for Intel
+       processors.  Set Fast_Copy_Backward for AMD Excavator
+       processors.
+       * sysdeps/x86/cpu-features.h (bit_arch_Fast_Unaligned_Copy):
+       New.
+       (index_arch_Fast_Unaligned_Copy): Likewise.
+       * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check
+       Fast_Unaligned_Copy instead of Fast_Unaligned_Load.
+
 2016-03-25  Florian Weimer  <fweimer@redhat.com>
 
        [BZ #19791]
index c8f81ef..de75c79 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -153,8 +153,12 @@ init_cpu_features (struct cpu_features *cpu_features)
 #if index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
 # error index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
 #endif
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+#endif
              cpu_features->feature[index_arch_Fast_Unaligned_Load]
                |= (bit_arch_Fast_Unaligned_Load
+                   | bit_arch_Fast_Unaligned_Copy
                    | bit_arch_Prefer_PMINUB_for_stringop
                    | bit_arch_Slow_SSE4_2);
              break;
@@ -183,10 +187,14 @@ init_cpu_features (struct cpu_features *cpu_features)
 #if index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
 # error index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
 #endif
+#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+#endif
              cpu_features->feature[index_arch_Fast_Rep_String]
                |= (bit_arch_Fast_Rep_String
                    | bit_arch_Fast_Copy_Backward
                    | bit_arch_Fast_Unaligned_Load
+                   | bit_arch_Fast_Unaligned_Copy
                    | bit_arch_Prefer_PMINUB_for_stringop);
              break;
            }
@@ -220,10 +228,14 @@ init_cpu_features (struct cpu_features *cpu_features)
 
       if (family == 0x15)
        {
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+#endif
          /* "Excavator"   */
          if (model >= 0x60 && model <= 0x7f)
            cpu_features->feature[index_arch_Fast_Unaligned_Load]
-             |= bit_arch_Fast_Unaligned_Load;
+             |= (bit_arch_Fast_Unaligned_Load
+                 | bit_arch_Fast_Copy_Backward);
        }
     }
   else
index e06eb7e..bfe1f4c 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -35,6 +35,7 @@
 #define bit_arch_I686                          (1 << 15)
 #define bit_arch_Prefer_MAP_32BIT_EXEC         (1 << 16)
 #define bit_arch_Prefer_No_VZEROUPPER          (1 << 17)
+#define bit_arch_Fast_Unaligned_Copy           (1 << 18)
 
 /* CPUID Feature flags.  */
 
 # define index_arch_I686               FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_Unaligned_Copy        FEATURE_INDEX_1*FEATURE_SIZE
 
 
 # if defined (_LIBC) && !IS_IN (nonlib)
@@ -265,6 +267,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_I686               FEATURE_INDEX_1
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Fast_Unaligned_Copy        FEATURE_INDEX_1
 
 #endif /* !__ASSEMBLER__ */
 
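For context, a hedged sketch of how the new bit and index macros are
consumed.  The struct and helpers below are simplified stand-ins for
glibc's cpu_features data; the |= pattern matches the cpu-features.c
hunks above, and the test is what HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
in the memcpy.S hunk below boils down to.

    #define bit_arch_Fast_Unaligned_Copy   (1 << 18)
    #define index_arch_Fast_Unaligned_Copy 0   /* FEATURE_INDEX_1 */

    struct cpu_features_sketch                 /* simplified stand-in */
    {
      unsigned int feature[1];
    };

    /* Set the bit, as init_cpu_features does for Intel processors.  */
    static void
    set_fast_unaligned_copy (struct cpu_features_sketch *cf)
    {
      cf->feature[index_arch_Fast_Unaligned_Copy]
        |= bit_arch_Fast_Unaligned_Copy;
    }

    /* Test the bit, as the IFUNC resolver does via HAS_ARCH_FEATURE.  */
    static int
    has_fast_unaligned_copy (const struct cpu_features_sketch *cf)
    {
      return (cf->feature[index_arch_Fast_Unaligned_Copy]
              & bit_arch_Fast_Unaligned_Copy) != 0;
    }
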
index 8882590..5b045d7 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -42,7 +42,7 @@ ENTRY(__new_memcpy)
        HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
        jnz     2f
        lea     __memcpy_sse2_unaligned(%rip), %RAX_LP
-       HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+       HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
        jnz     2f
        lea     __memcpy_sse2(%rip), %RAX_LP
        HAS_CPU_FEATURE (SSSE3)