Check the HTT bit before counting logical threads
author H.J. Lu <hjl.tools@gmail.com>
Thu, 19 May 2016 16:09:00 +0000 (09:09 -0700)
committer H.J. Lu <hjl.tools@gmail.com>
Thu, 19 May 2016 16:09:00 +0000 (09:09 -0700)
Skip counting logical threads for Intel processors if the HTT bit is 0,
which indicates there is only a single logical processor.

* sysdeps/x86/cacheinfo.c (init_cacheinfo): Skip counting
logical threads if the HTT bit is 0.
* sysdeps/x86/cpu-features.h (bit_cpu_HTT): New.
(index_cpu_HTT): Likewise.
(reg_HTT): Likewise.

ChangeLog
sysdeps/x86/cacheinfo.c
sysdeps/x86/cpu-features.h

index 8a4918c..8adf828 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,13 @@
 2016-05-19  H.J. Lu  <hongjiu.lu@intel.com>
 
+       * sysdeps/x86/cacheinfo.c (init_cacheinfo): Skip counting
+       logical threads if the HTT bit is 0.
+       * sysdeps/x86/cpu-features.h (bit_cpu_HTT): New.
+       (index_cpu_HTT): Likewise.
+       (reg_HTT): Likewise.
+
+2016-05-19  H.J. Lu  <hongjiu.lu@intel.com>
+
        [BZ #20115]
        * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S (__memset):
        Remove alignments on jump targets.
index 8408624..1f46d9d 100644 (file)
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -506,99 +506,105 @@ init_cacheinfo (void)
          shared = core;
        }
 
-      /* Figure out the number of logical threads that share the
-        highest cache level.  */
-      if (max_cpuid >= 4)
+      /* A value of 0 for the HTT bit indicates there is only a single
+        logical processor.  */
+      if (HAS_CPU_FEATURE (HTT))
        {
-         unsigned int family = GLRO(dl_x86_cpu_features).family;
-         unsigned int model = GLRO(dl_x86_cpu_features).model;
+         /* Figure out the number of logical threads that share the
+            highest cache level.  */
+         if (max_cpuid >= 4)
+           {
+             unsigned int family = GLRO(dl_x86_cpu_features).family;
+             unsigned int model = GLRO(dl_x86_cpu_features).model;
 
-         int i = 0;
+             int i = 0;
 
-         /* Query until desired cache level is enumerated.  */
-         do
-           {
-             __cpuid_count (4, i++, eax, ebx, ecx, edx);
-
-             /* There seems to be a bug in at least some Pentium Ds
-                which sometimes fail to iterate all cache parameters.
-                Do not loop indefinitely here, stop in this case and
-                assume there is no such information.  */
-             if ((eax & 0x1f) == 0)
-               goto intel_bug_no_cache_info;
-           }
-         while (((eax >> 5) & 0x7) != level);
+             /* Query until desired cache level is enumerated.  */
+             do
+               {
+                 __cpuid_count (4, i++, eax, ebx, ecx, edx);
+
+                 /* There seems to be a bug in at least some Pentium Ds
+                    which sometimes fail to iterate all cache parameters.
+                    Do not loop indefinitely here, stop in this case and
+                    assume there is no such information.  */
+                 if ((eax & 0x1f) == 0)
+                   goto intel_bug_no_cache_info;
+               }
+             while (((eax >> 5) & 0x7) != level);
 
-         /* Check if cache is inclusive of lower cache levels.  */
-         inclusive_cache = (edx & 0x2) != 0;
+             /* Check if cache is inclusive of lower cache levels.  */
+             inclusive_cache = (edx & 0x2) != 0;
 
-         threads = (eax >> 14) & 0x3ff;
+             threads = (eax >> 14) & 0x3ff;
 
-         /* If max_cpuid >= 11, THREADS is the maximum number of
-             addressable IDs for logical processors sharing the
-             cache, instead of the maximum number of threads
-             sharing the cache.  */
-         if (threads && max_cpuid >= 11)
-           {
-             /* Find the number of logical processors shipped in
-                one core and apply count mask.  */
-             i = 0;
-             while (1)
+             /* If max_cpuid >= 11, THREADS is the maximum number of
+                addressable IDs for logical processors sharing the
+                cache, instead of the maximum number of threads
+                sharing the cache.  */
+             if (threads && max_cpuid >= 11)
                {
-                 __cpuid_count (11, i++, eax, ebx, ecx, edx);
-
-                 int shipped = ebx & 0xff;
-                 int type = ecx & 0xff0;
-                 if (shipped == 0 || type == 0)
-                   break;
-                 else if (type == 0x200)
+                 /* Find the number of logical processors shipped in
+                    one core and apply count mask.  */
+                 i = 0;
+                 while (1)
                    {
-                     int count_mask;
-
-                     /* Compute count mask.  */
-                     asm ("bsr %1, %0"
-                          : "=r" (count_mask) : "g" (threads));
-                     count_mask = ~(-1 << (count_mask + 1));
-                     threads = (shipped - 1) & count_mask;
+                     __cpuid_count (11, i++, eax, ebx, ecx, edx);
+
+                     int shipped = ebx & 0xff;
+                     int type = ecx & 0xff0;
+                     if (shipped == 0 || type == 0)
+                       break;
+                     else if (type == 0x200)
+                       {
+                         int count_mask;
+
+                         /* Compute count mask.  */
+                         asm ("bsr %1, %0"
+                              : "=r" (count_mask) : "g" (threads));
+                         count_mask = ~(-1 << (count_mask + 1));
+                         threads = (shipped - 1) & count_mask;
+                         break;
+                       }
+                   }
+               }
+             threads += 1;
+             if (threads > 2 && level == 2 && family == 6)
+               {
+                 switch (model)
+                   {
+                   case 0x57:
+                     /* Knights Landing has L2 cache shared by 2 cores.  */
+                   case 0x37:
+                   case 0x4a:
+                   case 0x4d:
+                   case 0x5a:
+                   case 0x5d:
+                     /* Silvermont has L2 cache shared by 2 cores.  */
+                     threads = 2;
+                     break;
+                   default:
                      break;
                    }
                }
            }
-         threads += 1;
-         if (threads > 2 && level == 2 && family == 6)
+         else
            {
-             switch (model)
-               {
-               case 0x57:
-                 /* Knights Landing has L2 cache shared by 2 cores.  */
-               case 0x37:
-               case 0x4a:
-               case 0x4d:
-               case 0x5a:
-               case 0x5d:
-                 /* Silvermont has L2 cache shared by 2 cores.  */
-                 threads = 2;
-                 break;
-               default:
-                 break;
-               }
+intel_bug_no_cache_info:
+             /* Assume that all logical threads share the highest cache
+                level.  */
+
+             threads
+               = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+                   >> 16) & 0xff);
            }
-       }
-      else
-       {
-       intel_bug_no_cache_info:
-         /* Assume that all logical threads share the highest cache level.  */
 
-         threads
-           = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
-               >> 16) & 0xff);
+         /* Cap usage of highest cache level to the number of supported
+            threads.  */
+         if (shared > 0 && threads > 0)
+           shared /= threads;
        }
 
-      /* Cap usage of highest cache level to the number of supported
-        threads.  */
-      if (shared > 0 && threads > 0)
-       shared /= threads;
-
       /* Account for non-inclusive L2 and L3 caches.  */
       if (level == 3 && !inclusive_cache)
        shared += core;
index 9529d61..2bd9371 100644 (file)
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -51,6 +51,7 @@
 #define bit_cpu_POPCOUNT       (1 << 23)
 #define bit_cpu_FMA            (1 << 12)
 #define bit_cpu_FMA4           (1 << 16)
+#define bit_cpu_HTT            (1 << 28)
 
 /* COMMON_CPUID_INDEX_7.  */
 #define bit_cpu_ERMS           (1 << 9)
@@ -235,6 +236,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_cpu_FMA4                COMMON_CPUID_INDEX_80000001
 # define index_cpu_POPCOUNT    COMMON_CPUID_INDEX_1
 # define index_cpu_OSXSAVE     COMMON_CPUID_INDEX_1
+# define index_cpu_HTT         COMMON_CPUID_INDEX_1
 
 # define reg_CX8               edx
 # define reg_CMOV              edx
@@ -252,6 +254,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define reg_FMA4              ecx
 # define reg_POPCOUNT          ecx
 # define reg_OSXSAVE           ecx
+# define reg_HTT               edx
 
 # define index_arch_Fast_Rep_String    FEATURE_INDEX_1
 # define index_arch_Fast_Copy_Backward FEATURE_INDEX_1