[OpenMP] Add cpuid leaf 1f topology discovery

author Peyton, Jonathan L <jonathan.l.peyton@intel.com>

Fri, 15 Jan 2021 19:38:50 +0000 (13:38 -0600)

committer Peyton, Jonathan L <jonathan.l.peyton@intel.com>

Wed, 27 Jan 2021 20:27:23 +0000 (14:27 -0600)
author Peyton, Jonathan L <jonathan.l.peyton@intel.com>
Fri, 15 Jan 2021 19:38:50 +0000 (13:38 -0600)
committer Peyton, Jonathan L <jonathan.l.peyton@intel.com>
Wed, 27 Jan 2021 20:27:23 +0000 (14:27 -0600)
diff --git a/openmp/runtime/src/i18n/en_US.txt b/openmp/runtime/src/i18n/en_US.txt

index 8483a87..26a164d 100644 (file)
--- a/openmp/runtime/src/i18n/en_US.txt
+++ b/openmp/runtime/src/i18n/en_US.txt
@@ -123,6 +123,7 @@ NumaDomains                  "NUMA domains"
  ProcGroup                    "processor group"
  ProcGroups                   "processor groups"
  Unknown                      "unknown"
+NoLeaf31Support              "cpuid leaf 31 not supported"
  
  
  
@@ -383,8 +384,8 @@ StaticLibNotSupport          "Static %1$s does not support %2$s. Continuing with
  OBSOLETE                     "KMP_DYNAMIC_MODE=irml cannot be used with KMP_USE_IRML=0"
  IttUnknownGroup              "ittnotify: Unknown group \"%2$s\" specified in environment variable \"%1$s\"."
  IttEnvVarTooLong             "ittnotify: Environment variable \"%1$s\" too long: Actual lengths is %2$lu, max allowed length is %3$lu."
-AffUseGlobCpuidL11           "%1$s: Affinity capable, using global cpuid leaf 11 info"
-AffNotCapableUseLocCpuidL11  "%1$s: Affinity not capable, using local cpuid leaf 11 info"
+OBSOLETE                     "%1$s: Affinity capable, using global cpuid leaf 11 info"
+OBSOLETE                     "%1$s: Affinity not capable, using local cpuid leaf 11 info"
  AffInfoStr                   "%1$s: %2$s."
  AffInfoStrStr                "%1$s: %2$s - %3$s."
  OSProcToPhysicalThreadMap    "%1$s: OS proc to physical thread map:"
@@ -450,6 +451,10 @@ HierSchedInvalid             "Hierarchy ignored: unsupported level: %1$s."
  AffFormatDefault             "OMP: pid %1$s tid %2$s thread %3$s bound to OS proc set {%4$s}"
  APIDeprecated                "%1$s routine deprecated, please use %2$s instead."
  GompFeatureNotSupported      "libgomp compatibility layer does not support OpenMP feature: %1$s"
+AffHWSubsetManyDies          "KMP_HW_SUBSET ignored: too many Dies requested."
+AffUseGlobCpuidL             "%1$s: Affinity capable, using global cpuid leaf %2$d info"
+AffNotCapableUseLocCpuidL    "%1$s: Affinity not capable, using local cpuid leaf %2$d info"
+AffNotUsingHwloc             "%1$s: Affinity not capable, using hwloc."
  
  # --------------------------------------------------------------------------------------------------
  -*- HINTS -*-
diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h

index 78c9997..ca57bf7 100644 (file)
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -789,6 +789,7 @@ enum affinity_gran {
    affinity_gran_thread,
    affinity_gran_core,
    affinity_gran_tile,
+  affinity_gran_die,
    affinity_gran_numa,
    affinity_gran_package,
    affinity_gran_node,
@@ -805,6 +806,7 @@ enum affinity_top_method {
  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
    affinity_top_method_apicid,
    affinity_top_method_x2apicid,
+  affinity_top_method_x2apicid_1f,
  #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
    affinity_top_method_cpuinfo, // KMP_CPUINFO_FILE is usable on Windows* OS, too
  #if KMP_GROUP_AFFINITY
@@ -889,6 +891,7 @@ typedef struct kmp_hws_item {
  } kmp_hws_item_t;
  
  extern kmp_hws_item_t __kmp_hws_socket;
+extern kmp_hws_item_t __kmp_hws_die;
  extern kmp_hws_item_t __kmp_hws_node;
  extern kmp_hws_item_t __kmp_hws_tile;
  extern kmp_hws_item_t __kmp_hws_core;
diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp

index caa6333..0161aca 100644 (file)
--- a/openmp/runtime/src/kmp_affinity.cpp
+++ b/openmp/runtime/src/kmp_affinity.cpp
@@ -45,6 +45,8 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
  }
  
+#if KMP_AFFINITY_SUPPORTED
+
  const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
    switch (type) {
    case KMP_HW_SOCKET:
@@ -73,7 +75,6 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
    return KMP_I18N_STR(Unknown);
  }
  
-#if KMP_USE_HWLOC
  // This function removes the topology levels that are radix 1 and don't offer
  // further information about the topology.  The most common example is when you
  // have one thread context per core, we don't want the extra thread context
@@ -213,8 +214,8 @@ static bool __kmp_affinity_discover_uniformity(int depth, int *ratio,
  }
  
  // calculate the number of X's per Y
-static inline int __kmp_hwloc_calculate_ratio(int *ratio, int deep_level,
-                                              int shallow_level) {
+static inline int __kmp_affinity_calculate_ratio(int *ratio, int deep_level,
+                                                 int shallow_level) {
    int retval = 1;
    if (deep_level < 0 || shallow_level < 0)
      return retval;
@@ -222,7 +223,23 @@ static inline int __kmp_hwloc_calculate_ratio(int *ratio, int deep_level,
      retval *= ratio[level];
    return retval;
  }
-#endif // KMP_USE_HWLOC
+
+// Print the OS proc to physical location map for a topology made of an
+// arbitrary list of levels.
+//   addrP: table of (address, OS proc) pairs to print
+//   len:   number of entries in addrP
+//   depth: number of labels printed for each address
+//   types: topology type of each label, indexed like the labels
+static void __kmp_affinity_print_topology(AddrUnsPair *addrP, int len,
+                                          int depth, kmp_hw_t *types) {
+  int proc;
+  kmp_str_buf_t buf;
+  __kmp_str_buf_init(&buf);
+  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
+  for (proc = 0; proc < len; proc++) {
+    // Build "<level name> <label> " for every level of this entry
+    for (int i = 0; i < depth; ++i) {
+      __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]),
+                          addrP[proc].first.labels[i]);
+    }
+    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
+    // Empty the buffer so it can be reused for the next entry
+    __kmp_str_buf_clear(&buf);
+  }
+  __kmp_str_buf_free(&buf);
+}
  
  // Print out the detailed machine topology map, i.e. the physical locations
  // of each OS proc.
@@ -257,8 +274,6 @@ static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
    }
  }
  
-#if KMP_AFFINITY_SUPPORTED
-
  bool KMPAffinity::picked_api = false;
  
  void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
@@ -516,34 +531,19 @@ static int __kmp_nThreadsPerCore;
  static int __kmp_ncores;
  #endif
  static int *__kmp_pu_os_idx = NULL;
+static int nDiesPerPkg = 1;
  
  // __kmp_affinity_uniform_topology() doesn't work when called from
  // places which support arbitrarily many levels in the machine topology
  // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
  // __kmp_affinity_create_x2apicid_map().
  inline static bool __kmp_affinity_uniform_topology() {
-  return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
+  return __kmp_avail_proc ==
+         (__kmp_nThreadsPerCore * nCoresPerPkg * nDiesPerPkg * nPackages);
  }
  
  #if KMP_USE_HWLOC
  
-static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
-                                          int depth, kmp_hw_t *types) {
-  int proc;
-  kmp_str_buf_t buf;
-  __kmp_str_buf_init(&buf);
-  KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
-  for (proc = 0; proc < len; proc++) {
-    for (int i = 0; i < depth; ++i) {
-      __kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]),
-                          addrP[proc].first.labels[i]);
-    }
-    KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
-    __kmp_str_buf_clear(&buf);
-  }
-  __kmp_str_buf_free(&buf);
-}
-
  static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
  #if HWLOC_API_VERSION >= 0x00020000
    return hwloc_obj_type_is_cache(obj->type);
@@ -699,7 +699,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
        nCoresPerPkg = 1; // to prevent possible division by 0
      nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
      if (__kmp_affinity_verbose) {
-      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+      KMP_INFORM(AffNotUsingHwloc, "KMP_AFFINITY");
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
          KMP_INFORM(Uniform, "KMP_AFFINITY");
@@ -886,8 +886,9 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
        numa_level = level;
    }
    __kmp_nThreadsPerCore =
-      __kmp_hwloc_calculate_ratio(ratio, thread_level, core_level);
-  nCoresPerPkg = __kmp_hwloc_calculate_ratio(ratio, core_level, socket_level);
+      __kmp_affinity_calculate_ratio(ratio, thread_level, core_level);
+  nCoresPerPkg =
+      __kmp_affinity_calculate_ratio(ratio, core_level, socket_level);
    if (socket_level >= 0)
      nPackages = count[socket_level];
    else
@@ -964,7 +965,7 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
    }
  
    if (__kmp_affinity_verbose)
-    __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, types);
+    __kmp_affinity_print_topology(retval, nActiveThreads, depth, types);
  
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
@@ -1126,6 +1127,123 @@ static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  
  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
  
+/*
+ * CPUID.B or 1F, Input ECX (sub leaf # aka level number)
+    Bits            Bits            Bits           Bits
+    31-16           15-8            7-5            4-0
+---+-----------+--------------+-------------+-----------------+
+EAX| reserved  |   reserved   |   reserved  |  Bits to Shift  |
+---+-----------|--------------+-------------+-----------------|
+EBX| reserved  | Num logical processors at level (16 bits)    |
+---+-----------|--------------+-------------------------------|
+ECX| reserved  |   Level Type |      Level Number (8 bits)    |
+---+-----------+--------------+-------------------------------|
+EDX|                    X2APIC ID (32 bits)                   |
+---+----------------------------------------------------------+
+*/
+
+// Level type values read from ECX bits 15-8 of the cpuid topology leaf.
+// INTEL_LEVEL_TYPE_LAST is one past the largest value listed here and is used
+// to size the per-level arrays.
+enum {
+  INTEL_LEVEL_TYPE_INVALID = 0, // Package level
+  INTEL_LEVEL_TYPE_SMT = 1,
+  INTEL_LEVEL_TYPE_CORE = 2,
+  INTEL_LEVEL_TYPE_TILE = 3,
+  INTEL_LEVEL_TYPE_MODULE = 4,
+  INTEL_LEVEL_TYPE_DIE = 5,
+  INTEL_LEVEL_TYPE_LAST = 6,
+};
+
+// One topology level as filled in by __kmp_x2apicid_get_levels():
+//   level_type: INTEL_LEVEL_TYPE_* value of the level
+//   mask:       x2APIC id bits selecting this level's id (for the package
+//               level, every bit above the previous level)
+//   mask_width: shift width read from EAX bits 4-0
+//   nitems:     logical processor count read from EBX bits 15-0
+//   cache_mask: x2APIC id bits at and above mask_width; 0 for the package
+struct cpuid_level_info_t {
+  unsigned level_type, mask, mask_width, nitems, cache_mask;
+};
+
+// Return bits LSB through MSB (inclusive, bit 0 is the least significant) of
+// v, moved down so that bit LSB of v becomes bit 0 of the result.
+template <kmp_uint32 LSB, kmp_uint32 MSB>
+static inline unsigned __kmp_extract_bits(kmp_uint32 v) {
+  const kmp_uint32 SHIFT_LEFT = sizeof(kmp_uint32) * 8 - 1 - MSB;
+  const kmp_uint32 SHIFT_RIGHT = LSB;
+  kmp_uint32 retval = v;
+  retval <<= SHIFT_LEFT; // discard the bits above MSB
+  retval >>= (SHIFT_LEFT + SHIFT_RIGHT); // discard the bits below LSB
+  return retval;
+}
+
+// Map a cpuid level type (INTEL_LEVEL_TYPE_*) to the corresponding kmp_hw_t
+// topology type. The invalid (terminating) level type maps to the socket.
+// Returns KMP_HW_UNKNOWN for the tile and module level types, which are not
+// supported yet, and for any value that is not listed.
+static kmp_hw_t __kmp_intel_type_2_topology_type(int intel_type) {
+  switch (intel_type) {
+  case INTEL_LEVEL_TYPE_INVALID:
+    return KMP_HW_SOCKET;
+  case INTEL_LEVEL_TYPE_SMT:
+    return KMP_HW_THREAD;
+  case INTEL_LEVEL_TYPE_CORE:
+    return KMP_HW_CORE;
+  // TODO: add support for the tile and module
+  case INTEL_LEVEL_TYPE_TILE:
+    return KMP_HW_UNKNOWN;
+  case INTEL_LEVEL_TYPE_MODULE:
+    return KMP_HW_UNKNOWN;
+  case INTEL_LEVEL_TYPE_DIE:
+    return KMP_HW_DIE;
+  }
+  // Any level type not listed above
+  return KMP_HW_UNKNOWN;
+}
+
+// Enumerate the sub-leaves of a cpuid topology leaf and record one entry per
+// known topology level, in the order the sub-leaves are reported.
+//   leaf:         cpuid topology leaf to query (the callers pass 31 or 11)
+//   levels:       output array receiving the detected levels
+//   known_levels: bitmap, indexed by INTEL_LEVEL_TYPE_* value, of the level
+//                 types that should get their own entry
+// Returns the number of levels in the topology, or 0 if a sub-leaf other than
+// the terminating one reports zero logical processors.
+static unsigned
+__kmp_x2apicid_get_levels(int leaf,
+                          cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST],
+                          kmp_uint64 known_levels) {
+  unsigned level, levels_index;
+  unsigned level_type, mask_width, nitems;
+  kmp_cpuid buf;
+  // Shift an unsigned all-ones value when building masks: left-shifting the
+  // signed value -1 is undefined behavior before C++20.
+  const unsigned all_ones = ~0u;
+
+  // Known topology layers absorb the unknown topology layers that sit directly
+  // above them.
+  // e.g., Suppose layers were SMT CORE <Y> <Z> PACKAGE
+  // Then CORE will take the characteristics (nitems and mask width) of <Z>.
+  // In developing the id mask for each layer, this eliminates unknown portions
+  // of the topology while still keeping the correct underlying structure.
+  level = levels_index = 0;
+  do {
+    __kmp_x86_cpuid(leaf, level, &buf);
+    level_type = __kmp_extract_bits<8, 15>(buf.ecx);
+    mask_width = __kmp_extract_bits<0, 4>(buf.eax);
+    nitems = __kmp_extract_bits<0, 15>(buf.ebx);
+    if (level_type != INTEL_LEVEL_TYPE_INVALID && nitems == 0)
+      return 0;
+
+    // level_type is an 8-bit field, so it may exceed the width of the bitmap;
+    // such values are treated as unknown instead of shifting out of range.
+    if (level_type < sizeof(known_levels) * 8 &&
+        (known_levels & (1ull << level_type))) {
+      // Add a new level to the topology
+      KMP_ASSERT(levels_index < INTEL_LEVEL_TYPE_LAST);
+      levels[levels_index].level_type = level_type;
+      levels[levels_index].mask_width = mask_width;
+      levels[levels_index].nitems = nitems;
+      levels_index++;
+    } else {
+      // If it is an unknown level, then logically move the previous layer up
+      if (levels_index > 0) {
+        levels[levels_index - 1].mask_width = mask_width;
+        levels[levels_index - 1].nitems = nitems;
+      }
+    }
+    level++;
+  } while (level_type != INTEL_LEVEL_TYPE_INVALID);
+
+  // Set the masks to & with apicid
+  for (unsigned i = 0; i < levels_index; ++i) {
+    if (levels[i].level_type != INTEL_LEVEL_TYPE_INVALID) {
+      // Low mask_width bits, minus the bits already owned by finer levels
+      levels[i].mask = ~(all_ones << levels[i].mask_width);
+      levels[i].cache_mask = all_ones << levels[i].mask_width;
+      for (unsigned j = 0; j < i; ++j)
+        levels[i].mask ^= levels[j].mask;
+    } else {
+      // Package level: it owns every bit above the previous level. When it is
+      // the only level there is no previous entry to read, so the whole id
+      // belongs to the package.
+      unsigned below_width = (i > 0) ? levels[i - 1].mask_width : 0;
+      levels[i].mask = all_ones << below_width;
+      levels[i].cache_mask = 0;
+    }
+  }
+  return levels_index;
+}
+
  static int __kmp_cpuid_mask_width(int count) {
    int r = 0;
  
@@ -1573,123 +1691,102 @@ static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  
  // Intel(R) microarchitecture code name Nehalem, Dunnington and later
  // architectures support a newer interface for specifying the x2APIC Ids,
-// based on cpuid leaf 11.
+// based on CPUID.B or CPUID.1F
  static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
+
+  cpuid_level_info_t levels[INTEL_LEVEL_TYPE_LAST];
+  int ratio[KMP_HW_LAST];
+  int count[KMP_HW_LAST];
+  kmp_hw_t types[INTEL_LEVEL_TYPE_LAST];
+  unsigned levels_index;
    kmp_cpuid buf;
-  *address2os = NULL;
-  *msg_id = kmp_i18n_null;
+  kmp_uint64 known_levels;
+  int topology_leaf, highest_leaf, apic_id;
+  int num_leaves;
+  static int leaves[] = {0, 0};
  
-  // Check to see if cpuid leaf 11 is supported.
-  __kmp_x86_cpuid(0, 0, &buf);
-  if (buf.eax < 11) {
-    *msg_id = kmp_i18n_str_NoLeaf11Support;
-    return -1;
-  }
-  __kmp_x86_cpuid(11, 0, &buf);
-  if (buf.ebx == 0) {
-    *msg_id = kmp_i18n_str_NoLeaf11Support;
-    return -1;
-  }
+  kmp_i18n_id_t leaf_message_id;
  
-  // Find the number of levels in the machine topology. While we're at it, get
-  // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
-  // get more accurate values later by explicitly counting them, but get
-  // reasonable defaults now, in case we return early.
-  int level;
-  int threadLevel = -1;
-  int coreLevel = -1;
-  int pkgLevel = -1;
-  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+  KMP_BUILD_ASSERT(sizeof(known_levels) * CHAR_BIT > KMP_HW_LAST);
  
-  for (level = 0;; level++) {
-    if (level > 31) {
-      // FIXME: Hack for DPD200163180
-      //
-      // If level is big then something went wrong -> exiting
-      //
-      // There could actually be 32 valid levels in the machine topology, but so
-      // far, the only machine we have seen which does not exit this loop before
-      // iteration 32 has fubar x2APIC settings.
-      //
-      // For now, just reject this case based upon loop trip count.
-      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-      return -1;
-    }
-    __kmp_x86_cpuid(11, level, &buf);
-    if (buf.ebx == 0) {
-      if (pkgLevel < 0) {
-        // Will infer nPackages from __kmp_xproc
-        pkgLevel = level;
-        level++;
-      }
-      break;
-    }
-    int kind = (buf.ecx >> 8) & 0xff;
-    if (kind == 1) {
-      // SMT level
-      threadLevel = level;
-      coreLevel = -1;
-      pkgLevel = -1;
-      __kmp_nThreadsPerCore = buf.ebx & 0xffff;
-      if (__kmp_nThreadsPerCore == 0) {
-        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-        return -1;
-      }
-    } else if (kind == 2) {
-      // core level
-      coreLevel = level;
-      pkgLevel = -1;
-      nCoresPerPkg = buf.ebx & 0xffff;
-      if (nCoresPerPkg == 0) {
-        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-        return -1;
-      }
-    } else {
-      if (level <= 0) {
-        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-        return -1;
-      }
-      if (pkgLevel >= 0) {
-        continue;
-      }
-      pkgLevel = level;
-      nPackages = buf.ebx & 0xffff;
-      if (nPackages == 0) {
-        *msg_id = kmp_i18n_str_InvalidCpuidInfo;
-        return -1;
-      }
+  *msg_id = kmp_i18n_null;
+
+  // Figure out the known topology levels
+  known_levels = 0ull;
+  for (int i = 0; i < INTEL_LEVEL_TYPE_LAST; ++i) {
+    if (__kmp_intel_type_2_topology_type(i) != KMP_HW_UNKNOWN) {
+      known_levels |= (1ull << i);
      }
    }
-  int depth = level;
  
-  // In the above loop, "level" was counted from the finest level (usually
-  // thread) to the coarsest.  The caller expects that we will place the labels
-  // in (*address2os)[].first.labels[] in the inverse order, so we need to
-  // invert the vars saying which level means what.
-  if (threadLevel >= 0) {
-    threadLevel = depth - threadLevel - 1;
+  // Get the highest cpuid leaf supported
+  __kmp_x86_cpuid(0, 0, &buf);
+  highest_leaf = buf.eax;
+
+  // If a specific topology method was requested, only allow that specific leaf
+  // otherwise, try both leaves 31 and 11 in that order
+  num_leaves = 0;
+  if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
+    num_leaves = 1;
+    leaves[0] = 11;
+    leaf_message_id = kmp_i18n_str_NoLeaf11Support;
+  } else if (__kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
+    num_leaves = 1;
+    leaves[0] = 31;
+    leaf_message_id = kmp_i18n_str_NoLeaf31Support;
+  } else {
+    num_leaves = 2;
+    leaves[0] = 31;
+    leaves[1] = 11;
+    leaf_message_id = kmp_i18n_str_NoLeaf11Support;
    }
-  if (coreLevel >= 0) {
-    coreLevel = depth - coreLevel - 1;
+
+  // Check to see if cpuid leaf 31 or 11 is supported.
+  __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
+  topology_leaf = -1;
+  for (int i = 0; i < num_leaves; ++i) {
+    int leaf = leaves[i];
+    if (highest_leaf < leaf)
+      continue;
+    __kmp_x86_cpuid(leaf, 0, &buf);
+    if (buf.ebx == 0)
+      continue;
+    topology_leaf = leaf;
+    levels_index = __kmp_x2apicid_get_levels(leaf, levels, known_levels);
+    if (levels_index == 0)
+      continue;
+    break;
+  }
+  if (topology_leaf == -1 || levels_index == 0) {
+    *msg_id = leaf_message_id;
+    return -1;
    }
-  KMP_DEBUG_ASSERT(pkgLevel >= 0);
-  pkgLevel = depth - pkgLevel - 1;
+  KMP_ASSERT(levels_index <= INTEL_LEVEL_TYPE_LAST);
  
    // The algorithm used starts by setting the affinity to each available thread
    // and retrieving info from the cpuid instruction, so if we are not capable of
-  // calling __kmp_get_system_affinity() and _kmp_get_system_affinity(), then we
-  // need to do something else - use the defaults that we calculated from
+  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then
+  // we need to do something else - use the defaults that we calculated from
    // issuing cpuid without binding to each proc.
    if (!KMP_AFFINITY_CAPABLE()) {
      // Hack to try and infer the machine topology using only the data
      // available from cpuid on the current thread, and __kmp_xproc.
      KMP_ASSERT(__kmp_affinity_type == affinity_none);
  
+    for (unsigned i = 0; i < levels_index; ++i) {
+      if (levels[i].level_type == INTEL_LEVEL_TYPE_SMT) {
+        __kmp_nThreadsPerCore = levels[i].nitems;
+      } else if (levels[i].level_type == INTEL_LEVEL_TYPE_CORE) {
+        nCoresPerPkg = levels[i].nitems;
+      } else if (levels[i].level_type == INTEL_LEVEL_TYPE_DIE) {
+        nDiesPerPkg = levels[i].nitems;
+      }
+    }
      __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
      nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
      if (__kmp_affinity_verbose) {
-      KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
+      KMP_INFORM(AffNotCapableUseLocCpuidL, "KMP_AFFINITY", topology_leaf);
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
          KMP_INFORM(Uniform, "KMP_AFFINITY");
@@ -1712,6 +1809,9 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
    __kmp_get_system_affinity(oldMask, TRUE);
  
    // Allocate the data structure to be returned.
+  int depth = levels_index;
+  for (int i = depth - 1, j = 0; i >= 0; --i, ++j)
+    types[j] = __kmp_intel_type_2_topology_type(levels[i].level_type);
    AddrUnsPair *retval =
        (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
  
@@ -1720,6 +1820,9 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
    unsigned int proc;
    int nApics = 0;
    KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
+    cpuid_level_info_t my_levels[INTEL_LEVEL_TYPE_LAST];
+    unsigned my_levels_index;
+
      // Skip this proc if it is not included in the machine model.
      if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
        continue;
@@ -1728,36 +1831,24 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  
      __kmp_affinity_dispatch->bind_thread(proc);
  
-    // Extract labels for each level in the machine topology map from Apic ID.
+    // New algorithm
+    __kmp_x86_cpuid(topology_leaf, 0, &buf);
+    apic_id = buf.edx;
      Address addr(depth);
-    int prev_shift = 0;
-
-    for (level = 0; level < depth; level++) {
-      __kmp_x86_cpuid(11, level, &buf);
-      unsigned apicId = buf.edx;
-      if (buf.ebx == 0) {
-        if (level != depth - 1) {
-          KMP_CPU_FREE(oldMask);
-          *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
-          return -1;
-        }
-        addr.labels[depth - level - 1] = apicId >> prev_shift;
-        level++;
-        break;
-      }
-      int shift = buf.eax & 0x1f;
-      int mask = (1 << shift) - 1;
-      addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
-      prev_shift = shift;
-    }
-    if (level != depth) {
+    my_levels_index =
+        __kmp_x2apicid_get_levels(topology_leaf, my_levels, known_levels);
+    if (my_levels_index == 0 || my_levels_index != levels_index) {
        KMP_CPU_FREE(oldMask);
-      *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
+      *msg_id = kmp_i18n_str_InvalidCpuidInfo;
        return -1;
      }
-
-    retval[nApics] = AddrUnsPair(addr, proc);
-    nApics++;
+    // Put in topology information
+    for (unsigned j = 0, idx = depth - 1; j < my_levels_index; ++j, --idx) {
+      addr.labels[idx] = apic_id & my_levels[j].mask;
+      if (j > 0)
+        addr.labels[idx] >>= my_levels[j - 1].mask_width;
+    }
+    retval[nApics++] = AddrUnsPair(addr, proc);
    }
  
    // We've collected all the info we need.
@@ -1767,10 +1858,11 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
    // If there's only one thread context to bind to, return now.
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
+    int pkg_level;
      __kmp_ncores = nPackages = 1;
      __kmp_nThreadsPerCore = nCoresPerPkg = 1;
      if (__kmp_affinity_verbose) {
-      KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+      KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf);
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
@@ -1783,9 +1875,15 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
        return 0;
      }
  
+    pkg_level = 0;
+    for (int i = 0; i < depth; ++i)
+      if (types[i] == KMP_HW_SOCKET) {
+        pkg_level = i;
+        break;
+      }
      // Form an Address object which only includes the package level.
      Address addr(1);
-    addr.labels[0] = retval[0].first.labels[pkgLevel];
+    addr.labels[0] = retval[0].first.labels[pkg_level];
      retval[0].first = addr;
  
      if (__kmp_affinity_gran_levels < 0) {
@@ -1804,89 +1902,51 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
    // Sort the table by physical Id.
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
  
-  // Find the radix at each of the levels.
-  unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-  unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-  unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-  unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
-  for (level = 0; level < depth; level++) {
-    totals[level] = 1;
-    maxCt[level] = 1;
-    counts[level] = 1;
-    last[level] = retval[0].first.labels[level];
-  }
-
-  // From here on, the iteration variable "level" runs from the finest level to
-  // the coarsest, i.e. we iterate forward through
-  // (*address2os)[].first.labels[] - in the previous loops, we iterated
-  // backwards.
-  for (proc = 1; (int)proc < nApics; proc++) {
-    int level;
-    for (level = 0; level < depth; level++) {
-      if (retval[proc].first.labels[level] != last[level]) {
-        int j;
-        for (j = level + 1; j < depth; j++) {
-          totals[j]++;
-          counts[j] = 1;
-          // The line below causes printing incorrect topology information in
-          // case the max value for some level (maxCt[level]) is encountered
-          // earlier than some less value while going through the array. For
-          // example, let pkg0 has 4 cores and pkg1 has 2 cores. Then
-          // maxCt[1] == 2
-          // whereas it must be 4.
-          // TODO!!! Check if it can be commented safely
-          // maxCt[j] = 1;
-          last[j] = retval[proc].first.labels[j];
-        }
-        totals[level]++;
-        counts[level]++;
-        if (counts[level] > maxCt[level]) {
-          maxCt[level] = counts[level];
-        }
-        last[level] = retval[proc].first.labels[level];
-        break;
-      } else if (level == depth - 1) {
-        __kmp_free(last);
-        __kmp_free(maxCt);
-        __kmp_free(counts);
-        __kmp_free(totals);
-        __kmp_free(retval);
-        KMP_CPU_FREE(oldMask);
-        *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
-        return -1;
-      }
-    }
-  }
+  __kmp_affinity_gather_enumeration_information(retval, nApics, depth, types,
+                                                ratio, count);
  
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    // Make sure all these vars are set correctly, and return if affinity is not
    // enabled.
-  if (threadLevel >= 0) {
-    __kmp_nThreadsPerCore = maxCt[threadLevel];
-  } else {
-    __kmp_nThreadsPerCore = 1;
+  int thread_level, core_level, socket_level, die_level;
+  thread_level = core_level = die_level = socket_level = -1;
+  for (int level = 0; level < depth; ++level) {
+    if (types[level] == KMP_HW_THREAD)
+      thread_level = level;
+    else if (types[level] == KMP_HW_CORE)
+      core_level = level;
+    else if (types[level] == KMP_HW_DIE)
+      die_level = level;
+    else if (types[level] == KMP_HW_SOCKET)
+      socket_level = level;
    }
-  nPackages = totals[pkgLevel];
-
-  if (coreLevel >= 0) {
-    __kmp_ncores = totals[coreLevel];
-    nCoresPerPkg = maxCt[coreLevel];
+  __kmp_nThreadsPerCore =
+      __kmp_affinity_calculate_ratio(ratio, thread_level, core_level);
+  if (die_level > 0) {
+    nDiesPerPkg =
+        __kmp_affinity_calculate_ratio(ratio, die_level, socket_level);
+    nCoresPerPkg = __kmp_affinity_calculate_ratio(ratio, core_level, die_level);
    } else {
-    __kmp_ncores = nPackages;
-    nCoresPerPkg = 1;
+    nCoresPerPkg =
+        __kmp_affinity_calculate_ratio(ratio, core_level, socket_level);
    }
+  if (socket_level >= 0)
+    nPackages = count[socket_level];
+  else
+    nPackages = 1;
+  if (core_level >= 0)
+    __kmp_ncores = count[core_level];
+  else
+    __kmp_ncores = 1;
  
    // Check to see if the machine topology is uniform
-  unsigned prod = maxCt[0];
-  for (level = 1; level < depth; level++) {
-    prod *= maxCt[level];
-  }
-  bool uniform = (prod == totals[level - 1]);
+  unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count);
  
    // Print the machine topology summary.
    if (__kmp_affinity_verbose) {
-    KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
+    kmp_hw_t numerator_type, denominator_type;
+    KMP_INFORM(AffUseGlobCpuidL, "KMP_AFFINITY", topology_leaf);
      KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
      if (uniform) {
        KMP_INFORM(Uniform, "KMP_AFFINITY");
@@ -1897,15 +1957,31 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
      kmp_str_buf_t buf;
      __kmp_str_buf_init(&buf);
  
-    __kmp_str_buf_print(&buf, "%d", totals[0]);
-    for (level = 1; level <= pkgLevel; level++) {
-      __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
-    }
-    KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
-               __kmp_nThreadsPerCore, __kmp_ncores);
+    if (core_level < 0)
+      core_level = depth - 1;
+    int ncores = count[core_level];
  
+    denominator_type = KMP_HW_UNKNOWN;
+    for (int level = 0; level < depth; ++level) {
+      int c;
+      bool plural;
+      numerator_type = types[level];
+      c = ratio[level];
+      plural = (c > 1);
+      if (level == 0) {
+        __kmp_str_buf_print(&buf, "%d %s", c, __kmp_hw_get_catalog_string(
+                                                  numerator_type, plural));
+      } else {
+        __kmp_str_buf_print(&buf, " x %d %s/%s", c,
+                            __kmp_hw_get_catalog_string(numerator_type, plural),
+                            __kmp_hw_get_catalog_string(denominator_type));
+      }
+      denominator_type = numerator_type;
+    }
+    KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores);
      __kmp_str_buf_free(&buf);
    }
+
    KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
    KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
    __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
@@ -1913,10 +1989,6 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
      __kmp_pu_os_idx[proc] = retval[proc].second;
    }
    if (__kmp_affinity_type == affinity_none) {
-    __kmp_free(last);
-    __kmp_free(maxCt);
-    __kmp_free(counts);
-    __kmp_free(totals);
      __kmp_free(retval);
      KMP_CPU_FREE(oldMask);
      return 0;
@@ -1924,64 +1996,30 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
-  int new_depth = 0;
-  for (level = 0; level < depth; level++) {
-    if ((maxCt[level] == 1) && (level != pkgLevel)) {
-      continue;
-    }
-    new_depth++;
-  }
-
-  // If we are removing any levels, allocate a new vector to return,
-  // and copy the relevant information to it.
-  if (new_depth != depth) {
-    AddrUnsPair *new_retval =
-        (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
-    for (proc = 0; (int)proc < nApics; proc++) {
-      Address addr(new_depth);
-      new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
-    }
-    int new_level = 0;
-    int newPkgLevel = -1;
-    int newCoreLevel = -1;
-    int newThreadLevel = -1;
-    for (level = 0; level < depth; level++) {
-      if ((maxCt[level] == 1) && (level != pkgLevel)) {
-        // Remove this level. Never remove the package level
-        continue;
-      }
-      if (level == pkgLevel) {
-        newPkgLevel = new_level;
-      }
-      if (level == coreLevel) {
-        newCoreLevel = new_level;
-      }
-      if (level == threadLevel) {
-        newThreadLevel = new_level;
-      }
-      for (proc = 0; (int)proc < nApics; proc++) {
-        new_retval[proc].first.labels[new_level] =
-            retval[proc].first.labels[level];
-      }
-      new_level++;
-    }
-
-    __kmp_free(retval);
-    retval = new_retval;
-    depth = new_depth;
-    pkgLevel = newPkgLevel;
-    coreLevel = newCoreLevel;
-    threadLevel = newThreadLevel;
+  depth = __kmp_affinity_remove_radix_one_levels(retval, nApics, depth, types);
+  thread_level = core_level = die_level = socket_level = -1;
+  for (int level = 0; level < depth; ++level) {
+    if (types[level] == KMP_HW_THREAD)
+      thread_level = level;
+    else if (types[level] == KMP_HW_CORE)
+      core_level = level;
+    else if (types[level] == KMP_HW_DIE)
+      die_level = level;
+    else if (types[level] == KMP_HW_SOCKET)
+      socket_level = level;
    }
  
    if (__kmp_affinity_gran_levels < 0) {
      // Set the granularity level based on what levels are modeled
      // in the machine topology map.
      __kmp_affinity_gran_levels = 0;
-    if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
+    if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
        __kmp_affinity_gran_levels++;
      }
-    if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+    if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
+      __kmp_affinity_gran_levels++;
+    }
+    if ((die_level >= 0) && (__kmp_affinity_gran > affinity_gran_die)) {
        __kmp_affinity_gran_levels++;
      }
      if (__kmp_affinity_gran > affinity_gran_package) {
@@ -1990,14 +2028,9 @@ static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
    }
  
    if (__kmp_affinity_verbose) {
-    __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
-                                  threadLevel);
+    __kmp_affinity_print_topology(retval, nApics, depth, types);
    }
  
-  __kmp_free(last);
-  __kmp_free(maxCt);
-  __kmp_free(counts);
-  __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
@@ -3951,6 +3984,8 @@ static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
      }
      if (__kmp_hws_socket.num == 0)
        __kmp_hws_socket.num = nPackages; // use all available sockets
+    if (__kmp_hws_die.num == 0)
+      __kmp_hws_die.num = nDiesPerPkg; // use all available dies
      if (__kmp_hws_core.num == 0)
        __kmp_hws_core.num = nCoresPerPkg; // use all available cores
      if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
@@ -3959,7 +3994,7 @@ static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
        KMP_WARNING(AffHWSubsetNonUniform);
        goto _exit; // don't support non-uniform topology
      }
-    if (depth > 3) {
+    if (depth > 4) {
        KMP_WARNING(AffHWSubsetNonThreeLevel);
        goto _exit; // don't support not-3-level topology
      }
@@ -3967,6 +4002,10 @@ static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
        KMP_WARNING(AffHWSubsetManySockets);
        goto _exit;
      }
+    if (depth == 4 && __kmp_hws_die.offset + __kmp_hws_die.num > nDiesPerPkg) {
+      KMP_WARNING(AffHWSubsetManyDies);
+      goto _exit;
+    }
      if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
        KMP_WARNING(AffHWSubsetManyCores);
        goto _exit;
@@ -3974,62 +4013,84 @@ static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
      // Form the requested subset
      if (pAddr) // pAddr is NULL in case of affinity_none
        newAddr = (AddrUnsPair *)__kmp_allocate(
-          sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
-          __kmp_hws_proc.num);
+          sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_die.num *
+          __kmp_hws_core.num * __kmp_hws_proc.num);
      for (int i = 0; i < nPackages; ++i) {
        if (i < __kmp_hws_socket.offset ||
            i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
          // skip not-requested socket
-        n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
+        n_old += nDiesPerPkg * nCoresPerPkg * __kmp_nThreadsPerCore;
          if (__kmp_pu_os_idx != NULL) {
            // walk through skipped socket
-          for (int j = 0; j < nCoresPerPkg; ++j) {
-            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
-              KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
-              ++proc_num;
+          for (int l = 0; l < nDiesPerPkg; ++l) {
+            for (int j = 0; j < nCoresPerPkg; ++j) {
+              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+                KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+                ++proc_num;
+              }
              }
            }
          }
        } else {
          // walk through requested socket
-        for (int j = 0; j < nCoresPerPkg; ++j) {
-          if (j < __kmp_hws_core.offset ||
-              j >= __kmp_hws_core.offset +
-                       __kmp_hws_core.num) { // skip not-requested core
-            n_old += __kmp_nThreadsPerCore;
+        for (int l = 0; l < nDiesPerPkg; ++l) {
+          // skip unwanted die
+          if (l < __kmp_hws_die.offset ||
+              l >= __kmp_hws_die.offset + __kmp_hws_die.num) {
+            n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // all die threads
             if (__kmp_pu_os_idx != NULL) {
-              for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+              for (int k = 0; k < nCoresPerPkg * __kmp_nThreadsPerCore; ++k) {
                 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
                 ++proc_num;
               }
             }
            } else {
-            // walk through requested core
-            for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
-              if (k < __kmp_hws_proc.num) {
-                if (pAddr) // collect requested thread's data
-                  newAddr[n_new] = (*pAddr)[n_old];
-                n_new++;
+            for (int j = 0; j < nCoresPerPkg; ++j) {
+              if (j < __kmp_hws_core.offset ||
+                  j >= __kmp_hws_core.offset +
+                           __kmp_hws_core.num) { // skip not-requested core
+                n_old += __kmp_nThreadsPerCore;
+                if (__kmp_pu_os_idx != NULL) {
+                  for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+                    KMP_CPU_CLR(__kmp_pu_os_idx[proc_num],
+                                __kmp_affin_fullMask);
+                    ++proc_num;
+                  }
+                }
                } else {
-                if (__kmp_pu_os_idx != NULL)
-                  KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
+                // walk through requested core
+                for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
+                  if (k < __kmp_hws_proc.num) {
+                    if (pAddr) // collect requested thread's data
+                      newAddr[n_new] = (*pAddr)[n_old];
+                    n_new++;
+                  } else {
+                    if (__kmp_pu_os_idx != NULL)
+                      KMP_CPU_CLR(__kmp_pu_os_idx[proc_num],
+                                  __kmp_affin_fullMask);
+                  }
+                  n_old++;
+                  ++proc_num;
+                }
                }
-              n_old++;
-              ++proc_num;
              }
            }
          }
        }
      }
-    KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
+    KMP_DEBUG_ASSERT(n_old ==
+                     nPackages * nDiesPerPkg * nCoresPerPkg *
+                         __kmp_nThreadsPerCore);
      KMP_DEBUG_ASSERT(n_new ==
-                     __kmp_hws_socket.num * __kmp_hws_core.num *
-                         __kmp_hws_proc.num);
+                     __kmp_hws_socket.num * __kmp_hws_die.num *
+                         __kmp_hws_core.num * __kmp_hws_proc.num);
      nPackages = __kmp_hws_socket.num; // correct nPackages
      nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
+    nDiesPerPkg = __kmp_hws_die.num; // correct nDiesPerPkg
      __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
      __kmp_avail_proc = n_new; // correct avail_proc
-    __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
+    __kmp_ncores =
+        nPackages * nDiesPerPkg * __kmp_hws_core.num; // correct ncores
    } // non-hwloc topology method
    if (pAddr) {
      __kmp_free(*pAddr);
@@ -4395,7 +4456,8 @@ static void __kmp_aux_affinity_initialize(void) {
  
  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
  
-  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
+  else if (__kmp_affinity_top_method == affinity_top_method_x2apicid ||
+           __kmp_affinity_top_method == affinity_top_method_x2apicid_1f) {
      if (__kmp_affinity_verbose) {
        KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
      }
diff --git a/openmp/runtime/src/kmp_global.cpp b/openmp/runtime/src/kmp_global.cpp

index 4e0035e..afe7232 100644 (file)
--- a/openmp/runtime/src/kmp_global.cpp
+++ b/openmp/runtime/src/kmp_global.cpp
@@ -284,6 +284,7 @@ int __kmp_display_affinity = FALSE;
  char *__kmp_affinity_format = NULL;
  
  kmp_hws_item_t __kmp_hws_socket = {0, 0};
+kmp_hws_item_t __kmp_hws_die = {0, 0};
  kmp_hws_item_t __kmp_hws_node = {0, 0};
  kmp_hws_item_t __kmp_hws_tile = {0, 0};
  kmp_hws_item_t __kmp_hws_core = {0, 0};
diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp

index a852213..693ee34 100644 (file)
--- a/openmp/runtime/src/kmp_settings.cpp
+++ b/openmp/runtime/src/kmp_settings.cpp
@@ -2216,6 +2216,9 @@ static void __kmp_parse_affinity_env(char const *name, char const *value,
          set_gran(affinity_gran_tile, -1);
          buf = next;
  #endif
+      } else if (__kmp_match_str("die", buf, CCAST(const char **, &next))) {
+        set_gran(affinity_gran_die, -1);
+        buf = next;
        } else if (__kmp_match_str("package", buf, CCAST(const char **, &next))) {
          set_gran(affinity_gran_package, -1);
          buf = next;
@@ -2856,6 +2859,13 @@ static void __kmp_stg_parse_places(char const *name, char const *value,
      __kmp_affinity_dups = FALSE;
      kind = "\"tiles\"";
  #endif
+  } else if (__kmp_match_str("dice", scan, &next) ||
+             __kmp_match_str("dies", scan, &next)) {
+    scan = next;
+    __kmp_affinity_type = affinity_compact;
+    __kmp_affinity_gran = affinity_gran_die;
+    __kmp_affinity_dups = FALSE;
+    kind = "\"dice\"";
    } else if (__kmp_match_str("sockets", scan, &next)) {
      scan = next;
      __kmp_affinity_type = affinity_compact;
@@ -2986,28 +2996,38 @@ static void __kmp_stg_parse_topology_method(char const *name, char const *value,
    }
  #endif
  #if KMP_ARCH_X86 || KMP_ARCH_X86_64
-  else if (__kmp_str_match("x2apic id", 9, value) ||
-           __kmp_str_match("x2apic_id", 9, value) ||
-           __kmp_str_match("x2apic-id", 9, value) ||
-           __kmp_str_match("x2apicid", 8, value) ||
-           __kmp_str_match("cpuid leaf 11", 13, value) ||
-           __kmp_str_match("cpuid_leaf_11", 13, value) ||
-           __kmp_str_match("cpuid-leaf-11", 13, value) ||
-           __kmp_str_match("cpuid leaf11", 12, value) ||
-           __kmp_str_match("cpuid_leaf11", 12, value) ||
-           __kmp_str_match("cpuid-leaf11", 12, value) ||
-           __kmp_str_match("cpuidleaf 11", 12, value) ||
-           __kmp_str_match("cpuidleaf_11", 12, value) ||
-           __kmp_str_match("cpuidleaf-11", 12, value) ||
-           __kmp_str_match("cpuidleaf11", 11, value) ||
-           __kmp_str_match("cpuid 11", 8, value) ||
-           __kmp_str_match("cpuid_11", 8, value) ||
-           __kmp_str_match("cpuid-11", 8, value) ||
-           __kmp_str_match("cpuid11", 7, value) ||
-           __kmp_str_match("leaf 11", 7, value) ||
-           __kmp_str_match("leaf_11", 7, value) ||
-           __kmp_str_match("leaf-11", 7, value) ||
-           __kmp_str_match("leaf11", 6, value)) {
+  else if (__kmp_str_match("cpuid_leaf31", 12, value) ||
+           __kmp_str_match("cpuid 1f", 8, value) ||
+           __kmp_str_match("cpuid 31", 8, value) ||
+           __kmp_str_match("cpuid1f", 7, value) ||
+           __kmp_str_match("cpuid31", 7, value) ||
+           __kmp_str_match("leaf 1f", 7, value) ||
+           __kmp_str_match("leaf 31", 7, value) ||
+           __kmp_str_match("leaf1f", 6, value) ||
+           __kmp_str_match("leaf31", 6, value)) {
+    __kmp_affinity_top_method = affinity_top_method_x2apicid_1f;
+  } else if (__kmp_str_match("x2apic id", 9, value) ||
+             __kmp_str_match("x2apic_id", 9, value) ||
+             __kmp_str_match("x2apic-id", 9, value) ||
+             __kmp_str_match("x2apicid", 8, value) ||
+             __kmp_str_match("cpuid leaf 11", 13, value) ||
+             __kmp_str_match("cpuid_leaf_11", 13, value) ||
+             __kmp_str_match("cpuid-leaf-11", 13, value) ||
+             __kmp_str_match("cpuid leaf11", 12, value) ||
+             __kmp_str_match("cpuid_leaf11", 12, value) ||
+             __kmp_str_match("cpuid-leaf11", 12, value) ||
+             __kmp_str_match("cpuidleaf 11", 12, value) ||
+             __kmp_str_match("cpuidleaf_11", 12, value) ||
+             __kmp_str_match("cpuidleaf-11", 12, value) ||
+             __kmp_str_match("cpuidleaf11", 11, value) ||
+             __kmp_str_match("cpuid 11", 8, value) ||
+             __kmp_str_match("cpuid_11", 8, value) ||
+             __kmp_str_match("cpuid-11", 8, value) ||
+             __kmp_str_match("cpuid11", 7, value) ||
+             __kmp_str_match("leaf 11", 7, value) ||
+             __kmp_str_match("leaf_11", 7, value) ||
+             __kmp_str_match("leaf-11", 7, value) ||
+             __kmp_str_match("leaf11", 6, value)) {
      __kmp_affinity_top_method = affinity_top_method_x2apicid;
    } else if (__kmp_str_match("apic id", 7, value) ||
               __kmp_str_match("apic_id", 7, value) ||
@@ -4738,6 +4758,12 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
        __kmp_hws_node.num = num;
        __kmp_hws_node.offset = offset;
        break;
+    case 'D': // Die
+      if (__kmp_hws_die.num > 0)
+        goto err; // duplicate is not allowed
+      __kmp_hws_die.num = num;
+      __kmp_hws_die.offset = offset;
+      break;
      case 'L': // Cache
        if (*(pos + 1) == '2') { // L2 - Tile
          if (__kmp_hws_tile.num > 0)
@@ -4745,7 +4771,7 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
          __kmp_hws_tile.num = num;
          __kmp_hws_tile.offset = offset;
        } else if (*(pos + 1) == '3') { // L3 - Socket
-        if (__kmp_hws_socket.num > 0)
+        if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0)
            goto err; // duplicate is not allowed
          __kmp_hws_socket.num = num;
          __kmp_hws_socket.offset = offset;
@@ -4770,7 +4796,7 @@ static void __kmp_stg_parse_hw_subset(char const *name, char const *value,
            __kmp_hws_tile.num = num;
            __kmp_hws_tile.offset = offset;
          } else if (*d == '3') { // L3 - Socket
-          if (__kmp_hws_socket.num > 0)
+          if (__kmp_hws_socket.num > 0 || __kmp_hws_die.num > 0)
              goto err; // duplicate is not allowed
            __kmp_hws_socket.num = num;
            __kmp_hws_socket.offset = offset;
@@ -4817,6 +4843,12 @@ static void __kmp_stg_print_hw_subset(kmp_str_buf_t *buffer, char const *name,
          __kmp_str_buf_print(&buf, "@%d", __kmp_hws_socket.offset);
        comma = 1;
      }
+    if (__kmp_hws_die.num) {
+      __kmp_str_buf_print(&buf, "%s%dd", comma ? "," : "", __kmp_hws_die.num);
+      if (__kmp_hws_die.offset)
+        __kmp_str_buf_print(&buf, "@%d", __kmp_hws_die.offset);
+      comma = 1;
+    }
      if (__kmp_hws_node.num) {
        __kmp_str_buf_print(&buf, "%s%dn", comma ? "," : "", __kmp_hws_node.num);
        if (__kmp_hws_node.offset)
author	Peyton, Jonathan L <jonathan.l.peyton@intel.com>
	Fri, 15 Jan 2021 19:38:50 +0000 (13:38 -0600)
committer	Peyton, Jonathan L <jonathan.l.peyton@intel.com>
	Wed, 27 Jan 2021 20:27:23 +0000 (14:27 -0600)
openmp/runtime/src/i18n/en_US.txt		patch \| blob \| history
openmp/runtime/src/kmp.h		patch \| blob \| history
openmp/runtime/src/kmp_affinity.cpp		patch \| blob \| history
openmp/runtime/src/kmp_global.cpp		patch \| blob \| history
openmp/runtime/src/kmp_settings.cpp		patch \| blob \| history