[OpenMP] Fix HWLOC topology detection for 2.0.x

In HWLOC 2.0, NUMA nodes are separate memory children and are no longer
part of the main parent/child topology tree.  This change takes that
into account.  The main topology detection loop in the
create_hwloc_map() routine starts at a hardware thread within the
initial affinity mask and walks up the topology tree, setting the
socket/core/thread labels correctly.
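
For illustration, a minimal standalone sketch (assuming hwloc >= 2.0; this
program is not part of the patch) of walking up from a PU and spotting NUMA
nodes attached as memory children along the way:

#include <hwloc.h>
#include <cstdio>

int main() {
  hwloc_topology_t topo;
  hwloc_topology_init(&topo);
  hwloc_topology_load(topo);
  hwloc_obj_t root = hwloc_get_root_obj(topo);
  hwloc_obj_t obj = hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, 0);
  while (obj && obj != root) {
    obj = obj->parent;
    if (!obj)
      break;
    // In hwloc 2.x, NUMA nodes hang off regular objects as memory children
    if (obj->memory_arity) {
      hwloc_obj_t memory;
      for (memory = obj->memory_first_child; memory;
           memory = hwloc_get_next_child(topo, obj, memory)) {
        if (memory->type == HWLOC_OBJ_NUMANODE) {
          std::printf("NUMA node %u attached at the %s level\n",
                      memory->os_index, hwloc_obj_type_string(obj->type));
          break;
        }
      }
    }
  }
  hwloc_topology_destroy(topo);
  return 0;
}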

This change also introduces some of the more generic machinery that the
future kmp_topology_t structure will take advantage of, including
generic ratio & count arrays (the ratios between adjacent topology
layers, e.g., threads/core and cores/socket, and the absolute count of
objects at each layer), a generic radix-1 reduction step, a generic
uniformity check, and generic printing of the topology (en_US.txt).
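
As a worked example (hypothetical machine, not taken from the patch): on
2 sockets x 4 cores/socket x 2 threads/core, the arrays come out as

  ratio = {2, 4, 2}   // sockets, cores/socket, threads/core
  count = {2, 8, 16}  // total sockets, cores, threads

and the topology is uniform because the product of the ratios,
2 * 4 * 2 = 16, equals the count at the deepest layer.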

Differential Revision: https://reviews.llvm.org/D95156
Peyton, Jonathan L 2021-01-11 15:17:02 -06:00
parent ff038b316d
commit 9f87c6b47d
3 changed files with 500 additions and 331 deletions

openmp/runtime/src/i18n/en_US.txt

@@ -103,6 +103,26 @@ DisplayEnvEnd "OPENMP DISPLAY ENVIRONMENT END"
Device "[device]"
Host "[host]"
Tile "tile"
Tiles "tiles"
Threads "threads"
Cores "cores"
Socket "socket"
Sockets "sockets"
Die "die"
Dice "dice"
Module "module"
Modules "modules"
L1Cache "L1 cache"
L1Caches "L1 caches"
L2Cache "L2 cache"
L2Caches "L2 caches"
L3Cache "L3 cache"
L3Caches "L3 caches"
NumaDomain "NUMA domain"
NumaDomains "NUMA domains"
ProcGroup "processor group"
ProcGroups "processor groups"
Unknown "unknown"
@@ -333,6 +353,7 @@ TopologyExtraNoTi "%1$s: %2$d packages x %3$d nodes/pkg x %4$d tiles/
OmptOutdatedWorkshare "OMPT: Cannot determine workshare type; using the default (loop) instead. "
"This issue is fixed in an up-to-date compiler."
OmpNoAllocator "Allocator %1$s is not available, will use default allocator."
TopologyGeneric "%1$s: %2$s (%3$d total cores)"
# --- OpenMP errors detected at runtime ---
#

openmp/runtime/src/kmp.h

@@ -595,6 +595,32 @@ typedef int PACKED_REDUCTION_METHOD_T;
#include <pthread.h>
#endif
enum kmp_hw_t : int {
KMP_HW_UNKNOWN = -1,
KMP_HW_MACHINE = 0,
KMP_HW_SOCKET,
KMP_HW_PROC_GROUP,
KMP_HW_NUMA,
KMP_HW_DIE,
KMP_HW_L3,
KMP_HW_TILE,
KMP_HW_MODULE,
KMP_HW_L2,
KMP_HW_L1,
KMP_HW_CORE,
KMP_HW_THREAD,
KMP_HW_LAST
};
#define KMP_ASSERT_VALID_HW_TYPE(type) \
KMP_DEBUG_ASSERT(type >= (kmp_hw_t)0 && type < KMP_HW_LAST)
#define KMP_FOREACH_HW_TYPE(type) \
for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST; \
type = (kmp_hw_t)((int)type + 1))
const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural = false);
/* Only Linux* OS and Windows* OS support thread affinity. */
#if KMP_AFFINITY_SUPPORTED

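As a usage sketch of the new enum and iteration macro (standalone, with a
stubbed-out catalog lookup in place of the real i18n machinery):

#include <cstdio>

enum kmp_hw_t : int {
  KMP_HW_UNKNOWN = -1,
  KMP_HW_SOCKET = 0,
  KMP_HW_CORE,
  KMP_HW_THREAD,
  KMP_HW_LAST
};

#define KMP_FOREACH_HW_TYPE(type)                                             \
  for (kmp_hw_t type = (kmp_hw_t)0; type < KMP_HW_LAST;                       \
       type = (kmp_hw_t)((int)type + 1))

// Stand-in for __kmp_hw_get_catalog_string(); the real one reads en_US.txt
static const char *catalog(kmp_hw_t t, bool plural) {
  switch (t) {
  case KMP_HW_SOCKET: return plural ? "sockets" : "socket";
  case KMP_HW_CORE:   return plural ? "cores" : "core";
  case KMP_HW_THREAD: return plural ? "threads" : "thread";
  default:            return "unknown";
  }
}

int main() {
  KMP_FOREACH_HW_TYPE(type) // visits SOCKET, CORE, THREAD in order
    std::printf("%s\n", catalog(type, true));
}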
openmp/runtime/src/kmp_affinity.cpp

@@ -45,6 +45,218 @@ void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) {
switch (type) {
case KMP_HW_SOCKET:
return ((plural) ? KMP_I18N_STR(Sockets) : KMP_I18N_STR(Socket));
case KMP_HW_DIE:
return ((plural) ? KMP_I18N_STR(Dice) : KMP_I18N_STR(Die));
case KMP_HW_MODULE:
return ((plural) ? KMP_I18N_STR(Modules) : KMP_I18N_STR(Module));
case KMP_HW_TILE:
return ((plural) ? KMP_I18N_STR(Tiles) : KMP_I18N_STR(Tile));
case KMP_HW_NUMA:
return ((plural) ? KMP_I18N_STR(NumaDomains) : KMP_I18N_STR(NumaDomain));
case KMP_HW_L3:
return ((plural) ? KMP_I18N_STR(L3Caches) : KMP_I18N_STR(L3Cache));
case KMP_HW_L2:
return ((plural) ? KMP_I18N_STR(L2Caches) : KMP_I18N_STR(L2Cache));
case KMP_HW_L1:
return ((plural) ? KMP_I18N_STR(L1Caches) : KMP_I18N_STR(L1Cache));
case KMP_HW_CORE:
return ((plural) ? KMP_I18N_STR(Cores) : KMP_I18N_STR(Core));
case KMP_HW_THREAD:
return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread));
case KMP_HW_PROC_GROUP:
return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup));
}
return KMP_I18N_STR(Unknown);
}
#if KMP_USE_HWLOC
// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is a
// machine with one thread context per core: the extra thread-context level
// offers no unique labels, so it is removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
int depth, kmp_hw_t *types) {
int preference[KMP_HW_LAST];
int top_index1, top_index2;
// Set up preference associative array
preference[KMP_HW_PROC_GROUP] = 110;
preference[KMP_HW_SOCKET] = 100;
preference[KMP_HW_CORE] = 95;
preference[KMP_HW_THREAD] = 90;
preference[KMP_HW_DIE] = 85;
preference[KMP_HW_NUMA] = 80;
preference[KMP_HW_TILE] = 75;
preference[KMP_HW_MODULE] = 73;
preference[KMP_HW_L3] = 70;
preference[KMP_HW_L2] = 65;
preference[KMP_HW_L1] = 60;
top_index1 = 0;
top_index2 = 1;
while (top_index1 < depth - 1 && top_index2 < depth) {
KMP_DEBUG_ASSERT(top_index1 >= 0 && top_index1 < depth);
KMP_DEBUG_ASSERT(top_index2 >= 0 && top_index2 < depth);
kmp_hw_t type1 = types[top_index1];
kmp_hw_t type2 = types[top_index2];
if (type1 == KMP_HW_SOCKET && type2 == KMP_HW_CORE) {
top_index1 = top_index2++;
continue;
}
bool radix1 = true;
bool all_same = true;
int id1 = addrP[0].first.labels[top_index1];
int id2 = addrP[0].first.labels[top_index2];
int pref1 = preference[type1];
int pref2 = preference[type2];
for (int hwidx = 1; hwidx < nTh; ++hwidx) {
if (addrP[hwidx].first.labels[top_index1] == id1 &&
addrP[hwidx].first.labels[top_index2] != id2) {
radix1 = false;
break;
}
if (addrP[hwidx].first.labels[top_index2] != id2)
all_same = false;
id1 = addrP[hwidx].first.labels[top_index1];
id2 = addrP[hwidx].first.labels[top_index2];
}
if (radix1) {
// Select the layer to remove based on preference
kmp_hw_t remove_type, keep_type;
int remove_layer, remove_layer_ids;
if (pref1 > pref2) {
remove_type = type2;
remove_layer = remove_layer_ids = top_index2;
keep_type = type1;
} else {
remove_type = type1;
remove_layer = remove_layer_ids = top_index1;
keep_type = type2;
}
// If all the ids for the second (deeper) layer are the same,
// e.g., all are zero, then make sure to keep the first layer's ids
if (all_same)
remove_layer_ids = top_index2;
// Remove radix one type by setting the equivalence, removing the id from
// the hw threads and removing the layer from types and depth
for (int idx = 0; idx < nTh; ++idx) {
Address &hw_thread = addrP[idx].first;
for (int d = remove_layer_ids; d < depth - 1; ++d)
hw_thread.labels[d] = hw_thread.labels[d + 1];
hw_thread.depth--;
}
for (int idx = remove_layer; idx < depth - 1; ++idx)
types[idx] = types[idx + 1];
depth--;
} else {
top_index1 = top_index2++;
}
}
KMP_ASSERT(depth > 0);
return depth;
}
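A simplified standalone illustration of the radix-1 test above, on a sorted
toy label table with one thread context per core (not the runtime's exact
bookkeeping):

#include <cstdio>

int main() {
  // {socket, core, thread} labels for 4 hw threads, sorted
  int labels[4][3] = {{0, 0, 0}, {0, 1, 0}, {1, 0, 0}, {1, 1, 0}};
  bool radix1 = true;
  for (int i = 1; i < 4 && radix1; ++i)
    // same core id but a different thread id would mean the thread layer
    // distinguishes hw threads, i.e., it is not radix 1
    if (labels[i][1] == labels[i - 1][1] && labels[i][2] != labels[i - 1][2])
      radix1 = false;
  std::printf("thread layer radix 1: %s\n", radix1 ? "yes" : "no"); // yes
  // so the thread column would be removed and depth would drop from 3 to 2
}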
// Gather the count of each topology layer and the ratios between them:
// ratio[i] holds the number of types[i] objects per types[i-1] object
// (ratio[0] is the total number of types[0] objects), and count[i] holds
// the absolute number of types[i] objects
static void __kmp_affinity_gather_enumeration_information(AddrUnsPair *addrP,
int nTh, int depth,
kmp_hw_t *types,
int *ratio,
int *count) {
int previous_id[KMP_HW_LAST];
int max[KMP_HW_LAST];
for (int i = 0; i < depth; ++i) {
previous_id[i] = -1;
max[i] = 0;
count[i] = 0;
ratio[i] = 0;
}
for (int i = 0; i < nTh; ++i) {
Address &hw_thread = addrP[i].first;
for (int layer = 0; layer < depth; ++layer) {
int id = hw_thread.labels[layer];
if (id != previous_id[layer]) {
// Add an additional increment to each count
for (int l = layer; l < depth; ++l)
count[l]++;
// Keep track of topology layer ratio statistics
max[layer]++;
for (int l = layer + 1; l < depth; ++l) {
if (max[l] > ratio[l])
ratio[l] = max[l];
max[l] = 1;
}
break;
}
}
for (int layer = 0; layer < depth; ++layer) {
previous_id[layer] = hw_thread.labels[layer];
}
}
for (int layer = 0; layer < depth; ++layer) {
if (max[layer] > ratio[layer])
ratio[layer] = max[layer];
}
}
// Find out if the topology is uniform
static bool __kmp_affinity_discover_uniformity(int depth, int *ratio,
int *count) {
int num = 1;
for (int level = 0; level < depth; ++level)
num *= ratio[level];
return (num == count[depth - 1]);
}
// Calculate the number of objects at deep_level per object at
// shallow_level, e.g., the number of cores per socket
static inline int __kmp_hwloc_calculate_ratio(int *ratio, int deep_level,
int shallow_level) {
int retval = 1;
if (deep_level < 0 || shallow_level < 0)
return retval;
for (int level = deep_level; level > shallow_level; --level)
retval *= ratio[level];
return retval;
}
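Pulling the helpers together, a self-contained sketch (toy data and
illustrative names only) that reproduces the ratio/count gathering and the
uniformity test on a sorted 2 x 2 x 2 label table:

#include <cstdio>

int main() {
  const int depth = 3, nTh = 8;
  int labels[nTh][depth]; // sorted {socket, core, thread} labels
  for (int i = 0; i < nTh; ++i) {
    labels[i][0] = i / 4;       // socket
    labels[i][1] = (i / 2) % 2; // core
    labels[i][2] = i % 2;       // thread
  }
  int previous[depth], max[depth], count[depth], ratio[depth];
  for (int l = 0; l < depth; ++l)
    previous[l] = -1, max[l] = count[l] = ratio[l] = 0;
  for (int i = 0; i < nTh; ++i) {
    for (int layer = 0; layer < depth; ++layer) {
      if (labels[i][layer] != previous[layer]) {
        // a new object starts here and at every deeper layer
        for (int l = layer; l < depth; ++l)
          count[l]++;
        max[layer]++;
        for (int l = layer + 1; l < depth; ++l) {
          if (max[l] > ratio[l])
            ratio[l] = max[l]; // remember the widest sibling group seen
          max[l] = 1;
        }
        break;
      }
    }
    for (int layer = 0; layer < depth; ++layer)
      previous[layer] = labels[i][layer];
  }
  for (int layer = 0; layer < depth; ++layer)
    if (max[layer] > ratio[layer])
      ratio[layer] = max[layer];
  int num = 1;
  for (int l = 0; l < depth; ++l)
    num *= ratio[l];
  // prints ratio = {2, 2, 2}, count = {2, 4, 8}, uniform = yes
  std::printf("ratio = {%d, %d, %d}, count = {%d, %d, %d}, uniform = %s\n",
              ratio[0], ratio[1], ratio[2], count[0], count[1], count[2],
              num == count[depth - 1] ? "yes" : "no");
}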
#endif // KMP_USE_HWLOC
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
int depth, int pkgLevel,
int coreLevel, int threadLevel) {
int proc;
KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
for (proc = 0; proc < len; proc++) {
int level;
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
for (level = 0; level < depth; level++) {
if (level == threadLevel) {
__kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
} else if (level == coreLevel) {
__kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
} else if (level == pkgLevel) {
__kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
} else if (level > pkgLevel) {
__kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
level - pkgLevel - 1);
} else {
__kmp_str_buf_print(&buf, "L%d ", level);
}
__kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
}
KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
buf.str);
__kmp_str_buf_free(&buf);
}
}
#if KMP_AFFINITY_SUPPORTED
bool KMPAffinity::picked_api = false;
@@ -313,72 +525,18 @@ inline static bool __kmp_affinity_uniform_topology() {
return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
int depth, int pkgLevel,
int coreLevel, int threadLevel) {
int proc;
KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
for (proc = 0; proc < len; proc++) {
int level;
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
for (level = 0; level < depth; level++) {
if (level == threadLevel) {
__kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
} else if (level == coreLevel) {
__kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
} else if (level == pkgLevel) {
__kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
} else if (level > pkgLevel) {
__kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
level - pkgLevel - 1);
} else {
__kmp_str_buf_print(&buf, "L%d ", level);
}
__kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
}
KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
buf.str);
__kmp_str_buf_free(&buf);
}
}
#if KMP_USE_HWLOC
static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
int depth, int *levels) {
int depth, kmp_hw_t *types) {
int proc;
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
for (proc = 0; proc < len; proc++) {
__kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
addrP[proc].first.labels[0]);
if (depth > 1) {
int level = 1; // iterate over levels
int label = 1; // iterate over labels
if (__kmp_numa_detected)
// node level follows package
if (levels[level++] > 0)
__kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
addrP[proc].first.labels[label++]);
if (__kmp_tile_depth > 0)
// tile level follows node if any, or package
if (levels[level++] > 0)
__kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
addrP[proc].first.labels[label++]);
if (levels[level++] > 0)
// core level follows
__kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
addrP[proc].first.labels[label++]);
if (levels[level++] > 0)
// thread level is the latest
__kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
addrP[proc].first.labels[label++]);
KMP_DEBUG_ASSERT(label == depth);
for (int i = 0; i < depth; ++i) {
__kmp_str_buf_print(&buf, "%s %d ", __kmp_hw_get_catalog_string(types[i]),
addrP[proc].first.labels[i]);
}
KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
__kmp_str_buf_clear(&buf);
@@ -386,53 +544,47 @@ static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
__kmp_str_buf_free(&buf);
}
static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
static inline bool __kmp_hwloc_is_cache_type(hwloc_obj_t obj) {
#if HWLOC_API_VERSION >= 0x00020000
return hwloc_obj_type_is_cache(obj->type);
#else
return obj->type == HWLOC_OBJ_CACHE;
#endif
}
// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when you
// have one thread context per core, we don't want the extra thread context
// level if it offers no unique labels. So they are removed.
// return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
int depth, int *levels) {
int level;
int i;
int radix1_detected;
int new_depth = depth;
for (level = depth - 1; level > 0; --level) {
// Detect if this level is radix 1
radix1_detected = 1;
for (i = 1; i < nTh; ++i) {
if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
// There are differing label values for this level so it stays
radix1_detected = 0;
break;
}
}
if (!radix1_detected)
continue;
// Radix 1 was detected
--new_depth;
levels[level] = -1; // mark level as not present in address2os array
if (level == new_depth) {
// "turn off" deepest level, just decrement the depth that removes
// the level from address2os array
for (i = 0; i < nTh; ++i) {
addrP[i].first.depth--;
}
} else {
// For other levels, we move labels over and also reduce the depth
int j;
for (j = level; j < new_depth; ++j) {
for (i = 0; i < nTh; ++i) {
addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
addrP[i].first.depth--;
}
levels[j + 1] -= 1;
// Returns KMP_HW_* type derived from HWLOC_* type
static inline kmp_hw_t __kmp_hwloc_type_2_topology_type(hwloc_obj_t obj) {
if (__kmp_hwloc_is_cache_type(obj)) {
if (obj->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION)
return KMP_HW_UNKNOWN;
switch (obj->attr->cache.depth) {
case 1:
return KMP_HW_L1;
case 2:
#if KMP_MIC_SUPPORTED
if (__kmp_mic_type == mic3) {
return KMP_HW_TILE;
}
#endif
return KMP_HW_L2;
case 3:
return KMP_HW_L3;
}
return KMP_HW_UNKNOWN;
}
return new_depth;
switch (obj->type) {
case HWLOC_OBJ_PACKAGE:
return KMP_HW_SOCKET;
case HWLOC_OBJ_NUMANODE:
return KMP_HW_NUMA;
case HWLOC_OBJ_CORE:
return KMP_HW_CORE;
case HWLOC_OBJ_PU:
return KMP_HW_THREAD;
}
return KMP_HW_UNKNOWN;
}
// Returns the number of objects of type 'type' below 'obj' within the topology
@@ -484,93 +636,48 @@ static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
return sum; // will be 0 if none were found (as PU arity is 0)
}
static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
int &nActiveThreads,
int &num_active_cores,
hwloc_obj_t obj, int depth,
int *labels) {
hwloc_obj_t core = NULL;
hwloc_topology_t &tp = __kmp_hwloc_topology;
int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
hwloc_obj_t pu = NULL;
KMP_DEBUG_ASSERT(core != NULL);
int num_active_threads = 0;
int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
// int NT = core->arity; pu = core->first_child; // faster?
for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
KMP_DEBUG_ASSERT(pu != NULL);
if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
continue; // skip inactive (inaccessible) unit
Address addr(depth + 2);
KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
obj->os_index, obj->logical_index, core->os_index,
core->logical_index, pu->os_index, pu->logical_index));
for (int i = 0; i < depth; ++i)
addr.labels[i] = labels[i]; // package, etc.
addr.labels[depth] = core_id; // core
addr.labels[depth + 1] = pu_id; // pu
addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
__kmp_pu_os_idx[nActiveThreads] = pu->os_index;
nActiveThreads++;
++num_active_threads; // count active threads per core
}
if (num_active_threads) { // were there any active threads on the core?
++__kmp_ncores; // count total active cores
++num_active_cores; // count active cores per socket
if (num_active_threads > __kmp_nThreadsPerCore)
__kmp_nThreadsPerCore = num_active_threads; // calc maximum
// This gets the sub_id for a lower object under a higher object in the
// topology tree
static int __kmp_hwloc_get_sub_id(hwloc_topology_t t, hwloc_obj_t higher,
hwloc_obj_t lower) {
hwloc_obj_t obj;
hwloc_obj_type_t ltype = lower->type;
int lindex = lower->logical_index - 1;
int sub_id = 0;
// Get the previous lower object
obj = hwloc_get_obj_by_type(t, ltype, lindex);
while (obj && lindex >= 0 &&
hwloc_bitmap_isincluded(obj->cpuset, higher->cpuset)) {
if (obj->userdata) {
sub_id = (int)(RCAST(kmp_intptr_t, obj->userdata));
break;
}
sub_id++;
lindex--;
obj = hwloc_get_obj_by_type(t, ltype, lindex);
}
return 0;
}
// Check if NUMA node detected below the package,
// and if tile object is detected and return its depth
static int __kmp_hwloc_check_numa() {
hwloc_topology_t &tp = __kmp_hwloc_topology;
hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
int depth, l2cache_depth, package_depth;
// Get some PU
hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
if (hT == NULL) // something has gone wrong
return 1;
// check NUMA node below PACKAGE
hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
KMP_DEBUG_ASSERT(hS != NULL);
if (hN != NULL && hN->depth > hS->depth) {
__kmp_numa_detected = TRUE; // socket includes node(s)
if (__kmp_affinity_gran == affinity_gran_node) {
__kmp_affinity_gran = affinity_gran_numa;
}
}
package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE);
l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
// check tile, get object by depth because of multiple caches possible
depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth;
hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
hC = NULL; // not used, but reset it here just in case
if (hL != NULL &&
__kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
__kmp_tile_depth = depth; // tile consists of multiple cores
return 0;
// store sub_id + 1 so that 0 is distinguished from NULL
lower->userdata = RCAST(void *, sub_id + 1);
return sub_id;
}
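The userdata memoization above can be seen in isolation in this toy sketch
(plain structs standing in for hwloc objects; all names illustrative): each
object's sub id under its ancestor is derived from the nearest
already-visited sibling instead of rescanning the whole list:

#include <cstdio>
#include <vector>

// objs are in logical order, `ancestor` identifies the enclosing "higher"
// object, and `cached` stores sub_id + 1 so that 0 can mean "not computed
// yet" (mirroring the NULL userdata check in the real code)
struct Obj {
  int ancestor;
  int cached;
};

static int get_sub_id(std::vector<Obj> &objs, int i) {
  int sub_id = 0;
  // Walk previous same-type objects while they fall under the same ancestor
  for (int j = i - 1; j >= 0 && objs[j].ancestor == objs[i].ancestor; --j) {
    if (objs[j].cached) {      // sibling already knows its position
      sub_id = objs[j].cached; // its sub_id + 1 is exactly our sub_id
      break;
    }
    sub_id++;
  }
  objs[i].cached = sub_id + 1;
  return sub_id;
}

int main() {
  // four cores: two under package 0, two under package 1
  std::vector<Obj> cores = {{0, 0}, {0, 0}, {1, 0}, {1, 0}};
  for (int i = 0; i < 4; ++i)
    std::printf("core %d -> sub_id %d\n", i, get_sub_id(cores, i));
  // prints 0, 1, 0, 1: sub ids restart under each package
}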
static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
kmp_i18n_id_t *const msg_id) {
hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
*address2os = NULL;
kmp_hw_t type;
int hw_thread_index, sub_id, nActiveThreads;
int depth;
hwloc_obj_t pu, obj, root, prev;
int ratio[KMP_HW_LAST];
int count[KMP_HW_LAST];
kmp_hw_t types[KMP_HW_LAST];
hwloc_topology_t tp = __kmp_hwloc_topology;
*msg_id = kmp_i18n_null;
// Save the affinity mask for the current thread.
kmp_affin_mask_t *oldMask;
KMP_CPU_ALLOC(oldMask);
__kmp_get_system_affinity(oldMask, TRUE);
__kmp_hwloc_check_numa();
if (!KMP_AFFINITY_CAPABLE()) {
// Hack to try and infer the machine topology using only the data
@@ -606,13 +713,44 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
return 0;
}
int depth = 3;
int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
int labels[3] = {0}; // package [,node] [,tile] - head of labels array
if (__kmp_numa_detected)
++depth;
if (__kmp_tile_depth)
++depth;
root = hwloc_get_root_obj(tp);
// Figure out the depth and types in the topology
depth = 0;
pu = hwloc_get_pu_obj_by_os_index(tp, __kmp_affin_fullMask->begin());
obj = pu;
types[depth] = KMP_HW_THREAD;
depth++;
while (obj != root && obj != NULL) {
obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
if (obj->memory_arity) {
hwloc_obj_t memory;
for (memory = obj->memory_first_child; memory;
memory = hwloc_get_next_child(tp, obj, memory)) {
if (memory->type == HWLOC_OBJ_NUMANODE)
break;
}
if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
types[depth] = KMP_HW_NUMA;
depth++;
}
}
#endif
type = __kmp_hwloc_type_2_topology_type(obj);
if (type != KMP_HW_UNKNOWN) {
types[depth] = type;
depth++;
}
}
KMP_ASSERT(depth > 0 && depth <= KMP_HW_LAST);
// Get the order for the types correct
for (int i = 0, j = depth - 1; i < j; ++i, --j) {
kmp_hw_t temp = types[i];
types[i] = types[j];
types[j] = temp;
}
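// For example (hypothetical machine): the bottom-up walk records
// types = {THREAD, CORE, NUMA, SOCKET}; the reversal above turns this into
// types = {SOCKET, NUMA, CORE, THREAD}, outermost layer first.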
// Allocate the data structure to be returned.
AddrUnsPair *retval =
@@ -620,105 +758,60 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
__kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
// When affinity is off, this routine will still be called to set
// __kmp_ncores, as well as __kmp_nThreadsPerCore,
// nCoresPerPkg, & nPackages. Make sure all these vars are set
// correctly, and return if affinity is not enabled.
hwloc_obj_t socket, node, tile;
int nActiveThreads = 0;
int socket_id = 0;
// re-calculate globals to count only accessible resources
__kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
socket_id++) {
labels[0] = socket_id;
if (__kmp_numa_detected) {
int NN;
int n_active_nodes = 0;
node = NULL;
NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
&node);
for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
labels[1] = node_id;
if (__kmp_tile_depth) {
// NUMA + tiles
int NT;
int n_active_tiles = 0;
tile = NULL;
NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
&tile);
for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
labels[2] = tl_id;
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
n_active_cores, tile, 3, labels);
if (n_active_cores) { // were there any active cores on the socket?
++n_active_tiles; // count active tiles per node
if (n_active_cores > nCorePerTile)
nCorePerTile = n_active_cores; // calc maximum
}
}
if (n_active_tiles) { // were there any active tiles on the socket?
++n_active_nodes; // count active nodes per package
if (n_active_tiles > nTilePerNode)
nTilePerNode = n_active_tiles; // calc maximum
}
} else {
// NUMA, no tiles
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
n_active_cores, node, 2, labels);
if (n_active_cores) { // were there any active cores on the socket?
++n_active_nodes; // count active nodes per package
if (n_active_cores > nCorePerNode)
nCorePerNode = n_active_cores; // calc maximum
hw_thread_index = 0;
pu = NULL;
nActiveThreads = 0;
while ((pu = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, pu))) {
int index = depth - 1;
bool included = KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask);
Address hw_thread(depth);
if (included) {
hw_thread.labels[index] = pu->logical_index;
__kmp_pu_os_idx[hw_thread_index] = pu->os_index;
index--;
nActiveThreads++;
}
obj = pu;
prev = obj;
while (obj != root && obj != NULL) {
obj = obj->parent;
#if HWLOC_API_VERSION >= 0x00020000
// NUMA Nodes are handled differently since they are not within the
// parent/child structure anymore. They are separate children
// of obj (memory_first_child points to first memory child)
if (obj->memory_arity) {
hwloc_obj_t memory;
for (memory = obj->memory_first_child; memory;
memory = hwloc_get_next_child(tp, obj, memory)) {
if (memory->type == HWLOC_OBJ_NUMANODE)
break;
}
if (memory && memory->type == HWLOC_OBJ_NUMANODE) {
sub_id = __kmp_hwloc_get_sub_id(tp, memory, prev);
if (included) {
hw_thread.labels[index] = memory->logical_index;
hw_thread.labels[index + 1] = sub_id;
index--;
}
prev = memory;
}
}
if (n_active_nodes) { // were there any active nodes on the socket?
++nPackages; // count total active packages
if (n_active_nodes > nNodePerPkg)
nNodePerPkg = n_active_nodes; // calc maximum
}
} else {
if (__kmp_tile_depth) {
// no NUMA, tiles
int NT;
int n_active_tiles = 0;
tile = NULL;
NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
&tile);
for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
labels[1] = tl_id;
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
n_active_cores, tile, 2, labels);
if (n_active_cores) { // were there any active cores on the socket?
++n_active_tiles; // count active tiles per package
if (n_active_cores > nCorePerTile)
nCorePerTile = n_active_cores; // calc maximum
}
}
if (n_active_tiles) { // were there any active tiles on the socket?
++nPackages; // count total active packages
if (n_active_tiles > nTilePerPkg)
nTilePerPkg = n_active_tiles; // calc maximum
}
} else {
// no NUMA, no tiles
int n_active_cores = 0;
__kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
socket, 1, labels);
if (n_active_cores) { // were there any active cores on the socket?
++nPackages; // count total active packages
if (n_active_cores > nCoresPerPkg)
nCoresPerPkg = n_active_cores; // calc maximum
}
#endif
type = __kmp_hwloc_type_2_topology_type(obj);
if (type != KMP_HW_UNKNOWN) {
sub_id = __kmp_hwloc_get_sub_id(tp, obj, prev);
if (included) {
hw_thread.labels[index] = obj->logical_index;
hw_thread.labels[index + 1] = sub_id;
index--;
}
prev = obj;
}
}
if (included) {
retval[hw_thread_index] = AddrUnsPair(hw_thread, pu->os_index);
hw_thread_index++;
}
}
// If there's only one thread context to bind to, return now.
@@ -763,54 +856,86 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
qsort(retval, nActiveThreads, sizeof(*retval),
__kmp_affinity_cmp_Address_labels);
// Check to see if the machine topology is uniform
int nPUs = nPackages * __kmp_nThreadsPerCore;
if (__kmp_numa_detected) {
if (__kmp_tile_depth) { // NUMA + tiles
nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
} else { // NUMA, no tiles
nPUs *= (nNodePerPkg * nCorePerNode);
}
} else {
if (__kmp_tile_depth) { // no NUMA, tiles
nPUs *= (nTilePerPkg * nCorePerTile);
} else { // no NUMA, no tiles
nPUs *= nCoresPerPkg;
}
// Find any levels with radix 1, and remove them from the map
// (except for the package level).
depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
types);
__kmp_affinity_gather_enumeration_information(retval, nActiveThreads, depth,
types, ratio, count);
for (int level = 0; level < depth; ++level) {
if ((types[level] == KMP_HW_L2 || types[level] == KMP_HW_L3))
__kmp_tile_depth = level;
}
unsigned uniform = (nPUs == nActiveThreads);
// This routine should set __kmp_ncores, as well as
// __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
int thread_level, core_level, tile_level, numa_level, socket_level;
thread_level = core_level = tile_level = numa_level = socket_level = -1;
for (int level = 0; level < depth; ++level) {
if (types[level] == KMP_HW_THREAD)
thread_level = level;
else if (types[level] == KMP_HW_CORE)
core_level = level;
else if (types[level] == KMP_HW_SOCKET)
socket_level = level;
else if (types[level] == KMP_HW_TILE)
tile_level = level;
else if (types[level] == KMP_HW_NUMA)
numa_level = level;
}
__kmp_nThreadsPerCore =
__kmp_hwloc_calculate_ratio(ratio, thread_level, core_level);
nCoresPerPkg = __kmp_hwloc_calculate_ratio(ratio, core_level, socket_level);
if (socket_level >= 0)
nPackages = count[socket_level];
else
nPackages = 1;
if (core_level >= 0)
__kmp_ncores = count[core_level];
else
__kmp_ncores = 1;
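// Worked example (hypothetical numbers): with types = {SOCKET, NUMA, CORE,
// THREAD}, ratio = {2, 2, 4, 2} and count = {2, 4, 16, 32}, the derivations
// above give __kmp_nThreadsPerCore = ratio[3] = 2, nCoresPerPkg =
// ratio[2] * ratio[1] = 8 (the ratio collapses across the NUMA level),
// nPackages = count[0] = 2, and __kmp_ncores = count[2] = 16.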
unsigned uniform = __kmp_affinity_discover_uniformity(depth, ratio, count);
// Print the machine topology summary.
if (__kmp_affinity_verbose) {
kmp_hw_t numerator_type, denominator_type;
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
if (uniform) {
KMP_INFORM(Uniform, "KMP_AFFINITY");
} else {
KMP_INFORM(NonUniform, "KMP_AFFINITY");
}
if (__kmp_numa_detected) {
if (__kmp_tile_depth) { // NUMA + tiles
KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
__kmp_ncores);
} else { // NUMA, no tiles
KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
nPUs *= (nNodePerPkg * nCorePerNode);
}
} else {
if (__kmp_tile_depth) { // no NUMA, tiles
KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
} else { // no NUMA, no tiles
kmp_str_buf_t buf;
__kmp_str_buf_init(&buf);
__kmp_str_buf_print(&buf, "%d", nPackages);
KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
__kmp_nThreadsPerCore, __kmp_ncores);
__kmp_str_buf_free(&buf);
__kmp_str_buf_clear(&buf);
if (core_level < 0)
core_level = depth - 1;
int ncores = count[core_level];
denominator_type = KMP_HW_UNKNOWN;
for (int level = 0; level < depth; ++level) {
int c;
bool plural;
numerator_type = types[level];
c = ratio[level];
plural = (c > 1);
if (level == 0) {
__kmp_str_buf_print(&buf, "%d %s", c, __kmp_hw_get_catalog_string(
numerator_type, plural));
} else {
__kmp_str_buf_print(&buf, " x %d %s/%s", c,
__kmp_hw_get_catalog_string(numerator_type, plural),
__kmp_hw_get_catalog_string(denominator_type));
}
denominator_type = numerator_type;
}
KMP_INFORM(TopologyGeneric, "KMP_AFFINITY", buf.str, ncores);
__kmp_str_buf_free(&buf);
}
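// On the hypothetical 2 sockets x 4 cores/socket x 2 threads/core machine
// from the sketches above, the TopologyGeneric summary would print roughly:
//   KMP_AFFINITY: 2 sockets x 4 cores/socket x 2 threads/core (8 total cores)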
if (__kmp_affinity_type == affinity_none) {
@@ -819,30 +944,27 @@ static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
return 0;
}
int depth_full = depth; // number of levels before compressing
// Find any levels with radix 1, and remove them from the map
// (except for the package level).
depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
levels);
// Set the granularity level based on what levels are modeled
// in the machine topology map.
if (__kmp_affinity_gran == affinity_gran_node)
__kmp_affinity_gran = affinity_gran_numa;
KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
if (__kmp_affinity_gran_levels < 0) {
// Set the granularity level based on what levels are modeled
// in the machine topology map.
__kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
if (__kmp_affinity_gran > affinity_gran_thread) {
for (int i = 1; i <= depth_full; ++i) {
if (__kmp_affinity_gran <= i) // only count deeper levels
break;
if (levels[depth_full - i] > 0)
__kmp_affinity_gran_levels++;
}
}
if (__kmp_affinity_gran > affinity_gran_package)
__kmp_affinity_gran_levels++; // e.g. granularity = group
if ((thread_level >= 0) && (__kmp_affinity_gran > affinity_gran_thread))
__kmp_affinity_gran_levels++;
if ((core_level >= 0) && (__kmp_affinity_gran > affinity_gran_core))
__kmp_affinity_gran_levels++;
if ((tile_level >= 0) && (__kmp_affinity_gran > affinity_gran_tile))
__kmp_affinity_gran_levels++;
if ((numa_level >= 0) && (__kmp_affinity_gran > affinity_gran_numa))
__kmp_affinity_gran_levels++;
if ((socket_level >= 0) && (__kmp_affinity_gran > affinity_gran_package))
__kmp_affinity_gran_levels++;
}
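// Example (hypothetical): on a {SOCKET, NUMA, CORE, THREAD} topology with
// granularity set to core, only the thread level lies below the requested
// granularity, so __kmp_affinity_gran_levels ends up as 1.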
if (__kmp_affinity_verbose)
__kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
__kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, types);
KMP_CPU_FREE(oldMask);
*address2os = retval;