Cleanup of affinity hierarchy code.

Some of this is an improvement to code suggested by Hal Finkel. There are four changes here:
1. Cleaned up the hierarchy code so it handles all hierarchy cases, whether affinity is available or not (the fallback arithmetic is sketched below)
2. Separated this class, related classes, and common functions out into a header file
3. Added a destructor-like fini function for the hierarchy (and a call to it in __kmp_cleanup)
4. Removed some redundant code that is hopefully no longer needed
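
For context on change 1: when no affinity information is available, init() falls back to a synthetic tree with at most maxLeaves children per leaf-level parent, and levels beyond the machine depth double their span to absorb oversubscription. A minimal standalone sketch of that arithmetic (plain unsigned ints in place of the kmp_* types, omitting the width-optimization pass; illustrative only, not the runtime code):

#include <cstdio>

// Standalone sketch: mirrors the fallback arithmetic in hierarchy_info::init
// (see the new kmp_affinity.h in the diff below), using plain unsigned ints
// instead of the kmp_* types and omitting the width-optimization loop.
int main() {
    const unsigned maxLevels = 7, maxLeaves = 4;
    const unsigned nproc = 24; // hypothetical thread count
    unsigned numPerLevel[maxLevels], skipPerLevel[maxLevels];
    for (unsigned i = 0; i < maxLevels; ++i)
        numPerLevel[i] = skipPerLevel[i] = 1;
    // No affinity info: assume maxLeaves threads per leaf-level parent.
    numPerLevel[0] = maxLeaves;                                    // {4, ...}
    numPerLevel[1] = nproc / maxLeaves + (nproc % maxLeaves != 0); // 6
    unsigned depth = 1;
    for (int i = maxLevels - 1; i >= 0; --i)  // count non-trivial levels,
        if (numPerLevel[i] != 1 || depth > 1) // keeping one trailing '1'
            depth++;                          // -> depth == 3 here
    // skipPerLevel[i]: threads spanned by one subtree at level i; levels past
    // the machine depth simply double, to absorb oversubscription.
    for (unsigned i = 1; i < maxLevels; ++i)
        skipPerLevel[i] = (i < depth) ? numPerLevel[i - 1] * skipPerLevel[i - 1]
                                      : 2 * skipPerLevel[i - 1];
    for (unsigned i = 0; i < maxLevels; ++i)
        printf("level %u: num=%u skip=%u\n", i, numPerLevel[i], skipPerLevel[i]);
    return 0;
}

For 24 threads this prints numPerLevel = {4, 6, 1, 1, 1, 1, 1} and skipPerLevel = {1, 4, 24, 48, 96, 192, 384}; the real init() additionally narrows levels wider than the branching factor, as the header diff below shows.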

Differential Revision: http://reviews.llvm.org/D12449

llvm-svn: 247326
Jonathan Peyton 2015-09-10 19:22:07 +00:00
parent 6f94087329
commit 1707836b68
4 changed files with 310 additions and 456 deletions


@@ -2984,6 +2984,7 @@ extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
extern void __kmp_balanced_affinity( int tid, int team_size );
#endif /* KMP_AFFINITY_SUPPORTED */
extern void __kmp_cleanup_hierarchy();
extern void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar);
#if KMP_OS_LINUX && (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)


@@ -18,6 +18,34 @@
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_affinity.h"
// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;
void __kmp_cleanup_hierarchy() {
machine_hierarchy.fini();
}
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
kmp_uint32 depth;
// The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
if (TCR_1(machine_hierarchy.uninitialized))
machine_hierarchy.init(NULL, nproc);
// Adjust the hierarchy in case num threads exceeds original
if (nproc > machine_hierarchy.base_num_threads)
machine_hierarchy.resize(nproc);
depth = machine_hierarchy.depth;
KMP_DEBUG_ASSERT(depth > 0);
// The loop below adjusts the depth in the case of a resize
while (nproc > machine_hierarchy.skipPerLevel[depth-1])
depth++;
thr_bar->depth = depth;
thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
#if KMP_AFFINITY_SUPPORTED
@@ -108,393 +136,6 @@ __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
}
}
//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings. I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions are
// more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//
# if !defined(KMP_DEBUG) && !defined(COVER)
class Address {
public:
static const unsigned maxDepth = 32;
unsigned labels[maxDepth];
unsigned childNums[maxDepth];
unsigned depth;
unsigned leader;
Address(unsigned _depth)
: depth(_depth), leader(FALSE) {
}
Address &operator=(const Address &b) {
depth = b.depth;
for (unsigned i = 0; i < depth; i++) {
labels[i] = b.labels[i];
childNums[i] = b.childNums[i];
}
leader = FALSE;
return *this;
}
bool operator==(const Address &b) const {
if (depth != b.depth)
return false;
for (unsigned i = 0; i < depth; i++)
if(labels[i] != b.labels[i])
return false;
return true;
}
bool isClose(const Address &b, int level) const {
if (depth != b.depth)
return false;
if ((unsigned)level >= depth)
return true;
for (unsigned i = 0; i < (depth - level); i++)
if(labels[i] != b.labels[i])
return false;
return true;
}
bool operator!=(const Address &b) const {
return !operator==(b);
}
};
class AddrUnsPair {
public:
Address first;
unsigned second;
AddrUnsPair(Address _first, unsigned _second)
: first(_first), second(_second) {
}
AddrUnsPair &operator=(const AddrUnsPair &b)
{
first = b.first;
second = b.second;
return *this;
}
};
# else
class Address {
public:
static const unsigned maxDepth = 32;
unsigned labels[maxDepth];
unsigned childNums[maxDepth];
unsigned depth;
unsigned leader;
Address(unsigned _depth);
Address &operator=(const Address &b);
bool operator==(const Address &b) const;
bool isClose(const Address &b, int level) const;
bool operator!=(const Address &b) const;
};
Address::Address(unsigned _depth)
{
depth = _depth;
leader = FALSE;
}
Address &Address::operator=(const Address &b) {
depth = b.depth;
for (unsigned i = 0; i < depth; i++) {
labels[i] = b.labels[i];
childNums[i] = b.childNums[i];
}
leader = FALSE;
return *this;
}
bool Address::operator==(const Address &b) const {
if (depth != b.depth)
return false;
for (unsigned i = 0; i < depth; i++)
if(labels[i] != b.labels[i])
return false;
return true;
}
bool Address::isClose(const Address &b, int level) const {
if (depth != b.depth)
return false;
if ((unsigned)level >= depth)
return true;
for (unsigned i = 0; i < (depth - level); i++)
if(labels[i] != b.labels[i])
return false;
return true;
}
bool Address::operator!=(const Address &b) const {
return !operator==(b);
}
class AddrUnsPair {
public:
Address first;
unsigned second;
AddrUnsPair(Address _first, unsigned _second);
AddrUnsPair &operator=(const AddrUnsPair &b);
};
AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
: first(_first), second(_second)
{
}
AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
first = b.first;
second = b.second;
return *this;
}
# endif /* !defined(KMP_DEBUG) && !defined(COVER) */
static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
const Address *aa = (const Address *)&(((AddrUnsPair *)a)
->first);
const Address *bb = (const Address *)&(((AddrUnsPair *)b)
->first);
unsigned depth = aa->depth;
unsigned i;
KMP_DEBUG_ASSERT(depth == bb->depth);
for (i = 0; i < depth; i++) {
if (aa->labels[i] < bb->labels[i]) return -1;
if (aa->labels[i] > bb->labels[i]) return 1;
}
return 0;
}
static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
const Address *aa = (const Address *)&(((AddrUnsPair *)a)
->first);
const Address *bb = (const Address *)&(((AddrUnsPair *)b)
->first);
unsigned depth = aa->depth;
unsigned i;
KMP_DEBUG_ASSERT(depth == bb->depth);
KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
int j = depth - i - 1;
if (aa->childNums[j] < bb->childNums[j]) return -1;
if (aa->childNums[j] > bb->childNums[j]) return 1;
}
for (; i < depth; i++) {
int j = i - __kmp_affinity_compact;
if (aa->childNums[j] < bb->childNums[j]) return -1;
if (aa->childNums[j] > bb->childNums[j]) return 1;
}
return 0;
}
/** A structure for holding machine-specific hierarchy info to be computed once at init.
This structure represents a mapping of threads to the actual machine hierarchy, or to
our best guess at what the hierarchy might be, for the purpose of performing an
efficient barrier. In the worst case, when there is no machine hierarchy information,
it produces a tree suitable for a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
/** Number of levels in the hierarchy. Typical levels are threads/core, cores/package
or socket, packages/node, nodes/machine, etc. We don't want to get specific with
nomenclature. When the machine is oversubscribed we add levels to duplicate the
hierarchy, doubling the thread capacity of the hierarchy each time we add a level. */
kmp_uint32 maxLevels;
/** This is specifically the depth of the machine configuration hierarchy, in terms of the
number of levels along the longest path from root to any leaf. It corresponds to the
number of entries in numPerLevel if we exclude all but one trailing 1. */
kmp_uint32 depth;
kmp_uint32 base_num_threads;
volatile kmp_int8 uninitialized; // 0=initialized, 1=uninitialized, 2=initialization in progress
volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
/** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
kmp_uint32 *numPerLevel;
kmp_uint32 *skipPerLevel;
void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
int hier_depth = adr2os[0].first.depth;
int level = 0;
for (int i=hier_depth-1; i>=0; --i) {
int max = -1;
for (int j=0; j<num_addrs; ++j) {
int next = adr2os[j].first.childNums[i];
if (next > max) max = next;
}
numPerLevel[level] = max+1;
++level;
}
}
hierarchy_info() : maxLevels(7), depth(1), uninitialized(1), resizing(0) {}
// TO FIX: This destructor causes a segfault in the library at shutdown.
//~hierarchy_info() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
void init(AddrUnsPair *adr2os, int num_addrs)
{
kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, 1, 2);
if (bool_result == 0) { // Wait for initialization
while (TCR_1(uninitialized) != 0) KMP_CPU_PAUSE();
return;
}
KMP_DEBUG_ASSERT(bool_result==1);
/* Added explicit initialization of the data fields here to prevent usage of dirty value
observed when static library is re-initialized multiple times (e.g. when
non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
depth = 1;
resizing = 0;
maxLevels = 7;
numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
skipPerLevel = &(numPerLevel[maxLevels]);
for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
numPerLevel[i] = 1;
skipPerLevel[i] = 1;
}
// Sort table by physical ID
if (adr2os) {
qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
deriveLevels(adr2os, num_addrs);
}
else {
numPerLevel[0] = 4;
numPerLevel[1] = num_addrs/4;
if (num_addrs%4) numPerLevel[1]++;
}
base_num_threads = num_addrs;
for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
depth++;
kmp_uint32 branch = 4;
if (numPerLevel[0] == 1) branch = num_addrs/4;
if (branch<4) branch=4;
for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
if (numPerLevel[d] & 1) numPerLevel[d]++;
numPerLevel[d] = numPerLevel[d] >> 1;
if (numPerLevel[d+1] == 1) depth++;
numPerLevel[d+1] = numPerLevel[d+1] << 1;
}
if(numPerLevel[0] == 1) {
branch = branch >> 1;
if (branch<4) branch = 4;
}
}
for (kmp_uint32 i=1; i<depth; ++i)
skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
// Fill in hierarchy in the case of oversubscription
for (kmp_uint32 i=depth; i<maxLevels; ++i)
skipPerLevel[i] = 2*skipPerLevel[i-1];
uninitialized = 0; // One writer
}
void resize(kmp_uint32 nproc)
{
kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
if (bool_result == 0) { // Someone else is resizing
while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
return;
}
KMP_DEBUG_ASSERT(bool_result!=0);
KMP_DEBUG_ASSERT(nproc > base_num_threads);
// Calculate new max_levels
kmp_uint32 old_sz = skipPerLevel[depth-1];
kmp_uint32 incs = 0, old_maxLevels= maxLevels;
while (nproc > old_sz) {
old_sz *=2;
incs++;
}
maxLevels += incs;
// Resize arrays
kmp_uint32 *old_numPerLevel = numPerLevel;
kmp_uint32 *old_skipPerLevel = skipPerLevel;
numPerLevel = skipPerLevel = NULL;
numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
skipPerLevel = &(numPerLevel[maxLevels]);
// Copy old elements from old arrays
for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
numPerLevel[i] = old_numPerLevel[i];
skipPerLevel[i] = old_skipPerLevel[i];
}
// Init new elements in arrays to 1
for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
numPerLevel[i] = 1;
skipPerLevel[i] = 1;
}
// Free old arrays
__kmp_free(old_numPerLevel);
// Fill in oversubscription levels of hierarchy
for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
skipPerLevel[i] = 2*skipPerLevel[i-1];
base_num_threads = nproc;
resizing = 0; // One writer
}
};
static hierarchy_info machine_hierarchy;
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
kmp_uint32 depth;
// The test below is true if affinity is available, but set to "none". Need to init on first use of hierarchical barrier.
if (TCR_1(machine_hierarchy.uninitialized))
machine_hierarchy.init(NULL, nproc);
// Adjust the hierarchy in case num threads exceeds original
if (nproc > machine_hierarchy.base_num_threads)
machine_hierarchy.resize(nproc);
depth = machine_hierarchy.depth;
KMP_DEBUG_ASSERT(depth > 0);
// The loop below adjusts the depth in the case of a resize
while (nproc > machine_hierarchy.skipPerLevel[depth-1])
depth++;
thr_bar->depth = depth;
thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}
//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
@@ -4683,73 +4324,4 @@ void __kmp_balanced_affinity( int tid, int nthreads )
}
}
#else
// affinity not supported
static const kmp_uint32 noaff_maxLevels=7;
kmp_uint32 noaff_skipPerLevel[noaff_maxLevels];
kmp_uint32 noaff_depth;
kmp_uint8 noaff_leaf_kids;
kmp_int8 noaff_uninitialized=1;
void noaff_init(int nprocs)
{
kmp_int8 result = KMP_COMPARE_AND_STORE_ACQ8(&noaff_uninitialized, 1, 2);
if (result == 0) return; // Already initialized
else if (result == 2) { // Someone else is initializing
while (TCR_1(noaff_uninitialized) != 0) KMP_CPU_PAUSE();
return;
}
KMP_DEBUG_ASSERT(result==1);
kmp_uint32 numPerLevel[noaff_maxLevels];
noaff_depth = 1;
for (kmp_uint32 i=0; i<noaff_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
numPerLevel[i] = 1;
noaff_skipPerLevel[i] = 1;
}
numPerLevel[0] = 4;
numPerLevel[1] = nprocs/4;
if (nprocs%4) numPerLevel[1]++;
for (int i=noaff_maxLevels-1; i>=0; --i) // count non-empty levels to get depth
if (numPerLevel[i] != 1 || noaff_depth > 1) // only count one top-level '1'
noaff_depth++;
kmp_uint32 branch = 4;
if (numPerLevel[0] == 1) branch = nprocs/4;
if (branch<4) branch=4;
for (kmp_uint32 d=0; d<noaff_depth-1; ++d) { // optimize hierarchy width
while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
if (numPerLevel[d] & 1) numPerLevel[d]++;
numPerLevel[d] = numPerLevel[d] >> 1;
if (numPerLevel[d+1] == 1) noaff_depth++;
numPerLevel[d+1] = numPerLevel[d+1] << 1;
}
if(numPerLevel[0] == 1) {
branch = branch >> 1;
if (branch<4) branch = 4;
}
}
for (kmp_uint32 i=1; i<noaff_depth; ++i)
noaff_skipPerLevel[i] = numPerLevel[i-1] * noaff_skipPerLevel[i-1];
// Fill in hierarchy in the case of oversubscription
for (kmp_uint32 i=noaff_depth; i<noaff_maxLevels; ++i)
noaff_skipPerLevel[i] = 2*noaff_skipPerLevel[i-1];
noaff_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
noaff_uninitialized = 0; // One writer
}
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
if (noaff_uninitialized)
noaff_init(nproc);
thr_bar->depth = noaff_depth;
thr_bar->base_leaf_kids = noaff_leaf_kids;
thr_bar->skip_per_level = noaff_skipPerLevel;
}
#endif // KMP_AFFINITY_SUPPORTED
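
A note on the depth fix-up loop in the new __kmp_get_hierarchy above: after resize() extends skipPerLevel, the stored depth may be too shallow for the new thread count, so the reader walks up until a single subtree spans nproc. A small sketch with hypothetical values (not taken from a real machine_hierarchy):

#include <cstdio>

// Sketch of the depth adjustment in the new __kmp_get_hierarchy. The stored
// depth can lag behind the levels actually needed to span nproc threads, so
// walk skipPerLevel until one subtree covers nproc. Hypothetical values only.
int main() {
    // Say the base machine gave skipPerLevel = {1, 4, 24, 48, 96, 192, 384}
    // with depth 3, and the team later grows to nproc = 100 threads.
    unsigned skipPerLevel[] = {1, 4, 24, 48, 96, 192, 384};
    unsigned depth = 3, nproc = 100;
    while (nproc > skipPerLevel[depth - 1]) // 24 < 100, 48 < 100, 96 < 100
        depth++;
    printf("adjusted depth = %u\n", depth);  // 6: skipPerLevel[5] == 192 >= 100
    return 0;
}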


@@ -0,0 +1,280 @@
/*
* kmp_affinity.h -- header for affinity management
*/
//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H
extern int __kmp_affinity_compact; /* Affinity 'compact' value */
class Address {
public:
static const unsigned maxDepth = 32;
unsigned labels[maxDepth];
unsigned childNums[maxDepth];
unsigned depth;
unsigned leader;
Address(unsigned _depth)
: depth(_depth), leader(FALSE) {
}
Address &operator=(const Address &b) {
depth = b.depth;
for (unsigned i = 0; i < depth; i++) {
labels[i] = b.labels[i];
childNums[i] = b.childNums[i];
}
leader = FALSE;
return *this;
}
bool operator==(const Address &b) const {
if (depth != b.depth)
return false;
for (unsigned i = 0; i < depth; i++)
if(labels[i] != b.labels[i])
return false;
return true;
}
bool isClose(const Address &b, int level) const {
if (depth != b.depth)
return false;
if ((unsigned)level >= depth)
return true;
for (unsigned i = 0; i < (depth - level); i++)
if(labels[i] != b.labels[i])
return false;
return true;
}
bool operator!=(const Address &b) const {
return !operator==(b);
}
};
class AddrUnsPair {
public:
Address first;
unsigned second;
AddrUnsPair(Address _first, unsigned _second)
: first(_first), second(_second) {
}
AddrUnsPair &operator=(const AddrUnsPair &b)
{
first = b.first;
second = b.second;
return *this;
}
};
static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
const Address *aa = (const Address *)&(((AddrUnsPair *)a)
->first);
const Address *bb = (const Address *)&(((AddrUnsPair *)b)
->first);
unsigned depth = aa->depth;
unsigned i;
KMP_DEBUG_ASSERT(depth == bb->depth);
for (i = 0; i < depth; i++) {
if (aa->labels[i] < bb->labels[i]) return -1;
if (aa->labels[i] > bb->labels[i]) return 1;
}
return 0;
}
static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
const Address *aa = (const Address *)&(((AddrUnsPair *)a)
->first);
const Address *bb = (const Address *)&(((AddrUnsPair *)b)
->first);
unsigned depth = aa->depth;
unsigned i;
KMP_DEBUG_ASSERT(depth == bb->depth);
KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
int j = depth - i - 1;
if (aa->childNums[j] < bb->childNums[j]) return -1;
if (aa->childNums[j] > bb->childNums[j]) return 1;
}
for (; i < depth; i++) {
int j = i - __kmp_affinity_compact;
if (aa->childNums[j] < bb->childNums[j]) return -1;
if (aa->childNums[j] > bb->childNums[j]) return 1;
}
return 0;
}
/** A structure for holding machine-specific hierarchy info to be computed once at init. */
class hierarchy_info {
public:
/** Good default values for number of leaves and branching factor, given no affinity information.
Behaves a bit like hyper barrier. */
static const kmp_uint32 maxLeaves=4;
static const kmp_uint32 minBranch=4;
/** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine,
etc. We don't want to get specific with nomenclature */
kmp_uint32 maxLevels;
/** This is specifically the depth of the machine configuration hierarchy, in terms of the
number of levels along the longest path from root to any leaf. It corresponds to the
number of entries in numPerLevel if we exclude all but one trailing 1. */
kmp_uint32 depth;
kmp_uint32 base_num_threads;
enum init_status { initialized=0, not_initialized=1, initializing=2 };
volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized, 2=initialization in progress
volatile kmp_int8 resizing; // 0=not resizing, 1=resizing
/** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a
node at level i has. For example, if we have a machine with 4 packages, 4 cores/package
and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. */
kmp_uint32 *numPerLevel;
kmp_uint32 *skipPerLevel;
void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
int hier_depth = adr2os[0].first.depth;
int level = 0;
for (int i=hier_depth-1; i>=0; --i) {
int max = -1;
for (int j=0; j<num_addrs; ++j) {
int next = adr2os[j].first.childNums[i];
if (next > max) max = next;
}
numPerLevel[level] = max+1;
++level;
}
}
hierarchy_info() : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}
void fini() { if (!uninitialized && numPerLevel) __kmp_free(numPerLevel); }
void init(AddrUnsPair *adr2os, int num_addrs)
{
kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&uninitialized, not_initialized, initializing);
if (bool_result == 0) { // Wait for initialization
while (TCR_1(uninitialized) != initialized) KMP_CPU_PAUSE();
return;
}
KMP_DEBUG_ASSERT(bool_result==1);
/* Added explicit initialization of the data fields here to prevent usage of dirty value
observed when static library is re-initialized multiple times (e.g. when
non-OpenMP thread repeatedly launches/joins thread that uses OpenMP). */
depth = 1;
resizing = 0;
maxLevels = 7;
numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
skipPerLevel = &(numPerLevel[maxLevels]);
for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
numPerLevel[i] = 1;
skipPerLevel[i] = 1;
}
// Sort table by physical ID
if (adr2os) {
qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels);
deriveLevels(adr2os, num_addrs);
}
else {
numPerLevel[0] = maxLeaves;
numPerLevel[1] = num_addrs/maxLeaves;
if (num_addrs%maxLeaves) numPerLevel[1]++;
}
base_num_threads = num_addrs;
for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth
if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
depth++;
kmp_uint32 branch = minBranch;
if (numPerLevel[0] == 1) branch = num_addrs/maxLeaves;
if (branch<minBranch) branch=minBranch;
for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width
while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>maxLeaves)) { // max 4 on level 0!
if (numPerLevel[d] & 1) numPerLevel[d]++;
numPerLevel[d] = numPerLevel[d] >> 1;
if (numPerLevel[d+1] == 1) depth++;
numPerLevel[d+1] = numPerLevel[d+1] << 1;
}
if(numPerLevel[0] == 1) {
branch = branch >> 1;
if (branch<4) branch = minBranch;
}
}
for (kmp_uint32 i=1; i<depth; ++i)
skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1];
// Fill in hierarchy in the case of oversubscription
for (kmp_uint32 i=depth; i<maxLevels; ++i)
skipPerLevel[i] = 2*skipPerLevel[i-1];
uninitialized = initialized; // One writer
}
void resize(kmp_uint32 nproc)
{
kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
if (bool_result == 0) { // Someone else is resizing
while (TCR_1(resizing) != 0) KMP_CPU_PAUSE();
return;
}
KMP_DEBUG_ASSERT(bool_result!=0);
KMP_DEBUG_ASSERT(nproc > base_num_threads);
// Calculate new max_levels
kmp_uint32 old_sz = skipPerLevel[depth-1];
kmp_uint32 incs = 0, old_maxLevels= maxLevels;
while (nproc > old_sz) {
old_sz *=2;
incs++;
}
maxLevels += incs;
// Resize arrays
kmp_uint32 *old_numPerLevel = numPerLevel;
kmp_uint32 *old_skipPerLevel = skipPerLevel;
numPerLevel = skipPerLevel = NULL;
numPerLevel = (kmp_uint32 *)__kmp_allocate(maxLevels*2*sizeof(kmp_uint32));
skipPerLevel = &(numPerLevel[maxLevels]);
// Copy old elements from old arrays
for (kmp_uint32 i=0; i<old_maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
numPerLevel[i] = old_numPerLevel[i];
skipPerLevel[i] = old_skipPerLevel[i];
}
// Init new elements in arrays to 1
for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
numPerLevel[i] = 1;
skipPerLevel[i] = 1;
}
// Free old arrays
__kmp_free(old_numPerLevel);
// Fill in oversubscription levels of hierarchy
for (kmp_uint32 i=old_maxLevels; i<maxLevels; ++i)
skipPerLevel[i] = 2*skipPerLevel[i-1];
base_num_threads = nproc;
resizing = 0; // One writer
}
};
#endif // KMP_AFFINITY_H
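
Both init() and resize() above rely on the same guard: one thread wins a compare-and-swap on a one-byte state flag, does the work, and publishes completion with a single store while the losers spin. A standalone analogue of that pattern using std::atomic in place of KMP_COMPARE_AND_STORE_ACQ8/TCR_1 (a sketch of the pattern, not the runtime's actual primitives):

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

// Standalone analogue of the guard in hierarchy_info::init. Illustrative only.
enum init_status : std::int8_t { initialized = 0, not_initialized = 1, initializing = 2 };
std::atomic<std::int8_t> state{not_initialized};

void init_once() {
    std::int8_t expected = not_initialized;
    // One thread transitions not_initialized -> initializing and does the work.
    if (!state.compare_exchange_strong(expected, initializing)) {
        while (state.load(std::memory_order_acquire) != initialized)
            std::this_thread::yield(); // losers wait for the winner to finish
        return;
    }
    // ... build the hierarchy here ...
    state.store(initialized, std::memory_order_release); // one writer
}

int main() {
    std::thread a(init_once), b(init_once);
    a.join(); b.join();
    printf("state = %d\n", (int)state.load()); // 0: initialized exactly once
    return 0;
}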


@@ -7286,6 +7286,7 @@ __kmp_cleanup( void )
#if KMP_AFFINITY_SUPPORTED
__kmp_affinity_uninitialize();
#endif /* KMP_AFFINITY_SUPPORTED */
__kmp_cleanup_hierarchy();
TCW_4(__kmp_init_middle, FALSE);
}