[OpenMP] Add use of TPAUSE

Add use of TPAUSE (from WAITPKG) to the runtime for Intel hardware, with
an environment variable (KMP_TPAUSE) to enable it and select a particular
C-state. If TPAUSE is enabled via the environment variable and the
hardware supports WAITPKG, the runtime always uses TPAUSE; otherwise it
falls back to the old behavior of checking __kmp_use_yield, etc.

Differential Revision: https://reviews.llvm.org/D115758
Terry Wilmarth 2021-12-14 15:04:55 -06:00
parent 1ad48d6de2
commit 2e02579a76
11 changed files with 207 additions and 96 deletions
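
For context: TPAUSE (part of WAITPKG) parks the core in a light sleep state,
C0.1 or C0.2, until the TSC reaches a caller-supplied deadline, instead of
busy-spinning on PAUSE. A minimal standalone sketch of the pattern, assuming a
WAITPKG-capable CPU and compilation with -mwaitpkg; the names here are
illustrative, not the runtime's code:

#include <immintrin.h> // _tpause; __rdtsc may need x86intrin.h on some toolchains
#include <stdint.h>

enum { TPAUSE_C02 = 0 /* deeper sleep */, TPAUSE_C01 = 1 /* faster wakeup */ };

// Spin until *flag becomes nonzero, sleeping in TPAUSE with an
// exponentially growing (capped) TSC deadline.
static void wait_for_flag(volatile int *flag, int oversubscribed) {
  uint64_t backoff = 1; // in TSC cycles; doubled each iteration
  while (!*flag) {
    // TPAUSE takes an absolute TSC deadline, so add the backoff to "now".
    _tpause(oversubscribed ? TPAUSE_C02 : TPAUSE_C01, __rdtsc() + backoff);
    if (backoff < (1u << 16))
      backoff *= 2;
  }
}

When TPAUSE is unavailable, the fallback is the usual PAUSE/yield loop; that
split is exactly what the reworked KMP_YIELD_OVERSUB_ELSE_SPIN macro below
implements.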


@@ -1315,86 +1315,6 @@ static inline void __kmp_store_mxcsr(kmp_uint32 *p) { *p = _mm_getcsr(); }
#define KMP_X86_MXCSR_MASK 0xffffffc0 /* ignore status flags (6 lsb) */
#if KMP_ARCH_X86
extern void __kmp_x86_pause(void);
#elif KMP_MIC
// Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
// regression after removal of extra PAUSE from spin loops. Changing
// the delay from 100 to 300 showed even better performance than double PAUSE
// on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
#else
static inline void __kmp_x86_pause(void) { _mm_pause(); }
#endif
#define KMP_CPU_PAUSE() __kmp_x86_pause()
#elif KMP_ARCH_PPC64
#define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1")
#define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2")
#define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory")
#define KMP_CPU_PAUSE() \
do { \
KMP_PPC64_PRI_LOW(); \
KMP_PPC64_PRI_MED(); \
KMP_PPC64_PRI_LOC_MB(); \
} while (0)
#else
#define KMP_CPU_PAUSE() /* nothing to do */
#endif
#define KMP_INIT_YIELD(count) \
{ (count) = __kmp_yield_init; }
#define KMP_OVERSUBSCRIBED \
(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
#define KMP_TRY_YIELD \
((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
#define KMP_TRY_YIELD_OVERSUB \
((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
#define KMP_YIELD(cond) \
{ \
KMP_CPU_PAUSE(); \
if ((cond) && (KMP_TRY_YIELD)) \
__kmp_yield(); \
}
#define KMP_YIELD_OVERSUB() \
{ \
KMP_CPU_PAUSE(); \
if ((KMP_TRY_YIELD_OVERSUB)) \
__kmp_yield(); \
}
// Note the decrement of 2 in the following macros. With KMP_LIBRARY=turnaround,
// there should be no yielding since the initial value from KMP_INIT_YIELD() is odd.
#define KMP_YIELD_SPIN(count) \
{ \
KMP_CPU_PAUSE(); \
if (KMP_TRY_YIELD) { \
(count) -= 2; \
if (!(count)) { \
__kmp_yield(); \
(count) = __kmp_yield_next; \
} \
} \
}
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count) \
{ \
KMP_CPU_PAUSE(); \
if ((KMP_TRY_YIELD_OVERSUB)) \
__kmp_yield(); \
else if (__kmp_use_yield == 1) { \
(count) -= 2; \
if (!(count)) { \
__kmp_yield(); \
(count) = __kmp_yield_next; \
} \
} \
}
// User-level Monitor/Mwait
#if KMP_HAVE_UMWAIT
// We always try for UMWAIT first
@@ -1405,6 +1325,7 @@ static inline void __kmp_x86_pause(void) { _mm_pause(); }
#include <intrin.h>
#endif
#endif // KMP_HAVE_WAITPKG_INTRINSICS
KMP_ATTRIBUTE_TARGET_WAITPKG
static inline int __kmp_tpause(uint32_t hint, uint64_t counter) {
#if !KMP_HAVE_WAITPKG_INTRINSICS
@@ -1470,6 +1391,119 @@ __kmp_mm_mwait(unsigned extensions, unsigned hints) {
}
#endif // KMP_HAVE_UMWAIT
#if KMP_ARCH_X86
extern void __kmp_x86_pause(void);
#elif KMP_MIC
// Performance testing on KNC (C0QS-7120 P/A/X/D, 61-core, 16 GB Memory) showed
// regression after removal of extra PAUSE from spin loops. Changing
// the delay from 100 to 300 showed even better performance than double PAUSE
// on Spec OMP2001 and LCPC tasking tests, no regressions on EPCC.
static inline void __kmp_x86_pause(void) { _mm_delay_32(300); }
#else
static inline void __kmp_x86_pause(void) { _mm_pause(); }
#endif
#define KMP_CPU_PAUSE() __kmp_x86_pause()
#elif KMP_ARCH_PPC64
#define KMP_PPC64_PRI_LOW() __asm__ volatile("or 1, 1, 1")
#define KMP_PPC64_PRI_MED() __asm__ volatile("or 2, 2, 2")
#define KMP_PPC64_PRI_LOC_MB() __asm__ volatile("" : : : "memory")
#define KMP_CPU_PAUSE() \
do { \
KMP_PPC64_PRI_LOW(); \
KMP_PPC64_PRI_MED(); \
KMP_PPC64_PRI_LOC_MB(); \
} while (0)
#else
#define KMP_CPU_PAUSE() /* nothing to do */
#endif
#define KMP_INIT_YIELD(count) \
{ (count) = __kmp_yield_init; }
#define KMP_INIT_BACKOFF(time) \
{ (time) = __kmp_pause_init; }
#define KMP_OVERSUBSCRIBED \
(TCR_4(__kmp_nth) > (__kmp_avail_proc ? __kmp_avail_proc : __kmp_xproc))
#define KMP_TRY_YIELD \
((__kmp_use_yield == 1) || (__kmp_use_yield == 2 && (KMP_OVERSUBSCRIBED)))
#define KMP_TRY_YIELD_OVERSUB \
((__kmp_use_yield == 1 || __kmp_use_yield == 2) && (KMP_OVERSUBSCRIBED))
#define KMP_YIELD(cond) \
{ \
KMP_CPU_PAUSE(); \
if ((cond) && (KMP_TRY_YIELD)) \
__kmp_yield(); \
}
#define KMP_YIELD_OVERSUB() \
{ \
KMP_CPU_PAUSE(); \
if ((KMP_TRY_YIELD_OVERSUB)) \
__kmp_yield(); \
}
// Note the decrement of 2 in the following macros. With KMP_LIBRARY=turnaround,
// there should be no yielding since the initial value from KMP_INIT_YIELD() is odd.
#define KMP_YIELD_SPIN(count) \
{ \
KMP_CPU_PAUSE(); \
if (KMP_TRY_YIELD) { \
(count) -= 2; \
if (!(count)) { \
__kmp_yield(); \
(count) = __kmp_yield_next; \
} \
} \
}
// If TPAUSE is available & enabled, use it. If oversubscribed, use the slower
// (C0.2) state, which improves performance of other SMT threads on the same
// core; otherwise, use the fast (C0.1) default state, or whatever the user has
// requested. Uses a timed TPAUSE with exponential backoff. If TPAUSE isn't
// available, fall back to the regular CPU pause and yield combination.
#if KMP_HAVE_UMWAIT
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
{ \
if (__kmp_tpause_enabled) { \
if (KMP_OVERSUBSCRIBED) { \
__kmp_tpause(0, (time)); \
} else { \
__kmp_tpause(__kmp_tpause_hint, (time)); \
} \
(time) *= 2; \
} else { \
KMP_CPU_PAUSE(); \
if ((KMP_TRY_YIELD_OVERSUB)) { \
__kmp_yield(); \
} else if (__kmp_use_yield == 1) { \
(count) -= 2; \
if (!(count)) { \
__kmp_yield(); \
(count) = __kmp_yield_next; \
} \
} \
} \
}
#else
#define KMP_YIELD_OVERSUB_ELSE_SPIN(count, time) \
{ \
KMP_CPU_PAUSE(); \
if ((KMP_TRY_YIELD_OVERSUB)) \
__kmp_yield(); \
else if (__kmp_use_yield == 1) { \
(count) -= 2; \
if (!(count)) { \
__kmp_yield(); \
(count) = __kmp_yield_next; \
} \
} \
}
#endif // KMP_HAVE_UMWAIT
/* ------------------------------------------------------------------------ */
/* Support datatypes for the orphaned construct nesting checks. */
/* ------------------------------------------------------------------------ */
@@ -3088,6 +3122,7 @@ extern kmp_int32 __kmp_use_yield;
extern kmp_int32 __kmp_use_yield_exp_set;
extern kmp_uint32 __kmp_yield_init;
extern kmp_uint32 __kmp_yield_next;
extern kmp_uint64 __kmp_pause_init;
/* ------------------------------------------------------------------------- */
extern int __kmp_allThreadsSpecified;
@@ -3290,6 +3325,13 @@ extern int __kmp_mwait_enabled; // Runtime check if ring3 mwait is enabled
extern int __kmp_mwait_hints; // Hints to pass in to mwait
#endif
#if KMP_HAVE_UMWAIT
extern int __kmp_waitpkg_enabled; // Runtime check if waitpkg exists
extern int __kmp_tpause_state; // 0 (default), 1=C0.1, 2=C0.2; from KMP_TPAUSE
extern int __kmp_tpause_hint; // 1=C0.1 (default), 0=C0.2; from KMP_TPAUSE
extern int __kmp_tpause_enabled; // 0 (default), 1 (KMP_TPAUSE is non-zero)
#endif
/* ------------------------------------------------------------------------- */
extern kmp_global_t __kmp_global; /* global status */


@@ -2655,9 +2655,11 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
kmp_uint32 spins;
kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
kmp_uint32 r;
kmp_uint64 time;
KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
// main wait spin loop
while (!f(r = TCR_4(*spin), check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
@@ -2665,7 +2667,7 @@ __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
split. It causes problems with infinite recursion because of exit lock */
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
return r;
@@ -2680,15 +2682,17 @@ void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
kmp_uint32 check = checker;
kmp_uint32 spins;
kmp_uint32 (*f)(void *, kmp_uint32) = pred;
kmp_uint64 time;
KMP_FSYNC_SPIN_INIT(obj, spin);
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
// main wait spin loop
while (!f(spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
/* if we have waited a bit, or are oversubscribed, yield */
/* pause is in the following code */
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
}


@@ -292,10 +292,12 @@ static UT __kmp_wait(volatile UT *spinner, UT checker,
UT check = checker;
kmp_uint32 spins;
kmp_uint32 (*f)(UT, UT) = pred;
kmp_uint64 time;
UT r;
KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
// main wait spin loop
while (!f(r = *spin, check)) {
KMP_FSYNC_SPIN_PREPARE(obj);
@@ -305,7 +307,7 @@ static UT __kmp_wait(volatile UT *spinner, UT checker,
/* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
__kmp_abort_thread(); */
// If oversubscribed, or have waited a bit, then yield.
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
KMP_FSYNC_SPIN_ACQUIRED(obj);
return r;


@@ -219,6 +219,13 @@ int __kmp_mwait_enabled = FALSE;
int __kmp_mwait_hints = 0;
#endif
#if KMP_HAVE_UMWAIT
int __kmp_waitpkg_enabled = 0;
int __kmp_tpause_state = 0;
int __kmp_tpause_hint = 1;
int __kmp_tpause_enabled = 0;
#endif
/* map OMP 3.0 schedule types with our internal schedule types */
enum sched_type __kmp_sch_map[kmp_sched_upper - kmp_sched_lower_ext +
kmp_sched_upper_std - kmp_sched_lower - 2] = {
@@ -425,6 +432,7 @@ kmp_int32 __kmp_use_yield_exp_set = 0;
kmp_uint32 __kmp_yield_init = KMP_INIT_WAIT;
kmp_uint32 __kmp_yield_next = KMP_NEXT_WAIT;
kmp_uint64 __kmp_pause_init = 1; // for tpause
/* ------------------------------------------------------ */
/* STATE mostly synchronized with global lock */


@@ -96,12 +96,19 @@ __kmp_acquire_tas_lock_timed_template(kmp_tas_lock_t *lck, kmp_int32 gtid) {
}
kmp_uint32 spins;
kmp_uint64 time;
KMP_FSYNC_PREPARE(lck);
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
kmp_backoff_t backoff = __kmp_spin_backoff_params;
do {
#if !KMP_HAVE_UMWAIT
__kmp_spin_backoff(&backoff);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
#else
if (!__kmp_tpause_enabled)
__kmp_spin_backoff(&backoff);
#endif
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
} while (KMP_ATOMIC_LD_RLX(&lck->lk.poll) != tas_free ||
!__kmp_atomic_compare_store_acq(&lck->lk.poll, tas_free, tas_busy));
KMP_FSYNC_ACQUIRED(lck);
@@ -2227,10 +2234,12 @@ __kmp_acquire_drdpa_lock_timed_template(kmp_drdpa_lock_t *lck, kmp_int32 gtid) {
// The current implementation of KMP_WAIT doesn't allow for mask
// and poll to be re-read every spin iteration.
kmp_uint32 spins;
kmp_uint64 time;
KMP_FSYNC_PREPARE(lck);
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
while (polls[ticket & mask] < ticket) { // atomic load
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
// Re-read the mask and the poll pointer from the lock structure.
//
// Make certain that "mask" is read before "polls" !!!
@@ -2659,9 +2668,17 @@ void __kmp_spin_backoff(kmp_backoff_t *boff) {
kmp_uint32 i;
for (i = boff->step; i > 0; i--) {
kmp_uint64 goal = __kmp_tsc() + boff->min_tick;
do {
KMP_CPU_PAUSE();
} while (before(__kmp_tsc(), goal));
#if KMP_HAVE_UMWAIT
if (__kmp_umwait_enabled) {
__kmp_tpause(0, boff->min_tick);
} else {
#endif
do {
KMP_CPU_PAUSE();
} while (before(__kmp_tsc(), goal));
#if KMP_HAVE_UMWAIT
}
#endif
}
boff->step = (boff->step << 1 | 1) & (boff->max_backoff - 1);
}
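
The step update just above produces the sequence 1, 3, 7, 15, ... (the next
call then issues that many timed waits), and when max_backoff is a power of
two the mask saturates the step at max_backoff - 1. A quick standalone check
(illustrative, not runtime code):

#include <stdio.h>

int main(void) {
  unsigned step = 1, max_backoff = 16;
  for (int i = 0; i < 6; ++i) {
    step = (step << 1 | 1) & (max_backoff - 1);
    printf("%u ", step); // prints: 3 7 15 15 15 15
  }
  return 0;
}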


@@ -651,12 +651,15 @@ extern int (*__kmp_acquire_user_lock_with_checks_)(kmp_user_lock_p lck,
if (lck->tas.lk.poll != 0 || \
!__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
kmp_uint32 spins; \
kmp_uint64 time; \
KMP_FSYNC_PREPARE(lck); \
KMP_INIT_YIELD(spins); \
KMP_INIT_BACKOFF(time); \
do { \
KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
} while (lck->tas.lk.poll != 0 || !__kmp_atomic_compare_store_acq( \
&lck->tas.lk.poll, 0, gtid + 1)); \
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
} while ( \
lck->tas.lk.poll != 0 || \
!__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \
} \
KMP_FSYNC_ACQUIRED(lck); \
} else { \
@@ -758,10 +761,12 @@ extern int (*__kmp_acquire_nested_user_lock_with_checks_)(kmp_user_lock_p lck,
if ((lck->tas.lk.poll != 0) || \
!__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)) { \
kmp_uint32 spins; \
kmp_uint64 time; \
KMP_FSYNC_PREPARE(lck); \
KMP_INIT_YIELD(spins); \
KMP_INIT_BACKOFF(time); \
do { \
KMP_YIELD_OVERSUB_ELSE_SPIN(spins); \
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time); \
} while ( \
(lck->tas.lk.poll != 0) || \
!__kmp_atomic_compare_store_acq(&lck->tas.lk.poll, 0, gtid + 1)); \


@@ -6895,7 +6895,9 @@ static void __kmp_check_mic_type() {
static void __kmp_user_level_mwait_init() {
struct kmp_cpuid buf;
__kmp_x86_cpuid(7, 0, &buf);
__kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
__kmp_waitpkg_enabled = ((buf.ecx >> 5) & 1);
__kmp_umwait_enabled = __kmp_waitpkg_enabled && __kmp_user_level_mwait;
__kmp_tpause_enabled = __kmp_waitpkg_enabled && (__kmp_tpause_state > 0);
KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
__kmp_umwait_enabled));
}
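
The feature bit probed here is CPUID.(EAX=7,ECX=0):ECX[5], the WAITPKG flag.
Outside the runtime, the same check can be written with the GCC/Clang cpuid.h
helper (a sketch, assuming an x86 target and a compiler that provides
__get_cpuid_count; the helper name is illustrative):

#include <cpuid.h>

static int has_waitpkg(void) {
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
    return 0; // CPUID leaf 7 not supported
  return (ecx >> 5) & 1; // WAITPKG feature flag
}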


@@ -5171,6 +5171,27 @@ static void __kmp_stg_print_mwait_hints(kmp_str_buf_t *buffer, char const *name,
#endif // KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
#if KMP_HAVE_UMWAIT
// -----------------------------------------------------------------------------
// KMP_TPAUSE
// 0 = don't use TPAUSE, 1 = use C0.1 state, 2 = use C0.2 state
static void __kmp_stg_parse_tpause(char const *name, char const *value,
void *data) {
__kmp_stg_parse_int(name, value, 0, INT_MAX, &__kmp_tpause_state);
if (__kmp_tpause_state != 0) {
// The actual hint passed to tpause is: 0 for C0.2 and 1 for C0.1
if (__kmp_tpause_state == 2) // use C0.2
__kmp_tpause_hint = 0; // default was set to 1 for C0.1
}
} // __kmp_stg_parse_tpause
static void __kmp_stg_print_tpause(kmp_str_buf_t *buffer, char const *name,
void *data) {
__kmp_stg_print_int(buffer, name, __kmp_tpause_state);
} // __kmp_stg_print_tpause
#endif // KMP_HAVE_UMWAIT
// -----------------------------------------------------------------------------
// OMP_DISPLAY_ENV
@@ -5536,6 +5557,10 @@ static kmp_setting_t __kmp_stg_table[] = {
{"KMP_MWAIT_HINTS", __kmp_stg_parse_mwait_hints,
__kmp_stg_print_mwait_hints, NULL, 0, 0},
#endif
#if KMP_HAVE_UMWAIT
{"KMP_TPAUSE", __kmp_stg_parse_tpause, __kmp_stg_print_tpause, NULL, 0, 0},
#endif
{"", NULL, NULL, NULL, 0, 0}}; // settings
static int const __kmp_stg_count =
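
As a usage note, KMP_TPAUSE=1 requests the C0.1 state and KMP_TPAUSE=2
requests C0.2. The parser's state-to-hint mapping above is equivalent to this
small sketch (an illustrative helper, not runtime code):

#include <stdlib.h>

// Map KMP_TPAUSE to the hint later passed to tpause: 1 -> C0.1 (hint 1,
// the default), 2 -> C0.2 (hint 0); returns -1 when TPAUSE is disabled.
static int tpause_hint_from_env(void) {
  const char *v = getenv("KMP_TPAUSE");
  int state = v ? atoi(v) : 0;
  if (state == 0)
    return -1;
  return (state == 2) ? 0 : 1;
}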


@@ -3552,9 +3552,11 @@ void __kmp_reap_task_teams(void) {
void __kmp_wait_to_unref_task_teams(void) {
kmp_info_t *thread;
kmp_uint32 spins;
kmp_uint64 time;
int done;
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
for (;;) {
done = TRUE;
@@ -3604,7 +3606,7 @@ {
}
// If oversubscribed or have waited a bit, yield.
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
}
}


@@ -377,6 +377,7 @@ __kmp_wait_template(kmp_info_t *this_thr,
#else
kmp_uint32 hibernate;
#endif
kmp_uint64 time;
KMP_FSYNC_SPIN_INIT(spin, NULL);
if (flag->done_check()) {
@@ -476,6 +477,7 @@ final_spin=FALSE)
#endif
KMP_INIT_YIELD(spins); // Setup for waiting
KMP_INIT_BACKOFF(time);
if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
__kmp_pause_status == kmp_soft_paused) {
@@ -563,7 +565,7 @@ final_spin=FALSE)
// If we are oversubscribed, or have waited a bit (and
// KMP_LIBRARY=throughput), then yield
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
#if KMP_STATS_ENABLED
// Check if thread has been signalled to idle state


@@ -1327,16 +1327,18 @@ static void __kmp_reap_common(kmp_info_t *th) {
// KMP_WAIT to cover this usage also.
void *obj = NULL;
kmp_uint32 spins;
kmp_uint64 time;
#if USE_ITT_BUILD
KMP_FSYNC_SPIN_INIT(obj, (void *)&th->th.th_info.ds.ds_alive);
#endif /* USE_ITT_BUILD */
KMP_INIT_YIELD(spins);
KMP_INIT_BACKOFF(time);
do {
#if USE_ITT_BUILD
KMP_FSYNC_SPIN_PREPARE(obj);
#endif /* USE_ITT_BUILD */
__kmp_is_thread_alive(th, &exit_val);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
KMP_YIELD_OVERSUB_ELSE_SPIN(spins, time);
} while (exit_val == STILL_ACTIVE && TCR_4(th->th.th_info.ds.ds_alive));
#if USE_ITT_BUILD
if (exit_val == STILL_ACTIVE) {