mutex: fix trylock spin wait contention

If there are 3 or more threads spin-waiting on the same mutex,
there will be excessive exclusive cacheline contention because
pthread_trylock() immediately tries to CAS in a new value, instead
of first checking if the lock is locked.

This diff adds a 'locked' hint flag, and we will only spin wait
without trylock()ing while set.  I don't know of any other portable
way to get the same behavior as pthread_mutex_lock().

This is pretty easy to test via ttest, e.g.

./ttest1 500 3 10000 1 100

Throughput is nearly 3x as fast.

This blames to the mutex profiling changes, however, we almost never
have 3 or more threads contending in properly configured production
workloads, but still worth fixing.
This commit is contained in:
Dave Watson 2018-11-26 08:11:00 -08:00
parent c4063ce439
commit b23336af96
2 changed files with 17 additions and 7 deletions

View File

@ -43,6 +43,11 @@ struct malloc_mutex_s {
#else
pthread_mutex_t lock;
#endif
/*
* Hint flag to avoid exclusive cache line contention
* during spin waiting
*/
atomic_b_t locked;
};
/*
* We only touch witness when configured w/ debug. However we
@ -97,21 +102,21 @@ struct malloc_mutex_s {
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT}}, \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
# if (defined(JEMALLOC_DEBUG))
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL}}, \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
@ -119,11 +124,11 @@ struct malloc_mutex_s {
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER}}, \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
#endif
@ -148,6 +153,7 @@ void malloc_mutex_lock_slow(malloc_mutex_t *mutex);
static inline void
malloc_mutex_lock_final(malloc_mutex_t *mutex) {
MALLOC_MUTEX_LOCK(mutex);
atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
}
static inline bool
@ -173,6 +179,7 @@ malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
if (malloc_mutex_trylock_final(mutex)) {
atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
return true;
}
mutex_owner_stats_update(tsdn, mutex);
@ -212,6 +219,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
if (isthreaded) {
if (malloc_mutex_trylock_final(mutex)) {
malloc_mutex_lock_slow(mutex);
atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
}
mutex_owner_stats_update(tsdn, mutex);
}
@ -220,6 +228,7 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
static inline void
malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED);
witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
MALLOC_MUTEX_UNLOCK(mutex);

View File

@ -55,7 +55,8 @@ malloc_mutex_lock_slow(malloc_mutex_t *mutex) {
int cnt = 0, max_cnt = MALLOC_MUTEX_MAX_SPIN;
do {
spin_cpu_spinwait();
if (!malloc_mutex_trylock_final(mutex)) {
if (!atomic_load_b(&mutex->locked, ATOMIC_RELAXED)
&& !malloc_mutex_trylock_final(mutex)) {
data->n_spin_acquired++;
return;
}