Bug 1874022 - pt 11. Improve cache locality r=glandium

 + Move sNow into the PHC class and put it next to mMutex; they're
   frequently modified together.

 + Reorganise PHC fields to place frequently-updated-together fields
   together, and separate them from the seldom-updated fields.

 + The only static fields remaining are write-once.

Differential Revision: https://phabricator.services.mozilla.com/D210468
commit 7c1ac112d0 (parent 064646e2bb)
Author: Paul Bone
Date:   2024-06-26 02:06:41 +00:00

3 changed files with 56 additions and 35 deletions
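The layout idea the commit message describes, reduced to a minimal standalone C++ sketch (the names and the 64-byte value below are illustrative assumptions, not PHC's actual fields): fields that are written together share one cache line, and seldom-written fields start a new line, so hot writes never invalidate the cold line in other cores' caches.

```cpp
#include <cstddef>
#include <cstdint>

constexpr size_t kCacheLineSize = 64;  // assumption; PHC picks 64 or 128

struct Layout {
  // Hot fields, modified together on most allocations: one cache line.
  alignas(kCacheLineSize) uint64_t mNow = 0;
  uint32_t mState = 0;

  // Seldom-updated configuration: force the start of a new cache line so
  // writes to the hot fields above never evict it from other cores.
  alignas(kCacheLineSize) uint32_t mAvgFirstAllocDelay = 64 * 1024;
};

// The cold field really does land at the start of its own cache line.
static_assert(offsetof(Layout, mAvgFirstAllocDelay) % kCacheLineSize == 0);
```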


@@ -572,7 +572,7 @@ class PHC {
   Maybe<StackTrace> mFreeStack;

   // The time at which the page is available for reuse, as measured against
-  // sNow. When the page is in use this value will be kMaxTime.
+  // mNow. When the page is in use this value will be kMaxTime.
   // - NeverAllocated: must be 0.
   // - InUse: must be kMaxTime.
   // - Freed: must be > 0 and < kMaxTime.
@@ -899,10 +899,16 @@ class PHC {
   static bool IsDisabledOnCurrentThread() { return tlsIsDisabled.get(); }

-  static Time Now() { return sNow; }
+  static Time Now() {
+    if (!sPHC) {
+      return 0;
+    }
+    return sPHC->mNow;
+  }

-  static void AdvanceNow(uint32_t delay = 0) {
-    sNow += tlsLastDelay.get() - delay;
+  void AdvanceNow(uint32_t delay = 0) {
+    mNow += tlsLastDelay.get() - delay;
     tlsLastDelay.set(delay);
   }
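The Now()/AdvanceNow() pair above amounts to a shared allocation clock that each thread advances in batches. A self-contained sketch of the same pattern, with std::atomic and thread_local standing in for mozilla::Atomic and PHC_THREAD_LOCAL (names are mine, not the patch's):

```cpp
#include <atomic>
#include <cstdint>

std::atomic<uint64_t> gNow{0};         // plays the role of PHC::mNow
thread_local uint32_t tLastDelay = 0;  // plays the role of tlsLastDelay

// On each triggering event, credit the clock with the part of the previous
// delay that has elapsed (last delay minus the remaining `delay`), then
// re-arm with the new delay.
void AdvanceNow(uint32_t delay = 0) {
  gNow += tLastDelay - delay;  // atomic fetch_add; no batch can be lost
  tLastDelay = delay;
}
```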
@@ -919,7 +925,10 @@ class PHC {
   // executed on a new thread's first allocation, the result is the same: all
   // the thread's TLS fields will be initialised.

-  AdvanceNow();
+  // This accesses sPHC but we want to ensure it's still a static member
+  // function so that sPHC isn't dereferenced until after the hot path above.
+  MOZ_ASSERT(sPHC);
+  sPHC->AdvanceNow();

   // Use an atomic fetch-and-subtract. This uses unsigned underflow semantics
   // to avoid doing a full compare-and-swap.
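A sketch of the fetch-and-subtract idiom that comment refers to, under my assumption (not stated in the patch) that the interesting event is the counter crossing zero: a single atomic subtraction replaces a compare-and-swap loop, and unsigned wrap-around is harmless because exactly one thread observes the crossing.

```cpp
#include <atomic>
#include <cstdint>

std::atomic<uint32_t> gDelay{1024};  // illustrative counter, not PHC's

bool DecrementAndCheck() {
  // fetch_sub returns the pre-decrement value in one atomic RMW. If several
  // threads race past zero the counter wraps (well-defined for unsigned
  // types); only the thread that observed exactly 1 "wins".
  return gDelay.fetch_sub(1, std::memory_order_relaxed) == 1;
}
```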
@@ -1056,11 +1065,23 @@ class PHC {
 #endif
   }

+  // To improve locality we try to order this file by how frequently different
+  // fields are modified and place all the modified-together fields early and
+  // ideally within a single cache line.
+ public:
   // The mutex that protects the other members.
-  Mutex mMutex MOZ_UNANNOTATED;
+  alignas(kCacheLineSize) Mutex mMutex MOZ_UNANNOTATED;

+ private:
+  // The current time. We use ReleaseAcquire semantics since we attempt to
+  // update this by larger increments and don't want to lose an entire update.
+  Atomic<Time, ReleaseAcquire> mNow;
+
+  // This will only ever be updated from one thread. The other threads should
+  // eventually get the update.
+  Atomic<PHCState, Relaxed> mPhcState =
+      Atomic<PHCState, Relaxed>(DEFAULT_STATE);
+
   // RNG for deciding which allocations to treat specially. It doesn't need to
   // be high quality.
   //
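My reading of the mNow comment above is that, because the clock advances by large increments, the update must be a single atomic read-modify-write rather than a separate load and store. A minimal sketch of the failure mode being guarded against (illustrative names, not PHC's code):

```cpp
#include <atomic>
#include <cstdint>

std::atomic<uint64_t> now{0};

// Racy: separate load and store. Two threads adding large batches can each
// read the same old value, and one entire batch of "time" vanishes.
void racyAdvance(uint64_t batch) {
  uint64_t t = now.load(std::memory_order_acquire);
  now.store(t + batch, std::memory_order_release);
}

// Safe: one atomic read-modify-write; both batches always land.
void safeAdvance(uint64_t batch) {
  now.fetch_add(batch, std::memory_order_acq_rel);
}
```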
@@ -1068,10 +1089,7 @@ class PHC {
   // PHC's constructor. Don't change it to UniquePtr or anything like that.
   non_crypto::XorShift128PlusRNG mRNG;

-  AllocPageInfo mAllocPages[kNumAllocPages];
 #if PHC_LOGGING
-  Time mFreeTime[kNumAllocPages];
-
   // How many allocations that could have been page allocs actually were? As
   // constrained kNumAllocPages. If the hit ratio isn't close to 100% it's
   // likely that the global constants are poorly chosen.
@@ -1079,15 +1097,13 @@ class PHC {
   size_t mPageAllocMisses = 0;
 #endif

-  // This will only ever be updated from one thread. The other threads should
-  // eventually get the update.
-  Atomic<PHCState, Relaxed> mPhcState =
-      Atomic<PHCState, Relaxed>(DEFAULT_STATE);
-
+  // The remaining fields are updated much less often; place them on the next
+  // cache line.
+
   // The average delay before doing any page allocations at the start of a
   // process. Note that roughly 1 million allocations occur in the main process
   // while starting the browser. The delay range is 1..gAvgFirstAllocDelay*2.
-  Delay mAvgFirstAllocDelay = 64 * 1024;
+  alignas(kCacheLineSize) Delay mAvgFirstAllocDelay = 64 * 1024;

   // The average delay until the next attempted page allocation, once we get
   // past the first delay. The delay range is 1..kAvgAllocDelay*2.
@@ -1151,10 +1167,6 @@ class PHC {
   //
   static PHC_THREAD_LOCAL(bool) tlsIsDisabled;

-  // The current time. We use ReleaseAcquire semantics since we attempt to
-  // update this by larger increments and don't want to lose an entire update.
-  static Atomic<Time, ReleaseAcquire> sNow;
-
   // Delay until the next attempt at a page allocation. The delay is made up of
   // two parts the global delay and each thread's local portion of that delay:
   //
@@ -1170,6 +1182,11 @@ class PHC {
   // The last value we set tlsAllocDelay to before starting to count down.
   static PHC_THREAD_LOCAL(Delay) tlsLastDelay;

+  AllocPageInfo mAllocPages[kNumAllocPages];
+#if PHC_LOGGING
+  Time mFreeTime[kNumAllocPages];
+#endif
+
 public:
   Delay GetAvgAllocDelay(const MutexAutoLock&) { return mAvgAllocDelay; }
   Delay GetAvgFirstAllocDelay(const MutexAutoLock&) {
@@ -1187,15 +1204,19 @@ class PHC {
   static PHC* sPHC;
 };

+// These globals are read together and hardly ever written. They should be on
+// the same cache line. They should be in a different cache line to data that
+// is manipulated often (mMutex and mNow are members of sPHC for that reason)
+// so that this cache line can be shared among cores. This makes a measurable
+// impact on calls to maybe_init().
+alignas(kCacheLineSize) PHCRegion* PHC::sRegion;
+PHC* PHC::sPHC;
 PHC_THREAD_LOCAL(bool) PHC::tlsIsDisabled;
-Atomic<Time, ReleaseAcquire> PHC::sNow;
 PHC_THREAD_LOCAL(Delay) PHC::tlsAllocDelay;
 Atomic<Delay, ReleaseAcquire> PHC::sAllocDelay;
 PHC_THREAD_LOCAL(Delay) PHC::tlsLastDelay;
-PHCRegion* PHC::sRegion;
-PHC* PHC::sPHC;

 // This must be defined after the PHC class.
 PHCRegion::PHCRegion()
     : mPagesStart(AllocAllPages()), mPagesLimit(mPagesStart + kAllPagesSize) {
@@ -1533,7 +1554,7 @@ MOZ_ALWAYS_INLINE static Maybe<void*> MaybePageRealloc(
   uintptr_t index = pk.AllocPageIndex();

   // A page-to-something transition.
-  PHC::AdvanceNow(PHC::LocalAllocDelay());
+  PHC::sPHC->AdvanceNow(PHC::LocalAllocDelay());

   // Note that `disable` has no effect unless it is emplaced below.
   Maybe<AutoDisableOnCurrentThread> disable;
@ -1634,7 +1655,7 @@ MOZ_ALWAYS_INLINE static bool MaybePageFree(const Maybe<arena_id_t>& aArenaId,
}
// At this point we know we have an allocation page.
PHC::AdvanceNow(PHC::LocalAllocDelay());
PHC::sPHC->AdvanceNow(PHC::LocalAllocDelay());
uintptr_t index = pk.AllocPageIndex();
// Note that `disable` has no effect unless it is emplaced below.


@@ -432,17 +432,6 @@ struct arena_chunk_t {
 // ***************************************************************************
 // Constants defining allocator size classes and behavior.

-// Maximum size of L1 cache line. This is used to avoid cache line aliasing,
-// so over-estimates are okay (up to a point), but under-estimates will
-// negatively affect performance.
-static const size_t kCacheLineSize =
-#if defined(XP_DARWIN) && defined(__aarch64__)
-    128
-#else
-    64
-#endif
-    ;
-
 // Our size classes are inclusive ranges of memory sizes. By describing the
 // minimums and how memory is allocated in each range the maximums can be
 // calculated.


@@ -144,6 +144,17 @@ constexpr uint8_t kAllocPoison = 0xe5;
 // Junk - write this junk value to freshly allocated cells.
 constexpr uint8_t kAllocJunk = 0xe4;

+// Maximum size of L1 cache line. This is used to avoid cache line aliasing,
+// so over-estimates are okay (up to a point), but under-estimates will
+// negatively affect performance.
+constexpr size_t kCacheLineSize =
+#  if defined(XP_DARWIN) && defined(__aarch64__)
+    128
+#  else
+    64
+#  endif
+    ;
+
 #endif  // MOZ_MEMORY

 // Dummy implementation of the moz_arena_* API, falling back to a given
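As an aside on kCacheLineSize (not part of the patch): C++17's <new> header exposes std::hardware_destructive_interference_size for the same purpose, and printing it is a cheap per-toolchain sanity check against the hand-picked constant.

```cpp
#include <cstdio>
#include <new>

int main() {
#ifdef __cpp_lib_hardware_interference_size
  // Reported by the compiler for this target; as the comment above notes,
  // over-estimates are safe, under-estimates hurt.
  std::printf("destructive interference size: %zu\n",
              std::hardware_destructive_interference_size);
#else
  std::puts("toolchain does not define hardware interference sizes");
#endif
  return 0;
}
```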