From 542d0e564f4d2367eca1a507940cc3c6cb8d9795 Mon Sep 17 00:00:00 2001
From: Jens Stutte
Date: Wed, 13 Nov 2024 07:48:38 +0000
Subject: [PATCH] Bug 1924444 - Use a linked list with LIFO policy for a bin's
 mNonFullRuns to reduce overhead and foster locality. r=smaug,pbone

The current order of re-using regions from non-full runs relies on the
run's position in memory via a RedBlackTree. This leads to O(log(n))
complexity for the tree book-keeping and walking, while providing a
somewhat arbitrary order when doing GetNonFullBinRun.

We can reduce the complexity here by using a DoublyLinkedList for
mNonFullRuns. The resulting LIFO order seems to have beneficial effects
for common Gecko use cases, too.

Differential Revision: https://phabricator.services.mozilla.com/D225533
---
 memory/build/mozjemalloc.cpp | 168 +++++++++++++++--------------------
 1 file changed, 70 insertions(+), 98 deletions(-)

diff --git a/memory/build/mozjemalloc.cpp b/memory/build/mozjemalloc.cpp
index e297a7ff9ce7..3b7f12aa2f43 100644
--- a/memory/build/mozjemalloc.cpp
+++ b/memory/build/mozjemalloc.cpp
@@ -305,11 +305,7 @@ struct arena_t;
 // Each element of the chunk map corresponds to one page within the chunk.
 struct arena_chunk_map_t {
-  // Linkage for run trees. There are two disjoint uses:
-  //
-  // 1) arena_t's tree or available runs.
-  // 2) arena_run_t conceptually uses this linkage for in-use non-full
-  //    runs, rather than directly embedding linkage.
+  // Linkage for run trees. Used for arena_t's tree of available runs.
   RedBlackTreeNode<arena_chunk_map_t> link;
 
   // Run address (or size) and various flags are stored together. The bit
@@ -987,15 +983,6 @@ struct ArenaChunkMapLink {
   }
 };
 
-struct ArenaRunTreeTrait : public ArenaChunkMapLink {
-  static inline Order Compare(arena_chunk_map_t* aNode,
-                              arena_chunk_map_t* aOther) {
-    MOZ_ASSERT(aNode);
-    MOZ_ASSERT(aOther);
-    return CompareAddr(aNode, aOther);
-  }
-};
-
 struct ArenaAvailTreeTrait : public ArenaChunkMapLink {
   static inline Order Compare(arena_chunk_map_t* aNode,
                               arena_chunk_map_t* aOther) {
@@ -1051,6 +1038,9 @@ struct arena_run_t {
   unsigned mNumFree;
 #endif
 
+  // Used by arena_bin_t::mNonFullRuns.
+  DoublyLinkedListElement<arena_run_t> mRunListElem;
+
   // Bin this run is associated with.
   arena_bin_t* mBin;
 
@@ -1066,17 +1056,28 @@ struct arena_run_t {
   unsigned mRegionsMask[];  // Dynamically sized.
 };
 
-struct arena_bin_t {
-  // Current run being used to service allocations of this bin's size
-  // class.
-  arena_run_t* mCurrentRun;
+namespace mozilla {
 
-  // Tree of non-full runs. This tree is used when looking for an
-  // existing run when mCurrentRun is no longer usable. We choose the
-  // non-full run that is lowest in memory; this policy tends to keep
-  // objects packed well, and it can also help reduce the number of
-  // almost-empty chunks.
-  RedBlackTree<arena_chunk_map_t, ArenaRunTreeTrait> mNonFullRuns;
+template <>
+struct GetDoublyLinkedListElement<arena_run_t> {
+  static DoublyLinkedListElement<arena_run_t>& Get(arena_run_t* aThis) {
+    return aThis->mRunListElem;
+  }
+};
+
+}  // namespace mozilla
+
+struct arena_bin_t {
+  // We use a LIFO ("last-in-first-out") policy to refill non-full runs.
+  //
+  // There are two reasons for this:
+  // 1. It is cheap: all of our non-full-runs book-keeping is O(1); no
+  //    tree balancing or walking is needed.
+  // 2. It also increases the probability of CPU cache hits for both the
+  //    book-keeping and the reused slots themselves, as the same memory was
+  //    most recently touched during free, especially when used from the same
+  //    core (or via the same shared cache, depending on the architecture).
+  DoublyLinkedList<arena_run_t> mNonFullRuns;
 
   // Bin's size class.
   size_t mSizeClass;
 
@@ -1293,7 +1294,9 @@ struct arena_t {
   void TrimRunTail(arena_chunk_t* aChunk, arena_run_t* aRun, size_t aOldSize,
                    size_t aNewSize, bool dirty) MOZ_REQUIRES(mLock);
 
-  arena_run_t* GetNonFullBinRun(arena_bin_t* aBin) MOZ_REQUIRES(mLock);
+  arena_run_t* GetNewEmptyBinRun(arena_bin_t* aBin) MOZ_REQUIRES(mLock);
+
+  inline arena_run_t* GetNonFullBinRun(arena_bin_t* aBin) MOZ_REQUIRES(mLock);
 
   inline uint8_t FindFreeBitInMask(uint32_t aMask, uint32_t& aRng)
       MOZ_REQUIRES(mLock);
 
@@ -3653,32 +3656,16 @@ void arena_t::TrimRunTail(arena_chunk_t* aChunk, arena_run_t* aRun,
   MOZ_ASSERT(!no_chunk);
 }
 
-arena_run_t* arena_t::GetNonFullBinRun(arena_bin_t* aBin) {
-  arena_chunk_map_t* mapelm;
+arena_run_t* arena_t::GetNewEmptyBinRun(arena_bin_t* aBin) {
   arena_run_t* run;
   unsigned i, remainder;
 
-  // Look for a usable run.
-  mapelm = aBin->mNonFullRuns.First();
-  if (mapelm) {
-    // run is guaranteed to have available space.
-    aBin->mNonFullRuns.Remove(mapelm);
-    run = (arena_run_t*)(mapelm->bits & ~gPageSizeMask);
-    return run;
-  }
-  // No existing runs have any space available.
-  // Allocate a new run.
   run = AllocRun(static_cast<size_t>(aBin->mRunSizePages) << gPageSize2Pow,
                  false, false);
   if (!run) {
     return nullptr;
   }
-  // Don't initialize if a race in arena_t::RunAlloc() allowed an existing
-  // run to become usable.
-  if (run == aBin->mCurrentRun) {
-    return run;
-  }
 
   // Initialize run internals.
   run->mBin = aBin;
@@ -3702,10 +3689,27 @@ arena_run_t* arena_t::GetNonFullBinRun(arena_bin_t* aBin) {
   run->mMagic = ARENA_RUN_MAGIC;
 #endif
 
+  // Make sure we continue to use this run for subsequent allocations.
+  new (&run->mRunListElem) DoublyLinkedListElement<arena_run_t>();
+  aBin->mNonFullRuns.pushFront(run);
+
   aBin->mNumRuns++;
   return run;
 }
 
+arena_run_t* arena_t::GetNonFullBinRun(arena_bin_t* aBin) {
+  auto mrf_head = aBin->mNonFullRuns.begin();
+  if (mrf_head) {
+    // Take the head; if we are about to fill it, remove it from our list.
+    arena_run_t* run = &(*mrf_head);
+    if (run->mNumFree == 1) {
+      aBin->mNonFullRuns.remove(run);
+    }
+    return run;
+  }
+  return GetNewEmptyBinRun(aBin);
+}
+
 void arena_bin_t::Init(SizeClass aSizeClass) {
   size_t try_run_size;
   unsigned try_nregs, try_mask_nelms, try_reg0_offset;
@@ -3716,8 +3720,6 @@ void arena_bin_t::Init(SizeClass aSizeClass) {
 
   try_run_size = gPageSize;
 
-  mCurrentRun = nullptr;
-  mNonFullRuns.Init();
   mSizeClass = aSizeClass.Size();
   mNumRuns = 0;
 
@@ -3781,6 +3783,10 @@ void arena_bin_t::Init(SizeClass aSizeClass) {
                       try_reg0_offset);
   MOZ_ASSERT((try_mask_nelms << (LOG2(sizeof(int)) + 3)) >= try_nregs);
 
+  // Our list management would break if mRunNumRegions == 1; a size class
+  // that big should use a large size class instead, anyway.
+  MOZ_ASSERT(try_nregs > 1);
+
   // Copy final settings.
   MOZ_ASSERT((try_run_size >> gPageSize2Pow) <= UINT8_MAX);
   mRunSizePages = static_cast<uint8_t>(try_run_size >> gPageSize2Pow);
@@ -3875,10 +3881,7 @@ void* arena_t::MallocSmall(size_t aSize, bool aZero) {
     MOZ_ASSERT(!mRandomizeSmallAllocations || mPRNG ||
                (mIsPRNGInitializing && !isInitializingThread));
 
-    run = bin->mCurrentRun;
-    if (MOZ_UNLIKELY(!run || run->mNumFree == 0)) {
-      run = bin->mCurrentRun = GetNonFullBinRun(bin);
-    }
+    run = GetNonFullBinRun(bin);
     if (MOZ_UNLIKELY(!run)) {
       return nullptr;
     }
@@ -4325,53 +4328,28 @@ arena_chunk_t* arena_t::DallocSmall(arena_chunk_t* aChunk, void* aPtr,
   arena_chunk_t* dealloc_chunk = nullptr;
 
   if (run->mNumFree == bin->mRunNumRegions) {
-    // Deallocate run.
-    if (run == bin->mCurrentRun) {
-      bin->mCurrentRun = nullptr;
-    } else if (bin->mRunNumRegions != 1) {
-      size_t run_pageind =
-          (uintptr_t(run) - uintptr_t(aChunk)) >> gPageSize2Pow;
-      arena_chunk_map_t* run_mapelm = &aChunk->map[run_pageind];
-
-      // This block's conditional is necessary because if the
-      // run only contains one region, then it never gets
-      // inserted into the non-full runs tree.
-      MOZ_DIAGNOSTIC_ASSERT(bin->mNonFullRuns.Search(run_mapelm) == run_mapelm);
-      bin->mNonFullRuns.Remove(run_mapelm);
-    }
+    // This run is entirely freed; remove it from our bin.
 #if defined(MOZ_DIAGNOSTIC_ASSERT_ENABLED)
     run->mMagic = 0;
 #endif
+    MOZ_ASSERT(bin->mNonFullRuns.ElementProbablyInList(run));
+    bin->mNonFullRuns.remove(run);
     dealloc_chunk = DallocRun(run, true);
     bin->mNumRuns--;
-  } else if (run->mNumFree == 1 && run != bin->mCurrentRun) {
-    // Make sure that bin->mCurrentRun always refers to the lowest
-    // non-full run, if one exists.
-    if (!bin->mCurrentRun) {
-      bin->mCurrentRun = run;
-    } else if (uintptr_t(run) < uintptr_t(bin->mCurrentRun)) {
-      // Switch mCurrentRun.
-      if (bin->mCurrentRun->mNumFree > 0) {
-        arena_chunk_t* runcur_chunk = GetChunkForPtr(bin->mCurrentRun);
-        size_t runcur_pageind =
-            (uintptr_t(bin->mCurrentRun) - uintptr_t(runcur_chunk)) >>
-            gPageSize2Pow;
-        arena_chunk_map_t* runcur_mapelm = &runcur_chunk->map[runcur_pageind];
-
-        // Insert runcur.
-        MOZ_DIAGNOSTIC_ASSERT(!bin->mNonFullRuns.Search(runcur_mapelm));
-        bin->mNonFullRuns.Insert(runcur_mapelm);
-      }
-      bin->mCurrentRun = run;
-    } else {
-      size_t run_pageind =
-          (uintptr_t(run) - uintptr_t(aChunk)) >> gPageSize2Pow;
-      arena_chunk_map_t* run_mapelm = &aChunk->map[run_pageind];
-
-      MOZ_DIAGNOSTIC_ASSERT(bin->mNonFullRuns.Search(run_mapelm) == nullptr);
-      bin->mNonFullRuns.Insert(run_mapelm);
-    }
+  } else if (run->mNumFree == 1) {
+    // This is the first slot we freed from this run; start tracking it.
+    MOZ_ASSERT(!bin->mNonFullRuns.ElementProbablyInList(run));
+    bin->mNonFullRuns.pushFront(run);
   }
+  // else we just keep the run in mNonFullRuns where it is.
+  // Note that we could move it to the head of the list here to get a strict
+  // "most-recently-freed" order, but some of our benchmarks seem to be more
+  // sensitive to the increased overhead this brings than to the supposedly
+  // slightly better order for keeping CPU caches warm.
+  // In general we cannot foresee the future, so any order we choose might
+  // perform differently for different use cases and needs to be balanced
+  // against the book-keeping overhead via measurements.
+
   mStats.allocated_small -= size;
 
   return dealloc_chunk;
@@ -4674,7 +4652,7 @@ arena_t::~arena_t() {
     chunk_dealloc(mSpare, kChunkSize, ARENA_CHUNK);
   }
   for (i = 0; i < NUM_SMALL_CLASSES; i++) {
-    MOZ_RELEASE_ASSERT(!mBins[i].mNonFullRuns.First(), "Bin is not empty");
+    MOZ_RELEASE_ASSERT(mBins[i].mNonFullRuns.isEmpty(), "Bin is not empty");
  }
 #ifdef MOZ_DEBUG
   {
@@ -5393,14 +5371,8 @@ inline void MozJemalloc::jemalloc_stats_internal(
       size_t bin_unused = 0;
       size_t num_non_full_runs = 0;
 
-      for (auto mapelm : bin->mNonFullRuns.iter()) {
-        arena_run_t* run = (arena_run_t*)(mapelm->bits & ~gPageSizeMask);
-        bin_unused += run->mNumFree * bin->mSizeClass;
-        num_non_full_runs++;
-      }
-
-      if (bin->mCurrentRun) {
-        bin_unused += bin->mCurrentRun->mNumFree * bin->mSizeClass;
+      for (auto& run : bin->mNonFullRuns) {
+        bin_unused += run.mNumFree * bin->mSizeClass;
         num_non_full_runs++;
       }
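
For readers not familiar with mfbt's intrusive lists, the sketch below models the book-keeping introduced above outside of mozjemalloc: a toy bin that tracks its non-full runs in a LIFO mozilla::DoublyLinkedList. Only DoublyLinkedList, DoublyLinkedListElement, the GetDoublyLinkedListElement trait and their pushFront/popFront/remove/isEmpty/begin operations are real mfbt APIs; ToyRun, ToyBin, TakeRegion, ReturnRegion and the fixed region count are invented for illustration, and the sketch assumes mozilla/DoublyLinkedList.h is available on the include path.

// Toy model of the patch's bin book-keeping; names are hypothetical.
#include <cassert>
#include <cstddef>

#include "mozilla/DoublyLinkedList.h"

struct ToyRun {
  size_t mNumFree = 4;  // Pretend every run has 4 regions.
  mozilla::DoublyLinkedListElement<ToyRun> mListElem;  // Intrusive linkage.
};

namespace mozilla {
// Customization point telling DoublyLinkedList where ToyRun's linkage lives,
// analogous to the GetDoublyLinkedListElement<arena_run_t> specialization.
template <>
struct GetDoublyLinkedListElement<ToyRun> {
  static DoublyLinkedListElement<ToyRun>& Get(ToyRun* aThis) {
    return aThis->mListElem;
  }
};
}  // namespace mozilla

struct ToyBin {
  static constexpr size_t kRunNumRegions = 4;
  mozilla::DoublyLinkedList<ToyRun> mNonFullRuns;

  // Roughly mirrors GetNonFullBinRun: take the most recently pushed run
  // (LIFO) and, if handing out its last free region, drop it from the list.
  // (The real code decrements mNumFree in MallocSmall and allocates a new
  // empty run when the list is empty.)
  ToyRun* TakeRegion() {
    auto head = mNonFullRuns.begin();
    if (!head) {
      return nullptr;
    }
    ToyRun* run = &(*head);
    if (run->mNumFree == 1) {
      mNonFullRuns.remove(run);
    }
    run->mNumFree--;
    return run;
  }

  // Roughly mirrors DallocSmall's book-keeping: a run re-enters the list when
  // its first region is freed and leaves it once it is entirely free again.
  void ReturnRegion(ToyRun* run) {
    run->mNumFree++;
    if (run->mNumFree == 1) {
      mNonFullRuns.pushFront(run);  // First free slot: start tracking (LIFO).
    } else if (run->mNumFree == kRunNumRegions) {
      mNonFullRuns.remove(run);  // Entirely free: would be deallocated.
    }
  }
};

int main() {
  ToyBin bin;
  ToyRun a, b;  // Two fresh, fully free runs made available to the bin.
  bin.mNonFullRuns.pushFront(&a);
  bin.mNonFullRuns.pushFront(&b);  // b is now the LIFO head.

  ToyRun* r = bin.TakeRegion();
  assert(r == &b);  // The most recently pushed run is reused first.

  bin.ReturnRegion(r);  // b becomes fully free again and leaves the list.
  assert(!bin.mNonFullRuns.isEmpty());  // a is still tracked.

  bin.mNonFullRuns.popFront();  // Unlink a before it goes out of scope.
  return 0;
}

The LIFO behaviour falls out of pushFront() plus begin(): the run that most recently had a region freed is the first one reused, while every list operation stays O(1), which is the trade-off described in the commit message.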