Bug 1669392 - Add more jemalloc size classes r=glandium

Differential Revision: https://phabricator.services.mozilla.com/D92729
2021-10-13 06:53:20 +00:00 · 2021-10-13 06:53:20 +00:00 · cc06afa32e
--- a/memory/build/Utils.h
+++ b/memory/build/Utils.h
@ -20,6 +20,20 @@ struct Log2 : mozilla::tl::CeilingLog2<N> {
 };
 #define LOG2(N) Log2<N>::value

+// Like Log2, but also accepts 0, for which the value is 0.
+template <size_t N>
+struct Log2Or0 : mozilla::tl::CeilingLog2<N> {
+  using mozilla::tl::CeilingLog2<N>::value;
+  static_assert(1ULL << value == N, "Number is not a power of 2");
+};
+template <>
+struct Log2Or0<0> {
+  // This makes no sense but neither does any other value.  It's just enough
+  // that this can be used on the unused side of a conditional expression.
+  static const size_t value = 0;
+};
+#define LOG2_OR_0(N) Log2Or0<N>::value
+
 enum class Order {
  eLess = -1,
  eEqual = 0,
--- a/memory/build/mozjemalloc.cpp
+++ b/memory/build/mozjemalloc.cpp
@ -52,40 +52,57 @@
 //
 // Allocation requests are rounded up to the nearest size class, and no record
 // of the original request size is maintained.  Allocations are broken into
-// categories according to size class.  Assuming runtime defaults, 4 kB pages
-// and a 16 byte quantum on a 32-bit system, the size classes in each category
-// are as follows:
+// categories according to size class.  Assuming runtime defaults, the size
+// classes in each category are as follows (for x86, x86_64 and Apple Silicon):
 //
-//   |=====================================|
-//   | Category | Subcategory    |    Size |
-//   |=====================================|
-//   | Small    | Tiny           |       4 |
-//   |          |                |       8 |
-//   |          |----------------+---------|
-//   |          | Quantum-spaced |      16 |
-//   |          |                |      32 |
-//   |          |                |      48 |
-//   |          |                |     ... |
-//   |          |                |     480 |
-//   |          |                |     496 |
-//   |          |                |     512 |
-//   |          |----------------+---------|
-//   |          | Sub-page       |    1 kB |
-//   |          |                |    2 kB |
-//   |=====================================|
-//   | Large                     |    4 kB |
-//   |                           |    8 kB |
-//   |                           |   12 kB |
-//   |                           |     ... |
-//   |                           | 1012 kB |
-//   |                           | 1016 kB |
-//   |                           | 1020 kB |
-//   |=====================================|
-//   | Huge                      |    1 MB |
-//   |                           |    2 MB |
-//   |                           |    3 MB |
-//   |                           |     ... |
-//   |=====================================|
+//   |===============================================================|
+//   | Category | Subcategory    |     x86 |  x86_64 | Apple Silicon |
+//   |---------------------------+---------+---------+---------------+
+//   | Word size                 |  32 bit |  64 bit |        64 bit |
+//   | Page size                 |    4 kB |    4 kB |         16 kB |
+//   |===============================================================|
+//   | Small    | Tiny           |     4/- |       - |             - |
+//   |          |                |       8 |     8/- |             8 |
+//   |          |----------------+---------|---------|---------------|
+//   |          | Quantum-spaced |      16 |      16 |            16 |
+//   |          |                |      32 |      32 |            32 |
+//   |          |                |      48 |      48 |            48 |
+//   |          |                |     ... |     ... |           ... |
+//   |          |                |     480 |     480 |           480 |
+//   |          |                |     496 |     496 |           496 |
+//   |          |----------------+---------|---------|---------------|
+//   |          | Quantum-wide-  |     512 |     512 |           512 |
+//   |          | spaced         |     768 |     768 |           768 |
+//   |          |                |     ... |     ... |           ... |
+//   |          |                |    3584 |    3584 |          3584 |
+//   |          |                |    3840 |    3840 |          3840 |
+//   |          |----------------+---------|---------|---------------|
+//   |          | Sub-page       |       - |       - |          4096 |
+//   |          |                |       - |       - |          8 kB |
+//   |===============================================================|
+//   | Large                     |    4 kB |    4 kB |             - |
+//   |                           |    8 kB |    8 kB |             - |
+//   |                           |   12 kB |   12 kB |             - |
+//   |                           |   16 kB |   16 kB |         16 kB |
+//   |                           |     ... |     ... |             - |
+//   |                           |   32 kB |   32 kB |         32 kB |
+//   |                           |     ... |     ... |           ... |
+//   |                           | 1008 kB | 1008 kB |       1008 kB |
+//   |                           | 1012 kB | 1012 kB |             - |
+//   |                           | 1016 kB | 1016 kB |             - |
+//   |                           | 1020 kB | 1020 kB |             - |
+//   |===============================================================|
+//   | Huge                      |    1 MB |    1 MB |          1 MB |
+//   |                           |    2 MB |    2 MB |          2 MB |
+//   |                           |    3 MB |    3 MB |          3 MB |
+//   |                           |     ... |     ... |           ... |
+//   |===============================================================|
+//
+// Legend:
+//   n:   Size class exists for this platform.
+//   n/-: This size class doesn't exist on Windows (see kMinTinyClass).
+//   -:   This size class doesn't exist for this platform.
+//   ...: Size classes follow a pattern here.
 //
 // NOTE: Due to Mozilla bug 691003, we cannot reserve less than one word for an
 // allocation on Linux or Mac.  So on 32-bit *nix, the smallest bucket size is
@ -377,6 +394,10 @@ struct arena_chunk_t {
 // negatively affect performance.
 static const size_t kCacheLineSize = 64;

+// Our size classes are inclusive ranges of memory sizes.  By describing the
+// minimums and how memory is allocated in each range the maximums can be
+// calculated.
+
 // Smallest size class to support.  On Windows the smallest allocation size
 // must be 8 bytes on 32-bit, 16 bytes on 64-bit.  On Linux and Mac, even
 // malloc(1) must reserve a word's worth of memory (see Mozilla bug 691003).
@ -389,28 +410,47 @@ static const size_t kMinTinyClass = sizeof(void*);
 // Maximum tiny size class.
 static const size_t kMaxTinyClass = 8;

-// Amount (quantum) separating quantum-spaced size classes.
-static const size_t kQuantum = 16;
-static const size_t kQuantumMask = kQuantum - 1;
-
 // Smallest quantum-spaced size classes. It could actually also be labelled a
 // tiny allocation, and is spaced as such from the largest tiny size class.
 // Tiny classes being powers of 2, this is twice as large as the largest of
 // them.
 static const size_t kMinQuantumClass = kMaxTinyClass * 2;
+static const size_t kMinQuantumWideClass = 512;
+static const size_t kMinSubPageClass = 4_KiB;

-// Largest quantum-spaced size classes.
-static const size_t kMaxQuantumClass = 512;
+// Amount (quantum) separating quantum-spaced size classes.
+static const size_t kQuantum = 16;
+static const size_t kQuantumMask = kQuantum - 1;
+static const size_t kQuantumWide = 256;
+static const size_t kQuantumWideMask = kQuantumWide - 1;
+
+static const size_t kMaxQuantumClass = kMinQuantumWideClass - kQuantum;
+static const size_t kMaxQuantumWideClass = kMinSubPageClass - kQuantumWide;
+
+// We can optimise some divisions to shifts if these are powers of two.
+static_assert(mozilla::IsPowerOfTwo(kQuantum),
+              "kQuantum is not a power of two");
+static_assert(mozilla::IsPowerOfTwo(kQuantumWide),
+              "kQuantumWide is not a power of two");

 static_assert(kMaxQuantumClass % kQuantum == 0,
              "kMaxQuantumClass is not a multiple of kQuantum");
+static_assert(kMaxQuantumWideClass % kQuantumWide == 0,
+              "kMaxQuantumWideClass is not a multiple of kQuantumWide");
+static_assert(kQuantum < kQuantumWide,
+              "kQuantum must be smaller than kQuantumWide");
+static_assert(mozilla::IsPowerOfTwo(kMinSubPageClass),
+              "kMinSubPageClass is not a power of two");

 // Number of (2^n)-spaced tiny classes.
 static const size_t kNumTinyClasses =
-    LOG2(kMinQuantumClass) - LOG2(kMinTinyClass);
+    LOG2(kMaxTinyClass) - LOG2(kMinTinyClass) + 1;

 // Number of quantum-spaced classes.
-static const size_t kNumQuantumClasses = kMaxQuantumClass / kQuantum;
+static const size_t kNumQuantumClasses =
+    (kMaxQuantumClass - kMinQuantumClass) / kQuantum + 1;
+static const size_t kNumQuantumWideClasses =
+    (kMaxQuantumWideClass - kMinQuantumWideClass) / kQuantumWide + 1;

 // Size and alignment of memory chunks that are allocated by the OS's virtual
 // memory system.
@ -443,6 +483,7 @@ static size_t gPageSize;
 #  define END_GLOBALS
 #  define DEFINE_GLOBAL(type) static const type
 #  define GLOBAL_LOG2 LOG2
+#  define GLOBAL_LOG2_OR_0 LOG2_OR_0
 #  define GLOBAL_ASSERT_HELPER1(x) static_assert(x, #  x)
 #  define GLOBAL_ASSERT_HELPER2(x, y) static_assert(x, y)
 #  define GLOBAL_ASSERT(...)                                               \
@ -455,6 +496,7 @@ static size_t gPageSize;
 #  define END_GLOBALS }
 #  define DEFINE_GLOBAL(type)
 #  define GLOBAL_LOG2 FloorLog2
+#  define GLOBAL_LOG2_OR_0 FloorLog2
 #  define GLOBAL_ASSERT MOZ_RELEASE_ASSERT
 #endif

@ -467,15 +509,21 @@ DECLARE_GLOBAL(size_t, gChunkHeaderNumPages)
 DECLARE_GLOBAL(size_t, gMaxLargeClass)

 DEFINE_GLOBALS
-// Largest sub-page size class.
-DEFINE_GLOBAL(size_t) gMaxSubPageClass = gPageSize / 2;
+
+// Largest sub-page size class, or zero if there are none
+DEFINE_GLOBAL(size_t)
+gMaxSubPageClass = gPageSize / 2 >= kMinSubPageClass ? gPageSize / 2 : 0;

 // Max size class for bins.
-#define gMaxBinClass gMaxSubPageClass
+#define gMaxBinClass \
+  (gMaxSubPageClass ? gMaxSubPageClass : kMaxQuantumWideClass)

-// Number of (2^n)-spaced sub-page bins.
+// Number of sub-page bins.
 DEFINE_GLOBAL(uint8_t)
-gNumSubPageClasses = GLOBAL_LOG2(gMaxSubPageClass) - LOG2(kMaxQuantumClass);
+gNumSubPageClasses =
+    static_cast<uint8_t>(gMaxSubPageClass ? GLOBAL_LOG2_OR_0(gMaxSubPageClass) -
+                                                LOG2(kMinSubPageClass) + 1
+                                          : 0);

 DEFINE_GLOBAL(uint8_t) gPageSize2Pow = GLOBAL_LOG2(gPageSize);
 DEFINE_GLOBAL(size_t) gPageSizeMask = gPageSize - 1;
@ -500,9 +548,16 @@ gMaxLargeClass =
 GLOBAL_ASSERT(1ULL << gPageSize2Pow == gPageSize,
              "Page size is not a power of two");
 GLOBAL_ASSERT(kQuantum >= sizeof(void*));
-GLOBAL_ASSERT(kQuantum <= gPageSize);
+GLOBAL_ASSERT(kQuantum <= kQuantumWide);
+GLOBAL_ASSERT(kQuantumWide <= (kMinSubPageClass - kMaxQuantumClass));
+
+GLOBAL_ASSERT(kQuantumWide <= kMaxQuantumClass);
+
+GLOBAL_ASSERT(gMaxSubPageClass >= kMinSubPageClass || gMaxSubPageClass == 0);
+GLOBAL_ASSERT(gMaxLargeClass >= gMaxSubPageClass);
 GLOBAL_ASSERT(kChunkSize >= gPageSize);
 GLOBAL_ASSERT(kQuantum * 4 <= kChunkSize);
+
 END_GLOBALS

 // Recycle at most 128 MiB of chunks. This means we retain at most
@ -526,13 +581,19 @@ static size_t opt_dirty_max = DIRTY_MAX_DEFAULT;

 // Return the smallest quantum multiple that is >= a.
 #define QUANTUM_CEILING(a) (((a) + (kQuantumMask)) & ~(kQuantumMask))
+#define QUANTUM_WIDE_CEILING(a) \
+  (((a) + (kQuantumWideMask)) & ~(kQuantumWideMask))
+
+// Return the smallest sub-page size that is >= a.
+#define SUBPAGE_CEILING(a) (RoundUpPow2(a))

 // Return the smallest pagesize multiple that is >= s.
 #define PAGE_CEILING(s) (((s) + gPageSizeMask) & ~gPageSizeMask)

 // Number of all the small-allocated classes
 #define NUM_SMALL_CLASSES                                          \
-  (kNumTinyClasses + kNumQuantumClasses + gNumSubPageClasses)
+  (kNumTinyClasses + kNumQuantumClasses + kNumQuantumWideClasses + \
+   gNumSubPageClasses)

 // ***************************************************************************
 // MALLOC_DECOMMIT and MALLOC_DOUBLE_PURGE are mutually exclusive.
@ -658,6 +719,7 @@ class SizeClass {
  enum ClassType {
    Tiny,
    Quantum,
+    QuantumWide,
    SubPage,
    Large,
  };
@ -669,9 +731,12 @@ class SizeClass {
    } else if (aSize <= kMaxQuantumClass) {
      mType = Quantum;
      mSize = QUANTUM_CEILING(aSize);
+    } else if (aSize <= kMaxQuantumWideClass) {
+      mType = QuantumWide;
+      mSize = QUANTUM_WIDE_CEILING(aSize);
    } else if (aSize <= gMaxSubPageClass) {
      mType = SubPage;
-      mSize = RoundUpPow2(aSize);
+      mSize = SUBPAGE_CEILING(aSize);
    } else if (aSize <= gMaxLargeClass) {
      mType = Large;
      mSize = PAGE_CEILING(aSize);
@ -878,7 +943,10 @@ struct arena_bin_t {
  //   304  12 KiB    320  12 KiB    336   4 KiB    352   8 KiB
  //   368   4 KiB    384   8 KiB    400  20 KiB    416  16 KiB
  //   432  12 KiB    448   4 KiB    464  16 KiB    480   8 KiB
-  //   496  20 KiB    512  32 KiB   1024  64 KiB   2048 128 KiB
+  //   496  20 KiB    512  32 KiB    768  16 KiB   1024  64 KiB
+  //  1280  24 KiB   1536  32 KiB   1792  16 KiB   2048 128 KiB
+  //  2304  16 KiB   2560  48 KiB   2816  36 KiB   3072  64 KiB
+  //  3328  36 KiB   3584  32 KiB   3840  64 KiB
  inline void Init(SizeClass aSizeClass);
 };

@ -972,8 +1040,12 @@ struct arena_t {
  //       33  |  496 |
  //       34  |  512 |
  //   --------+------+
-  //       35  | 1024 |
-  //       36  | 2048 |
+  //       35  |  768 |
+  //       36  | 1024 |
+  //           :      :
+  //           :      :
+  //       46  | 3584 |
+  //       47  | 3840 |
  //   --------+------+
  arena_bin_t mBins[1];  // Dynamically sized.

@ -2821,11 +2893,21 @@ void* arena_t::MallocSmall(size_t aSize, bool aZero) {
      bin = &mBins[FloorLog2(aSize / kMinTinyClass)];
      break;
    case SizeClass::Quantum:
-      bin = &mBins[kNumTinyClasses + (aSize / kQuantum) - 1];
+      // Although we divide 2 things by kQuantum, the compiler will
+      // reduce `kMinQuantumClass / kQuantum` and `kNumTinyClasses` to a
+      // single constant.
+      bin = &mBins[kNumTinyClasses + (aSize / kQuantum) -
+                   (kMinQuantumClass / kQuantum)];
+      break;
+    case SizeClass::QuantumWide:
+      bin =
+          &mBins[kNumTinyClasses + kNumQuantumClasses + (aSize / kQuantumWide) -
+                 (kMinQuantumWideClass / kQuantumWide)];
      break;
    case SizeClass::SubPage:
-      bin = &mBins[kNumTinyClasses + kNumQuantumClasses +
-                   (FloorLog2(aSize / kMaxQuantumClass) - 1)];
+      bin =
+          &mBins[kNumTinyClasses + kNumQuantumClasses + kNumQuantumWideClasses +
+                 (FloorLog2(aSize) - LOG2(kMinSubPageClass))];
      break;
    default:
      MOZ_MAKE_COMPILER_ASSUME_IS_UNREACHABLE("Unexpected size class type");
@ -3558,8 +3640,8 @@ arena_t::arena_t(arena_params_t* aParams, bool aIsPrivate) {
    arena_bin_t& bin = mBins[i];
    bin.Init(sizeClass);

-    // SizeClass doesn't want sizes larger than gMaxSubPageClass for now.
-    if (sizeClass.Size() == gMaxSubPageClass) {
+    // SizeClass doesn't want sizes larger than gMaxBinClass for now.
+    if (sizeClass.Size() == gMaxBinClass) {
      break;
    }
    sizeClass = sizeClass.Next();
@ -4253,6 +4335,9 @@ inline void MozJemalloc::jemalloc_stats_internal(
  aStats->opt_zero = opt_zero;
  aStats->quantum = kQuantum;
  aStats->quantum_max = kMaxQuantumClass;
+  aStats->quantum_wide = kQuantumWide;
+  aStats->quantum_wide_max = kMaxQuantumWideClass;
+  aStats->subpage_max = gMaxSubPageClass;
  aStats->large_max = gMaxLargeClass;
  aStats->chunksize = kChunkSize;
  aStats->page_size = gPageSize;
--- a/memory/build/mozjemalloc_types.h
+++ b/memory/build/mozjemalloc_types.h
@ -82,7 +82,9 @@ typedef struct {
  size_t narenas;           // Number of arenas.
  size_t quantum;           // Allocation quantum.
  size_t quantum_max;       // Max quantum-spaced allocation size.
-  // The next size class, sub-pagesize's max is always page_size/2.
+  size_t quantum_wide;      // Allocation quantum (QuantumWide).
+  size_t quantum_wide_max;  // Max quantum-wide-spaced allocation size.
+  size_t subpage_max;       // Max subpage allocation size.
  size_t large_max;         // Max sub-chunksize allocation size.
  size_t chunksize;         // Size of each virtual memory mapping.
  size_t page_size;         // Size of pages.
@ -111,7 +113,8 @@ typedef struct {
  size_t bytes_per_run;      // The number of bytes per run, including headers.
 } jemalloc_bin_stats_t;

-#define JEMALLOC_MAX_STATS_BINS 40
+// Upper bound on the total number of bins.
+#define JEMALLOC_MAX_STATS_BINS 51

 enum PtrInfoTag {
  // The pointer is not currently known to the allocator.
--- a/memory/gtest/TestJemalloc.cpp
+++ b/memory/gtest/TestJemalloc.cpp
@ -136,9 +136,10 @@ TEST(Jemalloc, PtrInfo)
  jemalloc_ptr_info_t info;
  Vector<char*> small, large, huge;

-  // For small (<= 2KiB) allocations, test every position within many possible
-  // sizes.
-  size_t small_max = stats.page_size / 2;
+  // For small (at most small_max bytes) allocations, test every position
+  // within many possible sizes.
+  size_t small_max =
+      stats.subpage_max ? stats.subpage_max : stats.quantum_wide_max;
  for (size_t n = 0; n <= small_max; n += 8) {
    auto p = (char*)moz_arena_malloc(arenaId, n);
    size_t usable = moz_malloc_size_of(p);
@ -149,7 +150,7 @@ TEST(Jemalloc, PtrInfo)
    }
  }

-  // Similar for large (2KiB + 1 KiB .. 1MiB - 8KiB) allocations.
+  // Similar for large (small_max + 1 KiB .. 1MiB - 8KiB) allocations.
  for (size_t n = small_max + 1_KiB; n <= stats.large_max; n += 1_KiB) {
    auto p = (char*)moz_arena_malloc(arenaId, n);
    size_t usable = moz_malloc_size_of(p);
@ -199,7 +200,7 @@ TEST(Jemalloc, PtrInfo)
  // the former.
  ASSERT_TRUE(isFreedAlloc != 0);
  ASSERT_TRUE(isFreedPage != 0);
-  ASSERT_TRUE(isFreedAlloc / isFreedPage > 10);
+  ASSERT_TRUE(isFreedAlloc / isFreedPage > 8);

  // Free the large allocations and recheck them.
  len = large.length();
@ -277,7 +278,7 @@ TEST(Jemalloc, PtrInfo)
  moz_dispose_arena(arenaId);
 }

-size_t sSizes[] = {1,      42,      79,      918,     1.5_KiB,
+size_t sSizes[] = {1,      42,      79,      918,     1.4_KiB,
                   73_KiB, 129_KiB, 1.1_MiB, 2.6_MiB, 5.1_MiB};

 TEST(Jemalloc, Arenas)
--- a/memory/replace/logalloc/replay/Replay.cpp
+++ b/memory/replace/logalloc/replay/Replay.cpp
@ -802,7 +802,8 @@ class Replay {
          num_sloppy_objects++;
        }

-        if (used <= stats.page_size / 2) {
+        if (used <=
+            (stats.subpage_max ? stats.subpage_max : stats.quantum_wide_max)) {
          // We know that this is an inefficient linear search, but there's a
          // small number of bins and this is simple.
          for (unsigned i = 0; i < JEMALLOC_MAX_STATS_BINS; i++) {
@ -844,7 +845,8 @@ class Replay {
    FdPrintf(mStdErr, "bookkeep:         %9zu\n", stats.bookkeeping);
    FdPrintf(mStdErr, "bin-unused:       %9zu\n", stats.bin_unused);
    FdPrintf(mStdErr, "quantum-max:      %9zu\n", stats.quantum_max);
-    FdPrintf(mStdErr, "subpage-max:  %9zu\n", stats.page_size / 2);
+    FdPrintf(mStdErr, "quantum-wide-max: %9zu\n", stats.quantum_wide_max);
+    FdPrintf(mStdErr, "subpage-max:      %9zu\n", stats.subpage_max);
    FdPrintf(mStdErr, "large-max:        %9zu\n", stats.large_max);
    if (mCalculateSlop) {
      size_t slop = mTotalAllocatedSize - mTotalRequestedSize;
@ -916,6 +918,9 @@ class Replay {
      } else if (bin.size <= stats.quantum_max) {
        // 4 buckets, (4 bytes per bucket with a 16 byte quantum).
        dist = Distribution(bin.size, last_size, stats.quantum / 4);
+      } else if (bin.size <= stats.quantum_wide_max) {
+        // 8 buckets, (32 bytes per bucket with a 256 byte quantum-wide).
+        dist = Distribution(bin.size, last_size, stats.quantum_wide / 8);
      } else {
        // 16 buckets.
        dist = Distribution(bin.size, last_size, (bin.size - last_size) / 16);