diff --git a/memory/build/Utils.h b/memory/build/Utils.h
index 6032bf23d0c6..76d180d22eb8 100644
--- a/memory/build/Utils.h
+++ b/memory/build/Utils.h
@@ -20,6 +20,20 @@ struct Log2 : mozilla::tl::CeilingLog2<N> {
 };
 #define LOG2(N) Log2<N>::value
 
+// Like Log2, but ignores 0.
+template <size_t N>
+struct Log2Or0 : mozilla::tl::CeilingLog2<N> {
+  using mozilla::tl::CeilingLog2<N>::value;
+  static_assert(1ULL << value == N, "Number is not a power of 2");
+};
+template <>
+struct Log2Or0<0> {
+  // This makes no sense but neither does any other value. It's just enough
+  // that this can be used on the unused side of a conditional expression.
+  static const size_t value = 0;
+};
+#define LOG2_OR_0(N) Log2Or0<N>::value
+
 enum class Order {
   eLess = -1,
   eEqual = 0,
diff --git a/memory/build/mozjemalloc.cpp b/memory/build/mozjemalloc.cpp
index 1947da229c93..022332399c36 100644
--- a/memory/build/mozjemalloc.cpp
+++ b/memory/build/mozjemalloc.cpp
@@ -52,40 +52,57 @@
 //
 // Allocation requests are rounded up to the nearest size class, and no record
 // of the original request size is maintained. Allocations are broken into
-// categories according to size class. Assuming runtime defaults, 4 kB pages
-// and a 16 byte quantum on a 32-bit system, the size classes in each category
-// are as follows:
+// categories according to size class. Assuming runtime defaults, the size
+// classes in each category are as follows (for x86, x86_64 and Apple Silicon):
 //
-// |=====================================|
-// | Category | Subcategory    |    Size |
-// |=====================================|
-// | Small    | Tiny           |       4 |
-// |          |                |       8 |
-// |          |----------------+---------|
-// |          | Quantum-spaced |      16 |
-// |          |                |      32 |
-// |          |                |      48 |
-// |          |                |     ... |
-// |          |                |     480 |
-// |          |                |     496 |
-// |          |                |     512 |
-// |          |----------------+---------|
-// |          | Sub-page       |    1 kB |
-// |          |                |    2 kB |
-// |=====================================|
-// | Large                     |    4 kB |
-// |                           |    8 kB |
-// |                           |   12 kB |
-// |                           |     ... |
-// |                           | 1012 kB |
-// |                           | 1016 kB |
-// |                           | 1020 kB |
-// |=====================================|
-// | Huge                      |    1 MB |
-// |                           |    2 MB |
-// |                           |    3 MB |
-// |                           |     ... |
-// |=====================================|
+// |===============================================================|
+// | Category | Subcategory    |   x86   | x86_64  | Apple Silicon |
+// |---------------------------+---------+---------+---------------+
+// |                 Word size |  32 bit |  64 bit |     64 bit    |
+// |                 Page size |    4 kB |    4 kB |     16 kB     |
+// |===============================================================|
+// | Small    | Tiny           |     4/- |       - |        -      |
+// |          |                |       8 |     8/- |        8      |
+// |          |----------------+---------|---------|---------------|
+// |          | Quantum-spaced |      16 |      16 |       16      |
+// |          |                |      32 |      32 |       32      |
+// |          |                |      48 |      48 |       48      |
+// |          |                |     ... |     ... |      ...      |
+// |          |                |     480 |     480 |      480      |
+// |          |                |     496 |     496 |      496      |
+// |          |----------------+---------|---------|---------------|
+// |          | Quantum-wide-  |     512 |     512 |      512      |
+// |          | spaced         |     768 |     768 |      768      |
+// |          |                |     ... |     ... |      ...      |
+// |          |                |    3584 |    3584 |     3584      |
+// |          |                |    3840 |    3840 |     3840      |
+// |          |----------------+---------|---------|---------------|
+// |          | Sub-page       |       - |       - |     4096      |
+// |          |                |       - |       - |     8 kB      |
+// |===============================================================|
+// | Large                     |    4 kB |    4 kB |        -      |
+// |                           |    8 kB |    8 kB |        -      |
+// |                           |   12 kB |   12 kB |        -      |
+// |                           |   16 kB |   16 kB |    16 kB      |
+// |                           |     ... |     ... |        -      |
+// |                           |   32 kB |   32 kB |    32 kB      |
+// |                           |     ... |     ... |      ...      |
+// |                           | 1008 kB | 1008 kB |  1008 kB      |
+// |                           | 1012 kB | 1012 kB |        -      |
+// |                           | 1016 kB | 1016 kB |        -      |
+// |                           | 1020 kB | 1020 kB |        -      |
+// |===============================================================|
+// | Huge                      |    1 MB |    1 MB |     1 MB      |
+// |                           |    2 MB |    2 MB |     2 MB      |
+// |                           |    3 MB |    3 MB |     3 MB      |
+// |                           |     ... |     ... |      ...      |
+// |===============================================================|
+//
+// Legend:
+//   n:   Size class exists for this platform.
+//   n/-: This size class doesn't exist on Windows (see kMinTinyClass).
+// -: This size class doesn't exist for this platform. +// ...: Size classes follow a pattern here. // // NOTE: Due to Mozilla bug 691003, we cannot reserve less than one word for an // allocation on Linux or Mac. So on 32-bit *nix, the smallest bucket size is @@ -377,6 +394,10 @@ struct arena_chunk_t { // negatively affect performance. static const size_t kCacheLineSize = 64; +// Our size classes are inclusive ranges of memory sizes. By describing the +// minimums and how memory is allocated in each range the maximums can be +// calculated. + // Smallest size class to support. On Windows the smallest allocation size // must be 8 bytes on 32-bit, 16 bytes on 64-bit. On Linux and Mac, even // malloc(1) must reserve a word's worth of memory (see Mozilla bug 691003). @@ -389,28 +410,47 @@ static const size_t kMinTinyClass = sizeof(void*); // Maximum tiny size class. static const size_t kMaxTinyClass = 8; -// Amount (quantum) separating quantum-spaced size classes. -static const size_t kQuantum = 16; -static const size_t kQuantumMask = kQuantum - 1; - // Smallest quantum-spaced size classes. It could actually also be labelled a // tiny allocation, and is spaced as such from the largest tiny size class. // Tiny classes being powers of 2, this is twice as large as the largest of // them. static const size_t kMinQuantumClass = kMaxTinyClass * 2; +static const size_t kMinQuantumWideClass = 512; +static const size_t kMinSubPageClass = 4_KiB; -// Largest quantum-spaced size classes. -static const size_t kMaxQuantumClass = 512; +// Amount (quantum) separating quantum-spaced size classes. 
+static const size_t kQuantum = 16; +static const size_t kQuantumMask = kQuantum - 1; +static const size_t kQuantumWide = 256; +static const size_t kQuantumWideMask = kQuantumWide - 1; + +static const size_t kMaxQuantumClass = kMinQuantumWideClass - kQuantum; +static const size_t kMaxQuantumWideClass = kMinSubPageClass - kQuantumWide; + +// We can optimise some divisions to shifts if these are powers of two. +static_assert(mozilla::IsPowerOfTwo(kQuantum), + "kQuantum is not a power of two"); +static_assert(mozilla::IsPowerOfTwo(kQuantumWide), + "kQuantumWide is not a power of two"); static_assert(kMaxQuantumClass % kQuantum == 0, "kMaxQuantumClass is not a multiple of kQuantum"); +static_assert(kMaxQuantumWideClass % kQuantumWide == 0, + "kMaxQuantumWideClass is not a multiple of kQuantumWide"); +static_assert(kQuantum < kQuantumWide, + "kQuantum must be smaller than kQuantumWide"); +static_assert(mozilla::IsPowerOfTwo(kMinSubPageClass), + "kMinSubPageClass is not a power of two"); // Number of (2^n)-spaced tiny classes. static const size_t kNumTinyClasses = - LOG2(kMinQuantumClass) - LOG2(kMinTinyClass); + LOG2(kMaxTinyClass) - LOG2(kMinTinyClass) + 1; // Number of quantum-spaced classes. -static const size_t kNumQuantumClasses = kMaxQuantumClass / kQuantum; +static const size_t kNumQuantumClasses = + (kMaxQuantumClass - kMinQuantumClass) / kQuantum + 1; +static const size_t kNumQuantumWideClasses = + (kMaxQuantumWideClass - kMinQuantumWideClass) / kQuantumWide + 1; // Size and alignment of memory chunks that are allocated by the OS's virtual // memory system. @@ -443,6 +483,7 @@ static size_t gPageSize; # define END_GLOBALS # define DEFINE_GLOBAL(type) static const type # define GLOBAL_LOG2 LOG2 +# define GLOBAL_LOG2_OR_0 LOG2_OR_0 # define GLOBAL_ASSERT_HELPER1(x) static_assert(x, # x) # define GLOBAL_ASSERT_HELPER2(x, y) static_assert(x, y) # define GLOBAL_ASSERT(...) 
\
@@ -455,6 +496,7 @@ static size_t gPageSize;
 # define END_GLOBALS }
 # define DEFINE_GLOBAL(type)
 # define GLOBAL_LOG2 FloorLog2
+# define GLOBAL_LOG2_OR_0 FloorLog2
 # define GLOBAL_ASSERT MOZ_RELEASE_ASSERT
 #endif
@@ -467,15 +509,21 @@ DECLARE_GLOBAL(size_t, gChunkHeaderNumPages)
 DECLARE_GLOBAL(size_t, gMaxLargeClass)
 
 DEFINE_GLOBALS
-// Largest sub-page size class.
-DEFINE_GLOBAL(size_t) gMaxSubPageClass = gPageSize / 2;
+
+// Largest sub-page size class, or zero if there are none
+DEFINE_GLOBAL(size_t)
+gMaxSubPageClass = gPageSize / 2 >= kMinSubPageClass ? gPageSize / 2 : 0;
 
 // Max size class for bins.
-#define gMaxBinClass gMaxSubPageClass
+#define gMaxBinClass \
+  (gMaxSubPageClass ? gMaxSubPageClass : kMaxQuantumWideClass)
 
-// Number of (2^n)-spaced sub-page bins.
+// Number of sub-page bins.
 DEFINE_GLOBAL(uint8_t)
-gNumSubPageClasses = GLOBAL_LOG2(gMaxSubPageClass) - LOG2(kMaxQuantumClass);
+gNumSubPageClasses =
+    static_cast<uint8_t>(gMaxSubPageClass ? GLOBAL_LOG2_OR_0(gMaxSubPageClass) -
+                                                LOG2(kMinSubPageClass) + 1
+                                          : 0);
 
 DEFINE_GLOBAL(uint8_t) gPageSize2Pow = GLOBAL_LOG2(gPageSize);
 DEFINE_GLOBAL(size_t) gPageSizeMask = gPageSize - 1;
@@ -500,9 +548,16 @@ gMaxLargeClass =
 GLOBAL_ASSERT(1ULL << gPageSize2Pow == gPageSize,
               "Page size is not a power of two");
 GLOBAL_ASSERT(kQuantum >= sizeof(void*));
-GLOBAL_ASSERT(kQuantum <= gPageSize);
+GLOBAL_ASSERT(kQuantum <= kQuantumWide);
+GLOBAL_ASSERT(kQuantumWide <= (kMinSubPageClass - kMaxQuantumClass));
+
+GLOBAL_ASSERT(kQuantumWide <= kMaxQuantumClass);
+
+GLOBAL_ASSERT(gMaxSubPageClass >= kMinSubPageClass || gMaxSubPageClass == 0);
+GLOBAL_ASSERT(gMaxLargeClass >= gMaxSubPageClass);
 GLOBAL_ASSERT(kChunkSize >= gPageSize);
 GLOBAL_ASSERT(kQuantum * 4 <= kChunkSize);
+
 END_GLOBALS
 
 // Recycle at most 128 MiB of chunks. This means we retain at most
@@ -526,13 +581,19 @@ static size_t opt_dirty_max = DIRTY_MAX_DEFAULT;
 
 // Return the smallest quantum multiple that is >= a.
#define QUANTUM_CEILING(a) (((a) + (kQuantumMask)) & ~(kQuantumMask)) +#define QUANTUM_WIDE_CEILING(a) \ + (((a) + (kQuantumWideMask)) & ~(kQuantumWideMask)) + +// Return the smallest sub page-size that is >= a. +#define SUBPAGE_CEILING(a) (RoundUpPow2(a)) // Return the smallest pagesize multiple that is >= s. #define PAGE_CEILING(s) (((s) + gPageSizeMask) & ~gPageSizeMask) // Number of all the small-allocated classes -#define NUM_SMALL_CLASSES \ - (kNumTinyClasses + kNumQuantumClasses + gNumSubPageClasses) +#define NUM_SMALL_CLASSES \ + (kNumTinyClasses + kNumQuantumClasses + kNumQuantumWideClasses + \ + gNumSubPageClasses) // *************************************************************************** // MALLOC_DECOMMIT and MALLOC_DOUBLE_PURGE are mutually exclusive. @@ -658,6 +719,7 @@ class SizeClass { enum ClassType { Tiny, Quantum, + QuantumWide, SubPage, Large, }; @@ -669,9 +731,12 @@ class SizeClass { } else if (aSize <= kMaxQuantumClass) { mType = Quantum; mSize = QUANTUM_CEILING(aSize); + } else if (aSize <= kMaxQuantumWideClass) { + mType = QuantumWide; + mSize = QUANTUM_WIDE_CEILING(aSize); } else if (aSize <= gMaxSubPageClass) { mType = SubPage; - mSize = RoundUpPow2(aSize); + mSize = SUBPAGE_CEILING(aSize); } else if (aSize <= gMaxLargeClass) { mType = Large; mSize = PAGE_CEILING(aSize); @@ -878,7 +943,10 @@ struct arena_bin_t { // 304 12 KiB 320 12 KiB 336 4 KiB 352 8 KiB // 368 4 KiB 384 8 KiB 400 20 KiB 416 16 KiB // 432 12 KiB 448 4 KiB 464 16 KiB 480 8 KiB - // 496 20 KiB 512 32 KiB 1024 64 KiB 2048 128 KiB + // 496 20 KiB 512 32 KiB 768 16 KiB 1024 64 KiB + // 1280 24 KiB 1536 32 KiB 1792 16 KiB 2048 128 KiB + // 2304 16 KiB 2560 48 KiB 2816 36 KiB 3072 64 KiB + // 3328 36 KiB 3584 32 KiB 3840 64 KiB inline void Init(SizeClass aSizeClass); }; @@ -972,8 +1040,12 @@ struct arena_t { // 33 | 496 | // 34 | 512 | // --------+------+ - // 35 | 1024 | - // 36 | 2048 | + // 35 | 768 | + // 36 | 1024 | + // : : + // : : + // 46 | 3584 | + // 47 | 3840 | 
// --------+------+ arena_bin_t mBins[1]; // Dynamically sized. @@ -2821,11 +2893,21 @@ void* arena_t::MallocSmall(size_t aSize, bool aZero) { bin = &mBins[FloorLog2(aSize / kMinTinyClass)]; break; case SizeClass::Quantum: - bin = &mBins[kNumTinyClasses + (aSize / kQuantum) - 1]; + // Although we divide 2 things by kQuantum, the compiler will + // reduce `kMinQuantumClass / kQuantum` and `kNumTinyClasses` to a + // single constant. + bin = &mBins[kNumTinyClasses + (aSize / kQuantum) - + (kMinQuantumClass / kQuantum)]; + break; + case SizeClass::QuantumWide: + bin = + &mBins[kNumTinyClasses + kNumQuantumClasses + (aSize / kQuantumWide) - + (kMinQuantumWideClass / kQuantumWide)]; break; case SizeClass::SubPage: - bin = &mBins[kNumTinyClasses + kNumQuantumClasses + - (FloorLog2(aSize / kMaxQuantumClass) - 1)]; + bin = + &mBins[kNumTinyClasses + kNumQuantumClasses + kNumQuantumWideClasses + + (FloorLog2(aSize) - LOG2(kMinSubPageClass))]; break; default: MOZ_MAKE_COMPILER_ASSUME_IS_UNREACHABLE("Unexpected size class type"); @@ -3558,8 +3640,8 @@ arena_t::arena_t(arena_params_t* aParams, bool aIsPrivate) { arena_bin_t& bin = mBins[i]; bin.Init(sizeClass); - // SizeClass doesn't want sizes larger than gMaxSubPageClass for now. - if (sizeClass.Size() == gMaxSubPageClass) { + // SizeClass doesn't want sizes larger than gMaxBinClass for now. 
+    if (sizeClass.Size() == gMaxBinClass) {
       break;
     }
     sizeClass = sizeClass.Next();
@@ -4253,6 +4335,9 @@ inline void MozJemalloc::jemalloc_stats_internal(
   aStats->opt_zero = opt_zero;
   aStats->quantum = kQuantum;
   aStats->quantum_max = kMaxQuantumClass;
+  aStats->quantum_wide = kQuantumWide;
+  aStats->quantum_wide_max = kMaxQuantumWideClass;
+  aStats->subpage_max = gMaxSubPageClass;
   aStats->large_max = gMaxLargeClass;
   aStats->chunksize = kChunkSize;
   aStats->page_size = gPageSize;
diff --git a/memory/build/mozjemalloc_types.h b/memory/build/mozjemalloc_types.h
index 14c2641fff7f..97c41095c64d 100644
--- a/memory/build/mozjemalloc_types.h
+++ b/memory/build/mozjemalloc_types.h
@@ -77,16 +77,18 @@ typedef struct arena_params_s {
 // file.
 typedef struct {
   // Run-time configuration settings.
-  bool opt_junk;     // Fill allocated memory with kAllocJunk?
-  bool opt_zero;     // Fill allocated memory with 0x0?
-  size_t narenas;    // Number of arenas.
-  size_t quantum;    // Allocation quantum.
-  size_t quantum_max; // Max quantum-spaced allocation size.
-  // The next size class, sub-pagesize's max is always page_size/2.
-  size_t large_max;  // Max sub-chunksize allocation size.
-  size_t chunksize;  // Size of each virtual memory mapping.
-  size_t page_size;  // Size of pages.
-  size_t dirty_max;  // Max dirty pages per arena.
+  bool opt_junk;            // Fill allocated memory with kAllocJunk?
+  bool opt_zero;            // Fill allocated memory with 0x0?
+  size_t narenas;           // Number of arenas.
+  size_t quantum;           // Allocation quantum.
+  size_t quantum_max;       // Max quantum-spaced allocation size.
+  size_t quantum_wide;      // Allocation quantum (QuantumWide).
+  size_t quantum_wide_max;  // Max quantum-wide-spaced allocation size.
+  size_t subpage_max;       // Max subpage allocation size.
+  size_t large_max;         // Max sub-chunksize allocation size.
+  size_t chunksize;         // Size of each virtual memory mapping.
+  size_t page_size;         // Size of pages.
+  size_t dirty_max;         // Max dirty pages per arena.
   // Current memory usage statistics.
size_t mapped; // Bytes mapped (not necessarily committed). @@ -111,7 +113,8 @@ typedef struct { size_t bytes_per_run; // The number of bytes per run, including headers. } jemalloc_bin_stats_t; -#define JEMALLOC_MAX_STATS_BINS 40 +// This is the total number of bins. +#define JEMALLOC_MAX_STATS_BINS 51 enum PtrInfoTag { // The pointer is not currently known to the allocator. diff --git a/memory/gtest/TestJemalloc.cpp b/memory/gtest/TestJemalloc.cpp index 3452a2550d09..ba1cdb4747bc 100644 --- a/memory/gtest/TestJemalloc.cpp +++ b/memory/gtest/TestJemalloc.cpp @@ -136,9 +136,10 @@ TEST(Jemalloc, PtrInfo) jemalloc_ptr_info_t info; Vector small, large, huge; - // For small (<= 2KiB) allocations, test every position within many possible - // sizes. - size_t small_max = stats.page_size / 2; + // For small (less than half the page size) allocations, test every position + // within many possible sizes. + size_t small_max = + stats.subpage_max ? stats.subpage_max : stats.quantum_wide_max; for (size_t n = 0; n <= small_max; n += 8) { auto p = (char*)moz_arena_malloc(arenaId, n); size_t usable = moz_malloc_size_of(p); @@ -149,7 +150,7 @@ TEST(Jemalloc, PtrInfo) } } - // Similar for large (2KiB + 1 KiB .. 1MiB - 8KiB) allocations. + // Similar for large (small_max + 1 KiB .. 1MiB - 8KiB) allocations. for (size_t n = small_max + 1_KiB; n <= stats.large_max; n += 1_KiB) { auto p = (char*)moz_arena_malloc(arenaId, n); size_t usable = moz_malloc_size_of(p); @@ -199,7 +200,7 @@ TEST(Jemalloc, PtrInfo) // the former. ASSERT_TRUE(isFreedAlloc != 0); ASSERT_TRUE(isFreedPage != 0); - ASSERT_TRUE(isFreedAlloc / isFreedPage > 10); + ASSERT_TRUE(isFreedAlloc / isFreedPage > 8); // Free the large allocations and recheck them. 
len = large.length(); @@ -277,7 +278,7 @@ TEST(Jemalloc, PtrInfo) moz_dispose_arena(arenaId); } -size_t sSizes[] = {1, 42, 79, 918, 1.5_KiB, +size_t sSizes[] = {1, 42, 79, 918, 1.4_KiB, 73_KiB, 129_KiB, 1.1_MiB, 2.6_MiB, 5.1_MiB}; TEST(Jemalloc, Arenas) diff --git a/memory/replace/logalloc/replay/Replay.cpp b/memory/replace/logalloc/replay/Replay.cpp index 0ae160a2f5cc..201bdfe85420 100644 --- a/memory/replace/logalloc/replay/Replay.cpp +++ b/memory/replace/logalloc/replay/Replay.cpp @@ -802,7 +802,8 @@ class Replay { num_sloppy_objects++; } - if (used <= stats.page_size / 2) { + if (used <= + (stats.subpage_max ? stats.subpage_max : stats.quantum_wide_max)) { // We know that this is an inefficient linear search, but there's a // small number of bins and this is simple. for (unsigned i = 0; i < JEMALLOC_MAX_STATS_BINS; i++) { @@ -828,24 +829,25 @@ class Replay { stats.bookkeeping + stats.bin_unused; FdPrintf(mStdErr, "\n"); - FdPrintf(mStdErr, "Objects: %9zu\n", num_objects); - FdPrintf(mStdErr, "Slots: %9zu\n", mNumUsedSlots); - FdPrintf(mStdErr, "Ops: %9zu\n", mOps); - FdPrintf(mStdErr, "mapped: %9zu\n", stats.mapped); - FdPrintf(mStdErr, "committed: %9zu\n", committed); + FdPrintf(mStdErr, "Objects: %9zu\n", num_objects); + FdPrintf(mStdErr, "Slots: %9zu\n", mNumUsedSlots); + FdPrintf(mStdErr, "Ops: %9zu\n", mOps); + FdPrintf(mStdErr, "mapped: %9zu\n", stats.mapped); + FdPrintf(mStdErr, "committed: %9zu\n", committed); #ifdef XP_LINUX if (rss) { - FdPrintf(mStdErr, "rss: %9zu\n", rss); + FdPrintf(mStdErr, "rss: %9zu\n", rss); } #endif - FdPrintf(mStdErr, "allocated: %9zu\n", stats.allocated); - FdPrintf(mStdErr, "waste: %9zu\n", stats.waste); - FdPrintf(mStdErr, "dirty: %9zu\n", stats.page_cache); - FdPrintf(mStdErr, "bookkeep: %9zu\n", stats.bookkeeping); - FdPrintf(mStdErr, "bin-unused: %9zu\n", stats.bin_unused); - FdPrintf(mStdErr, "quantum-max: %9zu\n", stats.quantum_max); - FdPrintf(mStdErr, "subpage-max: %9zu\n", stats.page_size / 2); - FdPrintf(mStdErr, 
"large-max: %9zu\n", stats.large_max); + FdPrintf(mStdErr, "allocated: %9zu\n", stats.allocated); + FdPrintf(mStdErr, "waste: %9zu\n", stats.waste); + FdPrintf(mStdErr, "dirty: %9zu\n", stats.page_cache); + FdPrintf(mStdErr, "bookkeep: %9zu\n", stats.bookkeeping); + FdPrintf(mStdErr, "bin-unused: %9zu\n", stats.bin_unused); + FdPrintf(mStdErr, "quantum-max: %9zu\n", stats.quantum_max); + FdPrintf(mStdErr, "quantum-wide-max: %9zu\n", stats.quantum_wide_max); + FdPrintf(mStdErr, "subpage-max: %9zu\n", stats.subpage_max); + FdPrintf(mStdErr, "large-max: %9zu\n", stats.large_max); if (mCalculateSlop) { size_t slop = mTotalAllocatedSize - mTotalRequestedSize; FdPrintf(mStdErr, @@ -916,6 +918,9 @@ class Replay { } else if (bin.size <= stats.quantum_max) { // 4 buckets, (4 bytes per bucket with a 16 byte quantum). dist = Distribution(bin.size, last_size, stats.quantum / 4); + } else if (bin.size <= stats.quantum_wide_max) { + // 8 buckets, (32 bytes per bucket with a 256 byte quantum-wide). + dist = Distribution(bin.size, last_size, stats.quantum_wide / 8); } else { // 16 buckets. dist = Distribution(bin.size, last_size, (bin.size - last_size) / 16);