From 87faa9248952da05f1665c6c9bb87284e81bf5cb Mon Sep 17 00:00:00 2001 From: Mike Hommey Date: Tue, 7 Nov 2017 14:36:07 +0900 Subject: [PATCH] Bug 1414168 - Change and move the relaxed calculation rule for small size classes. r=njn First and foremost, the code and corresponding comment weren't in agreement on what's going on. The code checks: RUN_MAX_OVRHD * (bin->mSizeClass << 3) <= RUN_MAX_OVRHD_RELAX which is equivalent to: (bin->mSizeClass << 3) <= RUN_MAX_OVRHD_RELAX / RUN_MAX_OVRHD replacing constants: (bin->mSizeClass << 3) <= 0x1800 / 0x3d The left hand side is just bin->mSizeClass * 8, and the right hand side is about 100, so this can be roughly summarized as: bin->mSizeClass <= 12 The comment says the overhead constraint is relaxed for runs with a per-region overhead greater than RUN_MAX_OVRHD / (mSizeClass << (3+RUN_BFP)). Which, by itself, doesn't make sense, because it translates to 61 / (mSizeClass * 32768), which, even for a size class of 1 would mean less than 0.2%, and this value would be even smaller for bigger classes. The comment would make more sense with RUN_MAX_OVRHD_RELAX, but would still not match what the code was doing. So we change how the relaxed rule works, as per the comment in the new code, and make it happen after the standard run overhead constraint has been checked. --HG-- extra : rebase_source : cec35b5bfec416761fbfbcffdc2b39f0098af849 --- memory/build/mozjemalloc.cpp | 37 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/memory/build/mozjemalloc.cpp b/memory/build/mozjemalloc.cpp index 8de5d5a09bb5..35b3ae3e08e7 100644 --- a/memory/build/mozjemalloc.cpp +++ b/memory/build/mozjemalloc.cpp @@ -514,25 +514,6 @@ static Atomic gRecycledSize; static size_t opt_dirty_max = DIRTY_MAX_DEFAULT; -// RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized -// as small as possible such that this setting is still honored, without -// violating other constraints. 
The goal is to make runs as small as possible -// without exceeding a per run external fragmentation threshold. -// -// We use binary fixed point math for overhead computations, where the binary -// point is implicitly RUN_BFP bits to the left. -// -// Note that it is possible to set RUN_MAX_OVRHD low enough that it cannot be -// honored for some/all object sizes, since there is one bit of header overhead -// per object (plus a constant). This constraint is relaxed (ignored) for runs -// that are so small that the per-region overhead is greater than: -// -// (RUN_MAX_OVRHD / (reg_size << (3+RUN_BFP)) -#define RUN_BFP 12 -// \/ Implicit binary fixed point. -#define RUN_MAX_OVRHD 0x0000003dU -#define RUN_MAX_OVRHD_RELAX 0x00001800U - // Return the smallest chunk multiple that is >= s. #define CHUNK_CEILING(s) (((s) + kChunkSizeMask) & ~kChunkSizeMask) @@ -3019,15 +3000,23 @@ arena_bin_run_size_calc(arena_bin_t* bin, size_t min_run_size) break; } - // This doesn't match the comment above RUN_MAX_OVRHD_RELAX. - if (RUN_MAX_OVRHD * (bin->mSizeClass << 3) <= RUN_MAX_OVRHD_RELAX) { - break; - } - // Try to keep the run overhead below kRunOverhead. if (Fraction(try_reg0_offset, try_run_size) <= arena_bin_t::kRunOverhead) { break; } + + // The run header includes one bit per region of the given size. For sizes + // small enough, the number of regions is large enough that growing the run + // size barely moves the needle for the overhead because of all those bits. + // For example, for a size of 8 bytes, adding 4KiB to the run size adds + // close to 512 bits to the header, which is 64 bytes. + // With such overhead, there is no way to get to the wanted overhead above, + // so we give up if the required size for regs_mask more than doubles the + // size of the run header. + if (try_mask_nelms * sizeof(unsigned) >= sizeof(arena_run_t)) { + break; + } + } MOZ_ASSERT(sizeof(arena_run_t) + (sizeof(unsigned) * (good_mask_nelms - 1)) <=