From 158177e39995aa62df781b8ef64e800bf0c8670f Mon Sep 17 00:00:00 2001
From: Alan Wu
Date: Thu, 25 Jul 2024 19:48:28 -0400
Subject: [PATCH] Improve allocation throughput by outlining cache miss code
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, GCC 11 on x86-64 inlined the heavyweight logic for
potentially triggering GC into newobj_alloc(). This slowed down the
hotter code path where the ractor cache hits, degrading allocation
throughput. Outline the logic into a separate function and have it
never inlined. This restores allocation throughput to the same level
as 98eeadc ("Development of 3.4.0 started.").

To evaluate, instrument miniruby so it allocates a bunch of objects and
then exits:

diff --git a/eval.c b/eval.c
--- a/eval.c
+++ b/eval.c
@@ -92,6 +92,15 @@ ruby_setup(void)
     }
     EC_POP_TAG();

+rb_gc_disable();
+rb_execution_context_t *ec = GET_EC();
+long const n = 20000000;
+for (long i = 0; i < n; ++i) {
+    rb_wb_protected_newobj_of(ec, 0, T_OBJECT, 40);
+}
+printf("alloc %ld\n", n);
+exit(0);
+
     return state;
 }

With `3.3-equiv` being 98eeadc, `pre` being f2728c3393d, and `post`
being this commit, I have:

$ hyperfine -L buildtag post,pre,3.3-equiv '/ruby/build-{buildtag}/miniruby'
Benchmark 1: /ruby/build-post/miniruby
  Time (mean ± σ):     873.4 ms ±   2.8 ms    [User: 377.6 ms, System: 490.2 ms]
  Range (min … max):   868.3 ms … 877.8 ms    10 runs

Benchmark 2: /ruby/build-pre/miniruby
  Time (mean ± σ):     960.1 ms ±   2.8 ms    [User: 430.8 ms, System: 523.9 ms]
  Range (min … max):   955.5 ms … 964.2 ms    10 runs

Benchmark 3: /ruby/build-3.3-equiv/miniruby
  Time (mean ± σ):     886.9 ms ±   2.8 ms    [User: 379.5 ms, System: 501.0 ms]
  Range (min … max):   883.0 ms … 890.8 ms    10 runs

Summary
  '/ruby/build-post/miniruby' ran
    1.02 ± 0.00 times faster than '/ruby/build-3.3-equiv/miniruby'
    1.10 ± 0.00 times faster than '/ruby/build-pre/miniruby'

These results are from a Skylake server with GCC 11.
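The general shape of the optimization, as a minimal standalone C sketch
(all names here are hypothetical, not the gc/default.c code; only the
never-inline idea mirrors the patch, whose NOINLINE macro expands to
this attribute under GCC):

/* The hot path stays a load, a compare, and a branch, so it is cheap
 * to inline into callers; the cold refill path is forced out of line
 * so its register pressure and calls do not bloat the fast path. */
#include <stddef.h>
#include <stdio.h>

#define CACHE_LEN 4

struct cache {
    int slots[CACHE_LEN];
    size_t next;
};

/* Cold path: never inlined (GCC/Clang attribute). */
__attribute__((noinline))
static int
cache_miss(struct cache *c)
{
    /* Stand-in for the heavyweight work: locking, GC, page refill. */
    for (size_t i = 0; i < CACHE_LEN; i++) c->slots[i] = (int)i;
    c->next = 1;
    return c->slots[0];
}

/* Hot path: small enough for the compiler to inline everywhere. */
static inline int
cache_alloc(struct cache *c)
{
    if (c->next < CACHE_LEN) return c->slots[c->next++];
    return cache_miss(c);
}

int
main(void)
{
    struct cache c = { .next = CACHE_LEN }; /* empty: first call misses */
    int a = cache_alloc(&c);                /* miss: refill, take slot 0 */
    int b = cache_alloc(&c);                /* hit: served from the cache */
    printf("%d %d\n", a, b);
    return 0;
}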
---
 gc/default.c | 78 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 34 deletions(-)

diff --git a/gc/default.c b/gc/default.c
index 5dd7cc1df7..04a0fd7bb5 100644
--- a/gc/default.c
+++ b/gc/default.c
@@ -2570,53 +2570,63 @@ rb_gc_impl_size_pool_sizes(void *objspace_ptr)
     return size_pool_sizes;
 }

+NOINLINE(static VALUE newobj_cache_miss(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked));
+
 static VALUE
-newobj_alloc(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked)
+newobj_cache_miss(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked)
 {
     rb_size_pool_t *size_pool = &size_pools[size_pool_idx];
     rb_heap_t *heap = SIZE_POOL_EDEN_HEAP(size_pool);
+    VALUE obj = Qfalse;

-    VALUE obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
+    unsigned int lev = 0;
+    bool unlock_vm = false;

-    if (RB_UNLIKELY(obj == Qfalse)) {
-        unsigned int lev = 0;
-        bool unlock_vm = false;
+    if (!vm_locked) {
+        lev = rb_gc_cr_lock();
+        vm_locked = true;
+        unlock_vm = true;
+    }

-        if (!vm_locked) {
-            lev = rb_gc_cr_lock();
-            vm_locked = true;
-            unlock_vm = true;
+    {
+        if (is_incremental_marking(objspace)) {
+            gc_continue(objspace, size_pool, heap);
+            cache->incremental_mark_step_allocated_slots = 0;
+
+            // Retry allocation after resetting incremental_mark_step_allocated_slots
+            obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
         }

-        {
-            if (is_incremental_marking(objspace)) {
-                gc_continue(objspace, size_pool, heap);
-                cache->incremental_mark_step_allocated_slots = 0;
+        if (obj == Qfalse) {
+            // Get next free page (possibly running GC)
+            struct heap_page *page = heap_next_free_page(objspace, size_pool, heap);
+            ractor_cache_set_page(objspace, cache, size_pool_idx, page);

-                // Retry allocation after resetting incremental_mark_step_allocated_slots
-                obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
-            }
-
-            if (obj == Qfalse) {
-                // Get next free page (possibly running GC)
-                struct heap_page *page = heap_next_free_page(objspace, size_pool, heap);
-                ractor_cache_set_page(objspace, cache, size_pool_idx, page);
-
-                // Retry allocation after moving to new page
-                obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
-            }
-        }
-
-        if (unlock_vm) {
-            rb_gc_cr_unlock(lev);
-        }
-
-        if (RB_UNLIKELY(obj == Qfalse)) {
-            rb_memerror();
+            // Retry allocation after moving to new page
+            obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
         }
     }

-    size_pool->total_allocated_objects++;
+    if (unlock_vm) {
+        rb_gc_cr_unlock(lev);
+    }
+
+    if (RB_UNLIKELY(obj == Qfalse)) {
+        rb_memerror();
+    }
+    return obj;
+}
+
+static VALUE
+newobj_alloc(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked)
+{
+    VALUE obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
+
+    if (RB_UNLIKELY(obj == Qfalse)) {
+        obj = newobj_cache_miss(objspace, cache, size_pool_idx, vm_locked);
+    }
+
+    size_pools[size_pool_idx].total_allocated_objects++;

     return obj;
 }
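A note on the lock discipline that newobj_cache_miss() keeps from the
old code: it takes the VM lock only when the caller does not already
hold it, and releases it only if it was the one to take it. A minimal
standalone sketch of that pattern, with a hypothetical pthread mutex
standing in for rb_gc_cr_lock()/rb_gc_cr_unlock():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t vm_lock = PTHREAD_MUTEX_INITIALIZER;

/* Same shape as newobj_cache_miss(): lock only if the caller has not,
 * remember whether we did, and release only what we acquired. */
static void
slow_path(bool vm_locked)
{
    bool unlock_vm = false;

    if (!vm_locked) {
        pthread_mutex_lock(&vm_lock);   /* stand-in for rb_gc_cr_lock() */
        vm_locked = true;
        unlock_vm = true;
    }

    /* ... heavyweight work done while the lock is held ... */

    if (unlock_vm) {
        pthread_mutex_unlock(&vm_lock); /* stand-in for rb_gc_cr_unlock(lev) */
    }
}

int
main(void)
{
    slow_path(false);                /* lock not held: take and release it */

    pthread_mutex_lock(&vm_lock);
    slow_path(true);                 /* already held: leave it untouched */
    pthread_mutex_unlock(&vm_lock);

    puts("ok");
    return 0;
}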