From 158177e39995aa62df781b8ef64e800bf0c8670f Mon Sep 17 00:00:00 2001
From: Alan Wu
Date: Thu, 25 Jul 2024 19:48:28 -0400
Subject: [PATCH] Improve allocation throughput by outlining cache miss code
 path
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, GCC 11 on x86-64 inlined the heavyweight logic for
potentially triggering GC into newobj_alloc(). This slowed down the
hotter code path where the ractor cache hits, degrading allocation
throughput. Outline the logic into a separate function and have it
never inlined. This restores allocation throughput to the same level
as 98eeadc ("Development of 3.4.0 started.").

To evaluate, instrument miniruby so it allocates a bunch of objects and
then exits:

diff --git a/eval.c b/eval.c
--- a/eval.c
+++ b/eval.c
@@ -92,6 +92,15 @@ ruby_setup(void)
     }
     EC_POP_TAG();

+rb_gc_disable();
+rb_execution_context_t *ec = GET_EC();
+long const n = 20000000;
+for (long i = 0; i < n; ++i) {
+    rb_wb_protected_newobj_of(ec, 0, T_OBJECT, 40);
+}
+printf("alloc %ld\n", n);
+exit(0);
+
     return state;
 }

With `3.3-equiv` being 98eeadc, `pre` being f2728c3393d, and `post`
being this commit, I have:

$ hyperfine -L buildtag post,pre,3.3-equiv '/ruby/build-{buildtag}/miniruby'
Benchmark 1: /ruby/build-post/miniruby
  Time (mean ± σ):     873.4 ms ±   2.8 ms    [User: 377.6 ms, System: 490.2 ms]
  Range (min … max):   868.3 ms … 877.8 ms    10 runs

Benchmark 2: /ruby/build-pre/miniruby
  Time (mean ± σ):     960.1 ms ±   2.8 ms    [User: 430.8 ms, System: 523.9 ms]
  Range (min … max):   955.5 ms … 964.2 ms    10 runs

Benchmark 3: /ruby/build-3.3-equiv/miniruby
  Time (mean ± σ):     886.9 ms ±   2.8 ms    [User: 379.5 ms, System: 501.0 ms]
  Range (min … max):   883.0 ms … 890.8 ms    10 runs

Summary
  '/ruby/build-post/miniruby' ran
    1.02 ± 0.00 times faster than '/ruby/build-3.3-equiv/miniruby'
    1.10 ± 0.00 times faster than '/ruby/build-pre/miniruby'

These results are from a Skylake server with GCC 11.
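The general shape of the optimization, as a minimal standalone C sketch
(all names here are hypothetical, not the gc/default.c code; only the
never-inline idea mirrors the patch, whose NOINLINE macro expands to
this attribute under GCC):

/* The hot path stays a load, a compare, and a branch, so it is cheap
 * to inline into callers; the cold refill path is forced out of line
 * so its register pressure and calls do not bloat the fast path. */
#include <stddef.h>
#include <stdio.h>

#define CACHE_LEN 4

struct cache {
    int slots[CACHE_LEN];
    size_t next;
};

/* Cold path: never inlined (GCC/Clang attribute). */
__attribute__((noinline))
static int
cache_miss(struct cache *c)
{
    /* Stand-in for the heavyweight work: locking, GC, page refill. */
    for (size_t i = 0; i < CACHE_LEN; i++) c->slots[i] = (int)i;
    c->next = 1;
    return c->slots[0];
}

/* Hot path: small enough for the compiler to inline everywhere. */
static inline int
cache_alloc(struct cache *c)
{
    if (c->next < CACHE_LEN) return c->slots[c->next++];
    return cache_miss(c);
}

int
main(void)
{
    struct cache c = { .next = CACHE_LEN }; /* empty: first call misses */
    int a = cache_alloc(&c);                /* miss: refill, take slot 0 */
    int b = cache_alloc(&c);                /* hit: served from the cache */
    printf("%d %d\n", a, b);
    return 0;
}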
---
 gc/default.c | 78 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 44 insertions(+), 34 deletions(-)

diff --git a/gc/default.c b/gc/default.c
index 5dd7cc1df7..04a0fd7bb5 100644
--- a/gc/default.c
+++ b/gc/default.c
@@ -2570,53 +2570,63 @@ rb_gc_impl_size_pool_sizes(void *objspace_ptr)
     return size_pool_sizes;
 }

+NOINLINE(static VALUE newobj_cache_miss(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked));
+
 static VALUE
-newobj_alloc(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked)
+newobj_cache_miss(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked)
 {
     rb_size_pool_t *size_pool = &size_pools[size_pool_idx];
     rb_heap_t *heap = SIZE_POOL_EDEN_HEAP(size_pool);
+    VALUE obj = Qfalse;

-    VALUE obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
+    unsigned int lev = 0;
+    bool unlock_vm = false;

-    if (RB_UNLIKELY(obj == Qfalse)) {
-        unsigned int lev = 0;
-        bool unlock_vm = false;
+    if (!vm_locked) {
+        lev = rb_gc_cr_lock();
+        vm_locked = true;
+        unlock_vm = true;
+    }

-        if (!vm_locked) {
-            lev = rb_gc_cr_lock();
-            vm_locked = true;
-            unlock_vm = true;
+    {
+        if (is_incremental_marking(objspace)) {
+            gc_continue(objspace, size_pool, heap);
+            cache->incremental_mark_step_allocated_slots = 0;
+
+            // Retry allocation after resetting incremental_mark_step_allocated_slots
+            obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
         }

-        {
-            if (is_incremental_marking(objspace)) {
-                gc_continue(objspace, size_pool, heap);
-                cache->incremental_mark_step_allocated_slots = 0;
+        if (obj == Qfalse) {
+            // Get next free page (possibly running GC)
+            struct heap_page *page = heap_next_free_page(objspace, size_pool, heap);
+            ractor_cache_set_page(objspace, cache, size_pool_idx, page);

-                // Retry allocation after resetting incremental_mark_step_allocated_slots
-                obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
-            }
-
-            if (obj == Qfalse) {
-                // Get next free page (possibly running GC)
-                struct heap_page *page = heap_next_free_page(objspace, size_pool, heap);
-                ractor_cache_set_page(objspace, cache, size_pool_idx, page);
-
-                // Retry allocation after moving to new page
-                obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
-            }
-        }
-
-        if (unlock_vm) {
-            rb_gc_cr_unlock(lev);
-        }
-
-        if (RB_UNLIKELY(obj == Qfalse)) {
-            rb_memerror();
+            // Retry allocation after moving to new page
+            obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
         }
     }

-    size_pool->total_allocated_objects++;
+    if (unlock_vm) {
+        rb_gc_cr_unlock(lev);
+    }
+
+    if (RB_UNLIKELY(obj == Qfalse)) {
+        rb_memerror();
+    }
+    return obj;
+}
+
+static VALUE
+newobj_alloc(rb_objspace_t *objspace, rb_ractor_newobj_cache_t *cache, size_t size_pool_idx, bool vm_locked)
+{
+    VALUE obj = ractor_cache_allocate_slot(objspace, cache, size_pool_idx);
+
+    if (RB_UNLIKELY(obj == Qfalse)) {
+        obj = newobj_cache_miss(objspace, cache, size_pool_idx, vm_locked);
+    }
+
+    size_pools[size_pool_idx].total_allocated_objects++;

     return obj;
 }
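A note on the lock discipline that newobj_cache_miss() keeps from the
old code: it takes the VM lock only when the caller does not already
hold it, and releases it only if it was the one to take it. A minimal
standalone sketch of that pattern, with a hypothetical pthread mutex
standing in for rb_gc_cr_lock()/rb_gc_cr_unlock():

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t vm_lock = PTHREAD_MUTEX_INITIALIZER;

/* Same shape as newobj_cache_miss(): lock only if the caller has not,
 * remember whether we did, and release only what we acquired. */
static void
slow_path(bool vm_locked)
{
    bool unlock_vm = false;

    if (!vm_locked) {
        pthread_mutex_lock(&vm_lock);   /* stand-in for rb_gc_cr_lock() */
        vm_locked = true;
        unlock_vm = true;
    }

    /* ... heavyweight work done while the lock is held ... */

    if (unlock_vm) {
        pthread_mutex_unlock(&vm_lock); /* stand-in for rb_gc_cr_unlock(lev) */
    }
}

int
main(void)
{
    slow_path(false);                /* lock not held: take and release it */

    pthread_mutex_lock(&vm_lock);
    slow_path(true);                 /* already held: leave it untouched */
    pthread_mutex_unlock(&vm_lock);

    puts("ok");
    return 0;
}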