Tune codegen for rb_yield() calls landing in ISeqs

Unlike with revisions from earlier in the year, GCC 11 no longer inlines
the call to vm_push_frame() inside invoke_iseq_block_from_c(). We do
want it to be inlined since rb_yield() speed is fairly important.
Logs from -fopt-info-optimized-inline reveal that GCC was blowing its
code size budget inlining invoke_block_from_c_bh() into its various
callers, leaving suboptimal code for its body.

Take away some uses of the `inline` keyword and merge a common tail
call to vm_exec() for overall better code.

This tweak gives about an 18% speedup on a micro benchmark and about 1%
on the chunky-png benchmark from yjit-bench. I tested on a Skylake server.

```
$ cat c-to-ruby-call.yml
benchmark:
  - 0.upto(10_000_000) {}

$ benchmark-driver --chruby '+patch;master' c-to-ruby-call.yml
Warming up --------------------------------------
0.upto(10_000_000) {}      2.299 i/s -       3.000 times in 1.304689s (434.90ms/i)
Calculating -------------------------------------
                          +patch      master
0.upto(10_000_000) {}      2.299       1.943 i/s -       6.000 times in 2.609393s 3.088353s

Comparison:
             0.upto(10_000_000) {}
               +patch:         2.3 i/s
               master:         1.9 i/s - 1.18x  slower

$ ruby run_benchmarks.rb --chruby 'master;+patch' chunky-png
<snip>

----------  -----------  ----------  -----------  ----------  --------------  -------------
bench       master (ms)  stddev (%)  +patch (ms)  stddev (%)  +patch 1st itr  master/+patch
chunky-png  1156.1       0.1         1142.2       0.2         1.01            1.01
----------  -----------  ----------  -----------  ----------  --------------  -------------
```
This commit is contained in:
Your Name 2024-08-03 00:53:13 +00:00 коммит произвёл Alan Wu
Родитель e271feb866
Коммит 34715bdd91
2 изменённых файлов: 8 добавлений и 11 удалений

17
vm.c
Просмотреть файл

@ -1509,7 +1509,7 @@ rb_binding_add_dynavars(VALUE bindval, rb_binding_t *bind, int dyncount, const I
/* C -> Ruby: block */
static inline VALUE
static inline void
invoke_block(rb_execution_context_t *ec, const rb_iseq_t *iseq, VALUE self, const struct rb_captured_block *captured, const rb_cref_t *cref, VALUE type, int opt_pc)
{
int arg_size = ISEQ_BODY(iseq)->param.size;
@ -1521,15 +1521,13 @@ invoke_block(rb_execution_context_t *ec, const rb_iseq_t *iseq, VALUE self, cons
ec->cfp->sp + arg_size,
ISEQ_BODY(iseq)->local_table_size - arg_size,
ISEQ_BODY(iseq)->stack_max);
return vm_exec(ec);
}
static VALUE
static inline void
invoke_bmethod(rb_execution_context_t *ec, const rb_iseq_t *iseq, VALUE self, const struct rb_captured_block *captured, const rb_callable_method_entry_t *me, VALUE type, int opt_pc)
{
/* bmethod call from outside the VM */
int arg_size = ISEQ_BODY(iseq)->param.size;
VALUE ret;
VM_ASSERT(me->def->type == VM_METHOD_TYPE_BMETHOD);
@ -1542,9 +1540,6 @@ invoke_bmethod(rb_execution_context_t *ec, const rb_iseq_t *iseq, VALUE self, co
ISEQ_BODY(iseq)->stack_max);
VM_ENV_FLAGS_SET(ec->cfp->ep, VM_FRAME_FLAG_FINISH);
ret = vm_exec(ec);
return ret;
}
ALWAYS_INLINE(static VALUE
@ -1591,14 +1586,16 @@ invoke_iseq_block_from_c(rb_execution_context_t *ec, const struct rb_captured_bl
cfp->sp = sp;
if (me == NULL) {
return invoke_block(ec, iseq, self, captured, cref, type, opt_pc);
invoke_block(ec, iseq, self, captured, cref, type, opt_pc);
}
else {
return invoke_bmethod(ec, iseq, self, captured, me, type, opt_pc);
invoke_bmethod(ec, iseq, self, captured, me, type, opt_pc);
}
return vm_exec(ec);
}
static inline VALUE
static VALUE
invoke_block_from_c_bh(rb_execution_context_t *ec, VALUE block_handler,
int argc, const VALUE *argv,
int kw_splat, VALUE passed_block_handler, const rb_cref_t *cref,

Просмотреть файл

@ -3751,7 +3751,7 @@ vm_method_cfunc_entry(const rb_callable_method_entry_t *me)
return UNALIGNED_MEMBER_PTR(me->def, body.cfunc);
}
static inline VALUE
static VALUE
vm_call_cfunc_with_frame_(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling,
int argc, VALUE *argv, VALUE *stack_bottom)
{