/**********************************************************************
vm_insnhelper.c - instruction helper functions.
$Author$
Copyright (C) 2007 Koichi Sasada
**********************************************************************/
#include "ruby/internal/config.h"
#include <math.h>
#ifdef HAVE_STDATOMIC_H
#include <stdatomic.h>
#endif
#include "constant.h"
#include "debug_counter.h"
#include "internal.h"
#include "internal/class.h"
#include "internal/compar.h"
#include "internal/hash.h"
#include "internal/numeric.h"
#include "internal/proc.h"
#include "internal/random.h"
#include "internal/variable.h"
#include "internal/struct.h"
#include "variable.h"
/* finish iseq array */
#include "insns.inc"
#include "insns_info.inc"
extern rb_method_definition_t *rb_method_definition_create(rb_method_type_t type, ID mid);
extern void rb_method_definition_set(const rb_method_entry_t *me, rb_method_definition_t *def, void *opts);
extern int rb_method_definition_eq(const rb_method_definition_t *d1, const rb_method_definition_t *d2);
extern VALUE rb_make_no_method_exception(VALUE exc, VALUE format, VALUE obj,
int argc, const VALUE *argv, int priv);
static const struct rb_callcache vm_empty_cc;
static const struct rb_callcache vm_empty_cc_for_super;
/* control stack frame */
static rb_control_frame_t *vm_get_ruby_level_caller_cfp(const rb_execution_context_t *ec, const rb_control_frame_t *cfp);
VALUE
ruby_vm_special_exception_copy(VALUE exc)
{
VALUE e = rb_obj_alloc(rb_class_real(RBASIC_CLASS(exc)));
rb_obj_copy_ivar(e, exc);
return e;
}
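/* Raise the pre-allocated SystemStackError kept in the VM's special
   exceptions. When `setup` is true, the exception is copied and a
   backtrace is attached before jumping out with TAG_RAISE. */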
NORETURN(static void ec_stack_overflow(rb_execution_context_t *ec, int));
static void
ec_stack_overflow(rb_execution_context_t *ec, int setup)
{
VALUE mesg = rb_ec_vm_ptr(ec)->special_exceptions[ruby_error_sysstack];
ec->raised_flag = RAISED_STACKOVERFLOW;
if (setup) {
VALUE at = rb_ec_backtrace_object(ec);
mesg = ruby_vm_special_exception_copy(mesg);
rb_ivar_set(mesg, idBt, at);
rb_ivar_set(mesg, idBt_locations, at);
}
ec->errinfo = mesg;
EC_JUMP_TAG(ec, TAG_RAISE);
}
NORETURN(static void vm_stackoverflow(void));
static void
vm_stackoverflow(void)
{
ec_stack_overflow(GET_EC(), TRUE);
}
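/* Entry point for machine stack overflow handling. A stack overflow
   during GC is reported as a bug (likely a faulty native extension).
   With `crit` set, the pre-allocated fatal variant is raised immediately,
   without copying the exception or building a backtrace. Otherwise a
   backtrace is attached only when an alternate signal stack is available
   (USE_SIGALTSTACK), presumably because building it needs stack to run on. */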
NORETURN(void rb_ec_stack_overflow(rb_execution_context_t *ec, int crit));
void
rb_ec_stack_overflow(rb_execution_context_t *ec, int crit)
{
if (rb_during_gc()) {
rb_bug("system stack overflow during GC. Faulty native extension?");
}
if (crit) {
ec->raised_flag = RAISED_STACKOVERFLOW;
ec->errinfo = rb_ec_vm_ptr(ec)->special_exceptions[ruby_error_stackfatal];
EC_JUMP_TAG(ec, TAG_RAISE);
}
#ifdef USE_SIGALTSTACK
ec_stack_overflow(ec, TRUE);
#else
ec_stack_overflow(ec, FALSE);
#endif
}
static inline void stack_check(rb_execution_context_t *ec);
#if VM_CHECK_MODE > 0
static int
callable_class_p(VALUE klass)
{
#if VM_CHECK_MODE >= 2
if (!klass) return FALSE;
switch (RB_BUILTIN_TYPE(klass)) {
default:
break;
case T_ICLASS:
if (!RB_TYPE_P(RCLASS_SUPER(klass), T_MODULE)) break;
case T_MODULE:
return TRUE;
}
while (klass) {
if (klass == rb_cBasicObject) {
return TRUE;
}
klass = RCLASS_SUPER(klass);
}
return FALSE;
#else
return klass != 0;
#endif
}
static int
callable_method_entry_p(const rb_callable_method_entry_t *cme)
{
if (cme == NULL) {
return TRUE;
}
else {
VM_ASSERT(IMEMO_TYPE_P((VALUE)cme, imemo_ment));
if (callable_class_p(cme->defined_class)) {
return TRUE;
}
else {
return FALSE;
}
}
}
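/* VM_CHECK_MODE consistency checks for vm_push_frame(): depending on the
   frame magic, `specval` must (or must not) be a block handler, and
   `cref_or_me` must be a method entry, a CREF, or Qfalse. See the CHECK()
   table in vm_check_frame() below. */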
static void
vm_check_frame_detail(VALUE type, int req_block, int req_me, int req_cref, VALUE specval, VALUE cref_or_me, int is_cframe, const rb_iseq_t *iseq)
{
unsigned int magic = (unsigned int)(type & VM_FRAME_MAGIC_MASK);
enum imemo_type cref_or_me_type = imemo_env; /* impossible value */
if (RB_TYPE_P(cref_or_me, T_IMEMO)) {
cref_or_me_type = imemo_type(cref_or_me);
}
if (type & VM_FRAME_FLAG_BMETHOD) {
req_me = TRUE;
}
if (req_block && (type & VM_ENV_FLAG_LOCAL) == 0) {
rb_bug("vm_push_frame: specval (%p) should be a block_ptr on %x frame", (void *)specval, magic);
}
if (!req_block && (type & VM_ENV_FLAG_LOCAL) != 0) {
rb_bug("vm_push_frame: specval (%p) should not be a block_ptr on %x frame", (void *)specval, magic);
}
if (req_me) {
if (cref_or_me_type != imemo_ment) {
rb_bug("vm_push_frame: (%s) should be method entry on %x frame", rb_obj_info(cref_or_me), magic);
}
}
else {
if (req_cref && cref_or_me_type != imemo_cref) {
rb_bug("vm_push_frame: (%s) should be CREF on %x frame", rb_obj_info(cref_or_me), magic);
}
else { /* cref or Qfalse */
if (cref_or_me != Qfalse && cref_or_me_type != imemo_cref) {
if (((type & VM_FRAME_FLAG_LAMBDA) || magic == VM_FRAME_MAGIC_IFUNC) && (cref_or_me_type == imemo_ment)) {
/* ignore */
}
else {
rb_bug("vm_push_frame: (%s) should be false or cref on %x frame", rb_obj_info(cref_or_me), magic);
}
}
}
}
if (cref_or_me_type == imemo_ment) {
const rb_callable_method_entry_t *me = (const rb_callable_method_entry_t *)cref_or_me;
if (!callable_method_entry_p(me)) {
rb_bug("vm_push_frame: ment (%s) should be callable on %x frame.", rb_obj_info(cref_or_me), magic);
}
}
if ((type & VM_FRAME_MAGIC_MASK) == VM_FRAME_MAGIC_DUMMY) {
VM_ASSERT(iseq == NULL ||
RBASIC_CLASS((VALUE)iseq) == 0 || // dummy frame for loading
RUBY_VM_NORMAL_ISEQ_P(iseq) //argument error
);
}
else {
VM_ASSERT(is_cframe == !RUBY_VM_NORMAL_ISEQ_P(iseq));
}
}
static void
vm_check_frame(VALUE type,
VALUE specval,
VALUE cref_or_me,
const rb_iseq_t *iseq)
{
VALUE given_magic = type & VM_FRAME_MAGIC_MASK;
VM_ASSERT(FIXNUM_P(type));
#define CHECK(magic, req_block, req_me, req_cref, is_cframe) \
case magic: \
vm_check_frame_detail(type, req_block, req_me, req_cref, \
specval, cref_or_me, is_cframe, iseq); \
break
switch (given_magic) {
/* BLK ME CREF CFRAME */
CHECK(VM_FRAME_MAGIC_METHOD, TRUE, TRUE, FALSE, FALSE);
CHECK(VM_FRAME_MAGIC_CLASS, TRUE, FALSE, TRUE, FALSE);
CHECK(VM_FRAME_MAGIC_TOP, TRUE, FALSE, TRUE, FALSE);
CHECK(VM_FRAME_MAGIC_CFUNC, TRUE, TRUE, FALSE, TRUE);
CHECK(VM_FRAME_MAGIC_BLOCK, FALSE, FALSE, FALSE, FALSE);
CHECK(VM_FRAME_MAGIC_IFUNC, FALSE, FALSE, FALSE, TRUE);
CHECK(VM_FRAME_MAGIC_EVAL, FALSE, FALSE, FALSE, FALSE);
CHECK(VM_FRAME_MAGIC_RESCUE, FALSE, FALSE, FALSE, FALSE);
CHECK(VM_FRAME_MAGIC_DUMMY, TRUE, FALSE, FALSE, FALSE);
default:
rb_bug("vm_push_frame: unknown type (%x)", (unsigned int)given_magic);
}
#undef CHECK
}
static VALUE vm_stack_canary; /* Initialized later */
static bool vm_stack_canary_was_born = false;
// Return the index of the instruction right before the given PC.
// This is needed because insn_entry advances PC before the insn body.
static unsigned int
previous_insn_index(const rb_iseq_t *iseq, const VALUE *pc)
{
unsigned int pos = 0;
while (pos < ISEQ_BODY(iseq)->iseq_size) {
int opcode = rb_vm_insn_addr2opcode((void *)ISEQ_BODY(iseq)->iseq_encoded[pos]);
unsigned int next_pos = pos + insn_len(opcode);
if (ISEQ_BODY(iseq)->iseq_encoded + next_pos == pc) {
return pos;
}
pos = next_pos;
}
rb_bug("failed to find the previous insn");
}
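/* Called from vm_push_frame(). If sp[0] still holds the stack canary that
   was placed above the stack top to verify leafness, the instruction that
   set it is in fact pushing a frame (i.e. calling a method), so dump the
   offending iseq/instruction and abort. */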
void
rb_vm_check_canary(const rb_execution_context_t *ec, VALUE *sp)
{
const struct rb_control_frame_struct *reg_cfp = ec->cfp;
const struct rb_iseq_struct *iseq;
if (! LIKELY(vm_stack_canary_was_born)) {
return; /* :FIXME: isn't it rather fatal to enter this branch? */
}
else if ((VALUE *)reg_cfp == ec->vm_stack + ec->vm_stack_size) {
/* This is at the very beginning of a thread. cfp does not exist. */
return;
}
else if (! (iseq = GET_ISEQ())) {
return;
}
else if (LIKELY(sp[0] != vm_stack_canary)) {
return;
}
else {
/* we are going to call methods below; squash the canary to
* prevent infinite loop. */
sp[0] = Qundef;
}
const VALUE *orig = rb_iseq_original_iseq(iseq);
const VALUE iseqw = rb_iseqw_new(iseq);
const VALUE inspection = rb_inspect(iseqw);
const char *stri = rb_str_to_cstr(inspection);
const VALUE disasm = rb_iseq_disasm(iseq);
const char *strd = rb_str_to_cstr(disasm);
const ptrdiff_t pos = previous_insn_index(iseq, GET_PC());
const enum ruby_vminsn_type insn = (enum ruby_vminsn_type)orig[pos];
const char *name = insn_name(insn);
/* rb_bug() is not capable of outputting contents this large. It
   is designed to run from a SIGSEGV handler, which tends to be
   very restricted. */
ruby_debug_printf(
"We are killing the stack canary set by %s, "
"at %s@pc=%"PRIdPTR"\n"
"watch out the C stack trace.\n"
"%s",
name, stri, pos, strd);
rb_bug("see above.");
}
#define vm_check_canary(ec, sp) rb_vm_check_canary(ec, sp)
#else
#define vm_check_canary(ec, sp)
#define vm_check_frame(a, b, c, d)
#endif /* VM_CHECK_MODE > 0 */
#if USE_DEBUG_COUNTER
static void
vm_push_frame_debug_counter_inc(
const struct rb_execution_context_struct *ec,
const struct rb_control_frame_struct *reg_cfp,
VALUE type)
{
const struct rb_control_frame_struct *prev_cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(reg_cfp);
RB_DEBUG_COUNTER_INC(frame_push);
if (RUBY_VM_END_CONTROL_FRAME(ec) != prev_cfp) {
const bool curr = VM_FRAME_RUBYFRAME_P(reg_cfp);
const bool prev = VM_FRAME_RUBYFRAME_P(prev_cfp);
if (prev) {
if (curr) {
RB_DEBUG_COUNTER_INC(frame_R2R);
}
else {
RB_DEBUG_COUNTER_INC(frame_R2C);
}
}
else {
if (curr) {
RB_DEBUG_COUNTER_INC(frame_C2R);
}
else {
RB_DEBUG_COUNTER_INC(frame_C2C);
}
}
}
switch (type & VM_FRAME_MAGIC_MASK) {
case VM_FRAME_MAGIC_METHOD: RB_DEBUG_COUNTER_INC(frame_push_method); return;
case VM_FRAME_MAGIC_BLOCK: RB_DEBUG_COUNTER_INC(frame_push_block); return;
case VM_FRAME_MAGIC_CLASS: RB_DEBUG_COUNTER_INC(frame_push_class); return;
case VM_FRAME_MAGIC_TOP: RB_DEBUG_COUNTER_INC(frame_push_top); return;
case VM_FRAME_MAGIC_CFUNC: RB_DEBUG_COUNTER_INC(frame_push_cfunc); return;
case VM_FRAME_MAGIC_IFUNC: RB_DEBUG_COUNTER_INC(frame_push_ifunc); return;
case VM_FRAME_MAGIC_EVAL: RB_DEBUG_COUNTER_INC(frame_push_eval); return;
case VM_FRAME_MAGIC_RESCUE: RB_DEBUG_COUNTER_INC(frame_push_rescue); return;
case VM_FRAME_MAGIC_DUMMY: RB_DEBUG_COUNTER_INC(frame_push_dummy); return;
}
rb_bug("unreachable");
}
#else
#define vm_push_frame_debug_counter_inc(ec, cfp, t) /* void */
#endif
// Return a poison value to be set above the stack top to verify leafness.
VALUE
rb_vm_stack_canary(void)
{
#if VM_CHECK_MODE > 0
return vm_stack_canary;
#else
return 0;
#endif
}
STATIC_ASSERT(VM_ENV_DATA_INDEX_ME_CREF, VM_ENV_DATA_INDEX_ME_CREF == -2);
STATIC_ASSERT(VM_ENV_DATA_INDEX_SPECVAL, VM_ENV_DATA_INDEX_SPECVAL == -1);
STATIC_ASSERT(VM_ENV_DATA_INDEX_FLAGS, VM_ENV_DATA_INDEX_FLAGS == -0);
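/* Push a new control frame. The value stack above `sp` is laid out as:

     sp[0] .. sp[local_size-1]   locals, initialized to Qnil
     ep[-2] (ME_CREF slot)       cref_or_me
     ep[-1] (SPECVAL slot)       block handler or previous ep
     ep[ 0] (FLAGS slot)         frame type and env flags

   The new cfp is fully initialized before ec->cfp is updated (with a
   compiler fence in between, see below) so that signal-based profilers
   sampling ec->cfp never observe a partially initialized frame. */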
static void
vm_push_frame(rb_execution_context_t *ec,
const rb_iseq_t *iseq,
VALUE type,
VALUE self,
VALUE specval,
VALUE cref_or_me,
const VALUE *pc,
VALUE *sp,
int local_size,
int stack_max)
{
rb_control_frame_t *const cfp = RUBY_VM_NEXT_CONTROL_FRAME(ec->cfp);
vm_check_frame(type, specval, cref_or_me, iseq);
VM_ASSERT(local_size >= 0);
/* check stack overflow */
CHECK_VM_STACK_OVERFLOW0(cfp, sp, local_size + stack_max);
vm_check_canary(ec, sp);
/* setup vm value stack */
/* initialize local variables */
for (int i=0; i < local_size; i++) {
*sp++ = Qnil;
}
/* setup ep with managing data */
*sp++ = cref_or_me; /* ep[-2] / Qnil or T_IMEMO(cref) or T_IMEMO(ment) */
*sp++ = specval /* ep[-1] / block handler or prev env ptr */;
*sp++ = type; /* ep[-0] / ENV_FLAGS */
/* setup new frame */
*cfp = (const struct rb_control_frame_struct) {
.pc = pc,
.sp = sp,
.iseq = iseq,
.self = self,
.ep = sp - 1,
.block_code = NULL,
#if VM_DEBUG_BP_CHECK
.bp_check = sp,
#endif
.jit_return = NULL
};
/* Ensure the initialization of `*cfp` above never gets reordered with the update of `ec->cfp` below.
This is a no-op in all cases we've looked at (https://godbolt.org/z/3oxd1446K), but should guarantee it for all
future/untested compilers/platforms. */
#if defined HAVE_DECL_ATOMIC_SIGNAL_FENCE && HAVE_DECL_ATOMIC_SIGNAL_FENCE
atomic_signal_fence(memory_order_seq_cst);
#endif
ec->cfp = cfp;
if (VMDEBUG == 2) {
SDR();
}
vm_push_frame_debug_counter_inc(ec, cfp, type);
}
void
rb_vm_pop_frame_no_int(rb_execution_context_t *ec)
{
rb_control_frame_t *cfp = ec->cfp;
if (VMDEBUG == 2) SDR();
ec->cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
}
/* return TRUE if the frame is finished */
static inline int
vm_pop_frame(rb_execution_context_t *ec, rb_control_frame_t *cfp, const VALUE *ep)
{
VALUE flags = ep[VM_ENV_DATA_INDEX_FLAGS];
if (VMDEBUG == 2) SDR();
RUBY_VM_CHECK_INTS(ec);
ec->cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
return flags & VM_FRAME_FLAG_FINISH;
}
void
rb_vm_pop_frame(rb_execution_context_t *ec)
{
vm_pop_frame(ec, ec->cfp, ec->cfp->ep);
}
// Push a pseudo frame whose file name is fname, backed by a dummy iseq.
VALUE
rb_vm_push_frame_fname(rb_execution_context_t *ec, VALUE fname)
{
VALUE tmpbuf = rb_imemo_tmpbuf_auto_free_pointer();
void *ptr = ruby_xcalloc(sizeof(struct rb_iseq_constant_body) + sizeof(struct rb_iseq_struct), 1);
rb_imemo_tmpbuf_set_ptr(tmpbuf, ptr);
struct rb_iseq_struct *dmy_iseq = (struct rb_iseq_struct *)ptr;
struct rb_iseq_constant_body *dmy_body = (struct rb_iseq_constant_body *)&dmy_iseq[1];
dmy_iseq->body = dmy_body;
dmy_body->type = ISEQ_TYPE_TOP;
dmy_body->location.pathobj = fname;
vm_push_frame(ec,
dmy_iseq, //const rb_iseq_t *iseq,
VM_FRAME_MAGIC_DUMMY | VM_ENV_FLAG_LOCAL | VM_FRAME_FLAG_FINISH, // VALUE type,
ec->cfp->self, // VALUE self,
VM_BLOCK_HANDLER_NONE, // VALUE specval,
Qfalse, // VALUE cref_or_me,
NULL, // const VALUE *pc,
ec->cfp->sp, // VALUE *sp,
0, // int local_size,
0); // int stack_max
return tmpbuf;
}
/* method dispatch */
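/* Build the ArgumentError for a wrong number of arguments, e.g.
     "wrong number of arguments (given 1, expected 2)"     when min == max
     "wrong number of arguments (given 1, expected 2+)"    when max == UNLIMITED_ARGUMENTS
     "wrong number of arguments (given 1, expected 2..3)"  otherwise */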
static inline VALUE
rb_arity_error_new(int argc, int min, int max)
{
VALUE err_mess = rb_sprintf("wrong number of arguments (given %d, expected %d", argc, min);
if (min == max) {
/* max is not needed */
}
else if (max == UNLIMITED_ARGUMENTS) {
rb_str_cat_cstr(err_mess, "+");
}
else {
rb_str_catf(err_mess, "..%d", max);
}
rb_str_cat_cstr(err_mess, ")");
return rb_exc_new3(rb_eArgError, err_mess);
}
void
rb_error_arity(int argc, int min, int max)
{
rb_exc_raise(rb_arity_error_new(argc, min, max));
}
/* lvar */
NOINLINE(static void vm_env_write_slowpath(const VALUE *ep, int index, VALUE v));
static void
vm_env_write_slowpath(const VALUE *ep, int index, VALUE v)
{
/* forcibly remember the env value */
rb_gc_writebarrier_remember(VM_ENV_ENVVAL(ep));
VM_FORCE_WRITE(&ep[index], v);
VM_ENV_FLAGS_UNSET(ep, VM_ENV_FLAG_WB_REQUIRED);
RB_DEBUG_COUNTER_INC(lvar_set_slowpath);
}
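/* Write `v` into ep[index]. When VM_ENV_FLAG_WB_REQUIRED is set on the env
   flags (set once an env has escaped to the heap), the slow path above asks
   the GC to remember the env object before the write so that no write
   barrier is missed. Otherwise a plain stack write suffices. */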
// YJIT assumes this function never runs GC
static inline void
vm_env_write(const VALUE *ep, int index, VALUE v)
{
VALUE flags = ep[VM_ENV_DATA_INDEX_FLAGS];
if (LIKELY((flags & VM_ENV_FLAG_WB_REQUIRED) == 0)) {
VM_STACK_ENV_WRITE(ep, index, v);
}
else {
vm_env_write_slowpath(ep, index, v);
}
}
void
rb_vm_env_write(const VALUE *ep, int index, VALUE v)
{
vm_env_write(ep, index, v);
}
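/* Convert a block handler into a value suitable for a block argument:
   Qnil when there is no block, a new Proc for captured iseq/ifunc blocks,
   Symbol#to_proc for symbol handlers, and the Proc itself for proc
   handlers. */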
VALUE
rb_vm_bh_to_procval(const rb_execution_context_t *ec, VALUE block_handler)
{
if (block_handler == VM_BLOCK_HANDLER_NONE) {
return Qnil;
}
else {
switch (vm_block_handler_type(block_handler)) {
case block_handler_type_iseq:
case block_handler_type_ifunc:
return rb_vm_make_proc(ec, VM_BH_TO_CAPT_BLOCK(block_handler), rb_cProc);
case block_handler_type_symbol:
return rb_sym_to_proc(VM_BH_TO_SYMBOL(block_handler));
case block_handler_type_proc:
return VM_BH_TO_PROC(block_handler);
default:
VM_UNREACHABLE(rb_vm_bh_to_procval);
}
}
}
/* svar */
#if VM_CHECK_MODE > 0
static int
vm_svar_valid_p(VALUE svar)
{
if (RB_TYPE_P((VALUE)svar, T_IMEMO)) {
switch (imemo_type(svar)) {
case imemo_svar:
case imemo_cref:
case imemo_ment:
return TRUE;
default:
break;
}
}
rb_bug("vm_svar_valid_p: unknown type: %s", rb_obj_info(svar));
return FALSE;
}
#endif
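/* Special variables ($~, $_ and friends) live in an imemo_svar stored in
   the local EP's ME_CREF slot, or in ec->root_svar for the root frame. */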
static inline struct vm_svar *
lep_svar(const rb_execution_context_t *ec, const VALUE *lep)
{
VALUE svar;
if (lep && (ec == NULL || ec->root_lep != lep)) {
svar = lep[VM_ENV_DATA_INDEX_ME_CREF];
}
else {
svar = ec->root_svar;
}
VM_ASSERT(svar == Qfalse || vm_svar_valid_p(svar));
return (struct vm_svar *)svar;
}
static inline void
lep_svar_write(const rb_execution_context_t *ec, const VALUE *lep, const struct vm_svar *svar)
{
VM_ASSERT(vm_svar_valid_p((VALUE)svar));
if (lep && (ec == NULL || ec->root_lep != lep)) {
vm_env_write(lep, VM_ENV_DATA_INDEX_ME_CREF, (VALUE)svar);
}
else {
RB_OBJ_WRITE(rb_ec_thread_ptr(ec)->self, &ec->root_svar, svar);
}
}
static VALUE
lep_svar_get(const rb_execution_context_t *ec, const VALUE *lep, rb_num_t key)
{
const struct vm_svar *svar = lep_svar(ec, lep);
if ((VALUE)svar == Qfalse || imemo_type((VALUE)svar) != imemo_svar) return Qnil;
switch (key) {
case VM_SVAR_LASTLINE:
return svar->lastline;
case VM_SVAR_BACKREF:
return svar->backref;
default: {
const VALUE ary = svar->others;
if (NIL_P(ary)) {
return Qnil;
}
else {
return rb_ary_entry(ary, key - VM_SVAR_EXTRA_START);
}
}
}
}
static struct vm_svar *
svar_new(VALUE obj)
{
struct vm_svar *svar = IMEMO_NEW(struct vm_svar, imemo_svar, obj);
*((VALUE *)&svar->lastline) = Qnil;
*((VALUE *)&svar->backref) = Qnil;
*((VALUE *)&svar->others) = Qnil;
return svar;
}
static void
lep_svar_set(const rb_execution_context_t *ec, const VALUE *lep, rb_num_t key, VALUE val)
{
struct vm_svar *svar = lep_svar(ec, lep);
if ((VALUE)svar == Qfalse || imemo_type((VALUE)svar) != imemo_svar) {
lep_svar_write(ec, lep, svar = svar_new((VALUE)svar));
}
switch (key) {
case VM_SVAR_LASTLINE:
RB_OBJ_WRITE(svar, &svar->lastline, val);
return;
case VM_SVAR_BACKREF:
RB_OBJ_WRITE(svar, &svar->backref, val);
return;
default: {
VALUE ary = svar->others;
if (NIL_P(ary)) {
RB_OBJ_WRITE(svar, &svar->others, ary = rb_ary_new());
}
rb_ary_store(ary, key - VM_SVAR_EXTRA_START, val);
}
}
}
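/* vm_getspecial() decodes the operands used by the getspecial instruction
 * (summary inferred from the switch below):
 *   type == 0    : read the svar slot selected by key ($_, $~, ...)
 *   type & 0x01  : (type >> 1) is a character naming a back-reference:
 *                  '&' -> $&, '`' -> $`, '\'' -> $', '+' -> $+
 *   otherwise    : (type >> 1) is n, the numbered capture group $n
 */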
static inline VALUE
vm_getspecial(const rb_execution_context_t *ec, const VALUE *lep, rb_num_t key, rb_num_t type)
{
VALUE val;
if (type == 0) {
val = lep_svar_get(ec, lep, key);
}
else {
VALUE backref = lep_svar_get(ec, lep, VM_SVAR_BACKREF);
if (type & 0x01) {
switch (type >> 1) {
case '&':
val = rb_reg_last_match(backref);
break;
case '`':
val = rb_reg_match_pre(backref);
break;
case '\'':
val = rb_reg_match_post(backref);
break;
case '+':
val = rb_reg_match_last(backref);
break;
default:
rb_bug("unexpected back-ref");
}
}
else {
val = rb_reg_nth_match((int)(type >> 1), backref);
}
}
return val;
}
static inline VALUE
vm_backref_defined(const rb_execution_context_t *ec, const VALUE *lep, rb_num_t type)
{
VALUE backref = lep_svar_get(ec, lep, VM_SVAR_BACKREF);
int nth = 0;
if (type & 0x01) {
switch (type >> 1) {
case '&':
case '`':
case '\'':
break;
case '+':
return rb_reg_last_defined(backref);
default:
rb_bug("unexpected back-ref");
}
}
else {
nth = (int)(type >> 1);
}
return rb_reg_nth_defined(nth, backref);
}
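/* Each env records a method entry, a cref, or an svar wrapping one of them
 * in ep[VM_ENV_DATA_INDEX_ME_CREF].  The helpers below walk VM_ENV_PREV_EP()
 * up to the local (method-level) env to recover the callable method entry or
 * cref for a frame.
 */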
PUREFUNC(static rb_callable_method_entry_t *check_method_entry(VALUE obj, int can_be_svar));
static rb_callable_method_entry_t *
check_method_entry(VALUE obj, int can_be_svar)
{
if (obj == Qfalse) return NULL;
#if VM_CHECK_MODE > 0
if (!RB_TYPE_P(obj, T_IMEMO)) rb_bug("check_method_entry: unknown type: %s", rb_obj_info(obj));
#endif
switch (imemo_type(obj)) {
case imemo_ment:
return (rb_callable_method_entry_t *)obj;
case imemo_cref:
return NULL;
case imemo_svar:
if (can_be_svar) {
return check_method_entry(((struct vm_svar *)obj)->cref_or_me, FALSE);
}
default:
#if VM_CHECK_MODE > 0
rb_bug("check_method_entry: svar should not be there:");
#endif
return NULL;
}
}
const rb_callable_method_entry_t *
rb_vm_frame_method_entry(const rb_control_frame_t *cfp)
{
const VALUE *ep = cfp->ep;
rb_callable_method_entry_t *me;
while (!VM_ENV_LOCAL_P(ep)) {
if ((me = check_method_entry(ep[VM_ENV_DATA_INDEX_ME_CREF], FALSE)) != NULL) return me;
ep = VM_ENV_PREV_EP(ep);
}
return check_method_entry(ep[VM_ENV_DATA_INDEX_ME_CREF], TRUE);
}
static const rb_iseq_t *
method_entry_iseqptr(const rb_callable_method_entry_t *me)
{
switch (me->def->type) {
case VM_METHOD_TYPE_ISEQ:
return me->def->body.iseq.iseqptr;
default:
return NULL;
}
}
static rb_cref_t *
method_entry_cref(const rb_callable_method_entry_t *me)
{
switch (me->def->type) {
case VM_METHOD_TYPE_ISEQ:
return me->def->body.iseq.cref;
default:
return NULL;
}
}
#if VM_CHECK_MODE == 0
PUREFUNC(static rb_cref_t *check_cref(VALUE, int));
#endif
static rb_cref_t *
check_cref(VALUE obj, int can_be_svar)
{
if (obj == Qfalse) return NULL;
#if VM_CHECK_MODE > 0
if (!RB_TYPE_P(obj, T_IMEMO)) rb_bug("check_cref: unknown type: %s", rb_obj_info(obj));
#endif
switch (imemo_type(obj)) {
case imemo_ment:
return method_entry_cref((rb_callable_method_entry_t *)obj);
case imemo_cref:
return (rb_cref_t *)obj;
case imemo_svar:
if (can_be_svar) {
return check_cref(((struct vm_svar *)obj)->cref_or_me, FALSE);
}
default:
#if VM_CHECK_MODE > 0
rb_bug("check_method_entry: svar should not be there:");
#endif
return NULL;
}
}
static inline rb_cref_t *
vm_env_cref(const VALUE *ep)
{
rb_cref_t *cref;
while (!VM_ENV_LOCAL_P(ep)) {
if ((cref = check_cref(ep[VM_ENV_DATA_INDEX_ME_CREF], FALSE)) != NULL) return cref;
ep = VM_ENV_PREV_EP(ep);
}
return check_cref(ep[VM_ENV_DATA_INDEX_ME_CREF], TRUE);
}
static int
is_cref(const VALUE v, int can_be_svar)
{
if (RB_TYPE_P(v, T_IMEMO)) {
switch (imemo_type(v)) {
case imemo_cref:
return TRUE;
case imemo_svar:
if (can_be_svar) return is_cref(((struct vm_svar *)v)->cref_or_me, FALSE);
default:
break;
}
}
return FALSE;
}
static int
vm_env_cref_by_cref(const VALUE *ep)
{
while (!VM_ENV_LOCAL_P(ep)) {
if (is_cref(ep[VM_ENV_DATA_INDEX_ME_CREF], FALSE)) return TRUE;
ep = VM_ENV_PREV_EP(ep);
}
return is_cref(ep[VM_ENV_DATA_INDEX_ME_CREF], TRUE);
}
static rb_cref_t *
cref_replace_with_duplicated_cref_each_frame(const VALUE *vptr, int can_be_svar, VALUE parent)
{
const VALUE v = *vptr;
rb_cref_t *cref, *new_cref;
if (RB_TYPE_P(v, T_IMEMO)) {
switch (imemo_type(v)) {
case imemo_cref:
cref = (rb_cref_t *)v;
new_cref = vm_cref_dup(cref);
if (parent) {
RB_OBJ_WRITE(parent, vptr, new_cref);
}
else {
VM_FORCE_WRITE(vptr, (VALUE)new_cref);
}
return (rb_cref_t *)new_cref;
case imemo_svar:
if (can_be_svar) {
return cref_replace_with_duplicated_cref_each_frame(&((struct vm_svar *)v)->cref_or_me, FALSE, v);
}
/* fall through */
case imemo_ment:
rb_bug("cref_replace_with_duplicated_cref_each_frame: unreachable");
default:
break;
}
}
return NULL;
}
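/* vm_cref_replace_with_duplicated_cref() walks the env chain, duplicates the
 * first cref it finds and writes the copy back in place (using RB_OBJ_WRITE
 * when the env has escaped to the heap, VM_FORCE_WRITE otherwise), so the
 * caller can mutate the copy without touching crefs shared by other frames.
 * (Rough summary of the two functions around this comment.)
 */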
static rb_cref_t *
vm_cref_replace_with_duplicated_cref(const VALUE *ep)
{
if (vm_env_cref_by_cref(ep)) {
rb_cref_t *cref;
VALUE envval;
while (!VM_ENV_LOCAL_P(ep)) {
envval = VM_ENV_ESCAPED_P(ep) ? VM_ENV_ENVVAL(ep) : Qfalse;
if ((cref = cref_replace_with_duplicated_cref_each_frame(&ep[VM_ENV_DATA_INDEX_ME_CREF], FALSE, envval)) != NULL) {
return cref;
}
ep = VM_ENV_PREV_EP(ep);
}
envval = VM_ENV_ESCAPED_P(ep) ? VM_ENV_ENVVAL(ep) : Qfalse;
return cref_replace_with_duplicated_cref_each_frame(&ep[VM_ENV_DATA_INDEX_ME_CREF], TRUE, envval);
}
else {
rb_bug("vm_cref_dup: unreachable");
}
}
static rb_cref_t *
vm_get_cref(const VALUE *ep)
{
rb_cref_t *cref = vm_env_cref(ep);
if (cref != NULL) {
return cref;
}
else {
rb_bug("vm_get_cref: unreachable");
}
}
rb_cref_t *
rb_vm_get_cref(const VALUE *ep)
{
return vm_get_cref(ep);
}
static rb_cref_t *
vm_ec_cref(const rb_execution_context_t *ec)
{
const rb_control_frame_t *cfp = rb_vm_get_ruby_level_next_cfp(ec, ec->cfp);
if (cfp == NULL) {
return NULL;
}
return vm_get_cref(cfp->ep);
}
static const rb_cref_t *
vm_get_const_key_cref(const VALUE *ep)
{
const rb_cref_t *cref = vm_get_cref(ep);
const rb_cref_t *key_cref = cref;
while (cref) {
if (RCLASS_SINGLETON_P(CREF_CLASS(cref)) ||
RCLASS_EXT(CREF_CLASS(cref))->cloned) {
return key_cref;
}
cref = CREF_NEXT(cref);
}
/* does not include singleton class */
return NULL;
}
void
rb_vm_rewrite_cref(rb_cref_t *cref, VALUE old_klass, VALUE new_klass, rb_cref_t **new_cref_ptr)
{
rb_cref_t *new_cref;
while (cref) {
if (CREF_CLASS(cref) == old_klass) {
new_cref = vm_cref_new_use_prev(new_klass, METHOD_VISI_UNDEF, FALSE, cref, FALSE);
*new_cref_ptr = new_cref;
return;
}
new_cref = vm_cref_new_use_prev(CREF_CLASS(cref), METHOD_VISI_UNDEF, FALSE, cref, FALSE);
cref = CREF_NEXT(cref);
*new_cref_ptr = new_cref;
new_cref_ptr = &new_cref->next;
}
*new_cref_ptr = NULL;
}
static rb_cref_t *
vm_cref_push(const rb_execution_context_t *ec, VALUE klass, const VALUE *ep, int pushed_by_eval, int singleton)
{
rb_cref_t *prev_cref = NULL;
if (ep) {
prev_cref = vm_env_cref(ep);
}
else {
rb_control_frame_t *cfp = vm_get_ruby_level_caller_cfp(ec, ec->cfp);
if (cfp) {
prev_cref = vm_env_cref(cfp->ep);
}
}
return vm_cref_new(klass, METHOD_VISI_PUBLIC, FALSE, prev_cref, pushed_by_eval, singleton);
}
static inline VALUE
vm_get_cbase(const VALUE *ep)
{
const rb_cref_t *cref = vm_get_cref(ep);
return CREF_CLASS_FOR_DEFINITION(cref);
}
static inline VALUE
vm_get_const_base(const VALUE *ep)
{
const rb_cref_t *cref = vm_get_cref(ep);
while (cref) {
if (!CREF_PUSHED_BY_EVAL(cref)) {
return CREF_CLASS_FOR_DEFINITION(cref);
}
cref = CREF_NEXT(cref);
}
return Qundef;
}
static inline void
vm_check_if_namespace(VALUE klass)
{
if (!RB_TYPE_P(klass, T_CLASS) && !RB_TYPE_P(klass, T_MODULE)) {
rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a class/module", klass);
}
}
static inline void
vm_ensure_not_refinement_module(VALUE self)
{
if (RB_TYPE_P(self, T_MODULE) && FL_TEST(self, RMODULE_IS_REFINEMENT)) {
rb_warn("not defined at the refinement, but at the outer class/module");
}
}
static inline VALUE
vm_get_iclass(const rb_control_frame_t *cfp, VALUE klass)
{
return klass;
}
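/* vm_get_ev_const() implements constant lookup.  With a nil orig_klass (and
 * allow_nil) it scans the lexical scope recorded in the cref chain, handling
 * autoload and deprecation warnings, then falls back to an ancestry search
 * from the innermost cref class (or the frame's self).  A non-nil orig_klass
 * requests a qualified Foo::BAR lookup instead.  Assumed Ruby illustration:
 *
 *   module M
 *     X = 1
 *     class C
 *       def x = X   # resolved lexically as M::X
 *     end
 *   end
 */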
static inline VALUE
vm_get_ev_const(rb_execution_context_t *ec, VALUE orig_klass, ID id, bool allow_nil, int is_defined)
{
void rb_const_warn_if_deprecated(const rb_const_entry_t *ce, VALUE klass, ID id);
VALUE val;
if (NIL_P(orig_klass) && allow_nil) {
/* in current lexical scope */
const rb_cref_t *root_cref = vm_get_cref(ec->cfp->ep);
const rb_cref_t *cref;
VALUE klass = Qnil;
while (root_cref && CREF_PUSHED_BY_EVAL(root_cref)) {
root_cref = CREF_NEXT(root_cref);
}
cref = root_cref;
while (cref && CREF_NEXT(cref)) {
if (CREF_PUSHED_BY_EVAL(cref)) {
klass = Qnil;
}
else {
klass = CREF_CLASS(cref);
}
cref = CREF_NEXT(cref);
if (!NIL_P(klass)) {
VALUE av, am = 0;
rb_const_entry_t *ce;
search_continue:
if ((ce = rb_const_lookup(klass, id))) {
rb_const_warn_if_deprecated(ce, klass, id);
val = ce->value;
if (UNDEF_P(val)) {
if (am == klass) break;
am = klass;
if (is_defined) return 1;
if (rb_autoloading_value(klass, id, &av, NULL)) return av;
rb_autoload_load(klass, id);
goto search_continue;
}
else {
if (is_defined) {
return 1;
}
else {
if (UNLIKELY(!rb_ractor_main_p())) {
if (!rb_ractor_shareable_p(val)) {
rb_raise(rb_eRactorIsolationError,
"can not access non-shareable objects in constant %"PRIsVALUE"::%s by non-main ractor.", rb_class_path(klass), rb_id2name(id));
}
}
return val;
}
}
}
}
}
/* search self */
if (root_cref && !NIL_P(CREF_CLASS(root_cref))) {
klass = vm_get_iclass(ec->cfp, CREF_CLASS(root_cref));
}
else {
klass = CLASS_OF(ec->cfp->self);
}
if (is_defined) {
return rb_const_defined(klass, id);
}
else {
return rb_const_get(klass, id);
}
}
else {
vm_check_if_namespace(orig_klass);
if (is_defined) {
return rb_public_const_defined_from(orig_klass, id);
}
else {
return rb_public_const_get_from(orig_klass, id);
}
}
}
VALUE
rb_vm_get_ev_const(rb_execution_context_t *ec, VALUE orig_klass, ID id, VALUE allow_nil)
{
return vm_get_ev_const(ec, orig_klass, id, allow_nil == Qtrue, 0);
}
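/* vm_get_ev_const_chain() resolves a whole constant path given as a
 * null-terminated array of IDs, where a leading idNULL marks an absolute
 * path.  Illustrative encoding (assumed): ::Foo::Bar::Baz becomes roughly
 * { idNULL, :Foo, :Bar, :Baz, 0 } and is resolved one segment at a time.
 */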
static inline VALUE
vm_get_ev_const_chain(rb_execution_context_t *ec, const ID *segments)
{
VALUE val = Qnil;
int idx = 0;
int allow_nil = TRUE;
if (segments[0] == idNULL) {
val = rb_cObject;
idx++;
allow_nil = FALSE;
}
while (segments[idx]) {
ID id = segments[idx++];
val = vm_get_ev_const(ec, val, id, allow_nil, 0);
allow_nil = FALSE;
}
return val;
}
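/* vm_get_cvar_base() picks the class whose class-variable table to use: it
 * walks the cref chain past singleton classes and crefs pushed by eval, and
 * raises when the access would happen at the top level, e.g. (illustrative)
 *
 *   @@x = 1   # at toplevel => RuntimeError: class variable access from toplevel
 */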
static inline VALUE
vm_get_cvar_base(const rb_cref_t *cref, const rb_control_frame_t *cfp, int top_level_raise)
{
VALUE klass;
if (!cref) {
rb_bug("vm_get_cvar_base: no cref");
}
while (CREF_NEXT(cref) &&
(NIL_P(CREF_CLASS(cref)) || RCLASS_SINGLETON_P(CREF_CLASS(cref)) ||
CREF_PUSHED_BY_EVAL(cref) || CREF_SINGLETON(cref))) {
cref = CREF_NEXT(cref);
}
if (top_level_raise && !CREF_NEXT(cref)) {
rb_raise(rb_eRuntimeError, "class variable access from toplevel");
}
klass = vm_get_iclass(cfp, CREF_CLASS(cref));
if (NIL_P(klass)) {
rb_raise(rb_eTypeError, "no class variables available");
}
return klass;
}
ALWAYS_INLINE(static void fill_ivar_cache(const rb_iseq_t *iseq, IVC ic, const struct rb_callcache *cc, int is_attr, attr_index_t index, shape_id_t shape_id));
static inline void
fill_ivar_cache(const rb_iseq_t *iseq, IVC ic, const struct rb_callcache *cc, int is_attr, attr_index_t index, shape_id_t shape_id)
{
if (is_attr) {
vm_cc_attr_index_set(cc, index, shape_id);
}
else {
vm_ic_attr_index_set(iseq, ic, index, shape_id);
}
}
#define ractor_incidental_shareable_p(cond, val) \
(!(cond) || rb_ractor_shareable_p(val))
#define ractor_object_incidental_shareable_p(obj, val) \
ractor_incidental_shareable_p(rb_ractor_shareable_p(obj), val)
#define ATTR_INDEX_NOT_SET (attr_index_t)-1
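/* vm_getivar() is the shape-based instance-variable read.  The inline cache
 * (IVC for getinstancevariable, the call cache for attr_reader) stores a
 * (shape_id, attr_index) pair; when the receiver's current shape_id matches
 * the cached one, the value is read directly from the ivar array.  On a miss
 * the index is recomputed from the shape tree (or, for objects with
 * OBJ_TOO_COMPLEX_SHAPE_ID, looked up in a hash table) and the cache is
 * refilled.  (Rough summary of the function below.)
 */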
ALWAYS_INLINE(static VALUE vm_getivar(VALUE, ID, const rb_iseq_t *, IVC, const struct rb_callcache *, int, VALUE));
static inline VALUE
vm_getivar(VALUE obj, ID id, const rb_iseq_t *iseq, IVC ic, const struct rb_callcache *cc, int is_attr, VALUE default_value)
{
#if OPT_IC_FOR_IVAR
VALUE val = Qundef;
shape_id_t shape_id;
VALUE * ivar_list;
if (SPECIAL_CONST_P(obj)) {
return default_value;
}
#if SHAPE_IN_BASIC_FLAGS
shape_id = RBASIC_SHAPE_ID(obj);
#endif
switch (BUILTIN_TYPE(obj)) {
case T_OBJECT:
ivar_list = ROBJECT_IVPTR(obj);
VM_ASSERT(rb_ractor_shareable_p(obj) ? rb_ractor_shareable_p(val) : true);
#if !SHAPE_IN_BASIC_FLAGS
shape_id = ROBJECT_SHAPE_ID(obj);
#endif
break;
case T_CLASS:
case T_MODULE:
{
if (UNLIKELY(!rb_ractor_main_p())) {
// For two reasons we can only use the fast path on the main
// ractor.
// First, only the main ractor is allowed to set ivars on classes
// and modules. So we can skip locking.
// Second, other ractors need to check the shareability of the
// values returned from the class ivars.
if (default_value == Qundef) { // defined?
return rb_ivar_defined(obj, id) ? Qtrue : Qundef;
}
else {
goto general_path;
}
}
ivar_list = RCLASS_IVPTR(obj);
#if !SHAPE_IN_BASIC_FLAGS
shape_id = RCLASS_SHAPE_ID(obj);
#endif
break;
}
default:
if (FL_TEST_RAW(obj, FL_EXIVAR)) {
struct gen_ivtbl *ivtbl;
rb_gen_ivtbl_get(obj, id, &ivtbl);
#if !SHAPE_IN_BASIC_FLAGS
shape_id = ivtbl->shape_id;
#endif
ivar_list = ivtbl->as.shape.ivptr;
}
else {
return default_value;
}
}
shape_id_t cached_id;
attr_index_t index;
if (is_attr) {
vm_cc_atomic_shape_and_index(cc, &cached_id, &index);
}
else {
vm_ic_atomic_shape_and_index(ic, &cached_id, &index);
}
if (LIKELY(cached_id == shape_id)) {
RUBY_ASSERT(cached_id != OBJ_TOO_COMPLEX_SHAPE_ID);
if (index == ATTR_INDEX_NOT_SET) {
return default_value;
}
val = ivar_list[index];
#if USE_DEBUG_COUNTER
RB_DEBUG_COUNTER_INC(ivar_get_ic_hit);
if (RB_TYPE_P(obj, T_OBJECT)) {
RB_DEBUG_COUNTER_INC(ivar_get_obj_hit);
}
#endif
RUBY_ASSERT(!UNDEF_P(val));
}
else { // cache miss case
#if USE_DEBUG_COUNTER
if (is_attr) {
if (cached_id != INVALID_SHAPE_ID) {
RB_DEBUG_COUNTER_INC(ivar_get_cc_miss_set);
}
else {
RB_DEBUG_COUNTER_INC(ivar_get_cc_miss_unset);
}
}
else {
if (cached_id != INVALID_SHAPE_ID) {
RB_DEBUG_COUNTER_INC(ivar_get_ic_miss_set);
}
else {
RB_DEBUG_COUNTER_INC(ivar_get_ic_miss_unset);
}
}
RB_DEBUG_COUNTER_INC(ivar_get_ic_miss);
if (RB_TYPE_P(obj, T_OBJECT)) {
RB_DEBUG_COUNTER_INC(ivar_get_obj_miss);
}
#endif
if (shape_id == OBJ_TOO_COMPLEX_SHAPE_ID) {
st_table *table = NULL;
switch (BUILTIN_TYPE(obj)) {
case T_CLASS:
case T_MODULE:
table = (st_table *)RCLASS_IVPTR(obj);
break;
case T_OBJECT:
table = ROBJECT_IV_HASH(obj);
break;
default: {
struct gen_ivtbl *ivtbl;
if (rb_gen_ivtbl_get(obj, 0, &ivtbl)) {
table = ivtbl->as.complex.table;
}
break;
}
}
if (!table || !st_lookup(table, id, &val)) {
val = default_value;
}
}
else {
shape_id_t previous_cached_id = cached_id;
if (rb_shape_get_iv_index_with_hint(shape_id, id, &index, &cached_id)) {
// Fill the inline cache with the index and the (possibly ancestor)
// shape id returned by the hinted lookup.
if (cached_id != previous_cached_id) {
fill_ivar_cache(iseq, ic, cc, is_attr, index, cached_id);
}
if (index == ATTR_INDEX_NOT_SET) {
val = default_value;
}
else {
// We fetched the ivar list above
val = ivar_list[index];
RUBY_ASSERT(!UNDEF_P(val));
}
}
else {
if (is_attr) {
vm_cc_attr_index_initialize(cc, shape_id);
}
else {
vm_ic_attr_index_initialize(ic, shape_id);
}
val = default_value;
}
}
}
if (!UNDEF_P(default_value)) {
RUBY_ASSERT(!UNDEF_P(val));
}
return val;
general_path:
#endif /* OPT_IC_FOR_IVAR */
RB_DEBUG_COUNTER_INC(ivar_get_ic_miss);
if (is_attr) {
return rb_attr_get(obj, id);
}
else {
return rb_ivar_get(obj, id);
}
}
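/* Record the attribute index and destination shape id in the relevant inline
 * cache: the call cache for attr accessors, or the instruction's IVC for
 * plain ivar access. */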
static void
populate_cache(attr_index_t index, shape_id_t next_shape_id, ID id, const rb_iseq_t *iseq, IVC ic, const struct rb_callcache *cc, bool is_attr)
{
RUBY_ASSERT(next_shape_id != OBJ_TOO_COMPLEX_SHAPE_ID);
// Cache population code
if (is_attr) {
vm_cc_attr_index_set(cc, index, next_shape_id);
}
else {
vm_ic_attr_index_set(iseq, ic, index, next_shape_id);
}
}
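/* Slow path for cached ivar writes: perform the full ivar set and, unless
 * the object has gone "too complex", repopulate the inline cache so later
 * writes can take the fast path again. */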
ALWAYS_INLINE(static VALUE vm_setivar_slowpath(VALUE obj, ID id, VALUE val, const rb_iseq_t *iseq, IVC ic, const struct rb_callcache *cc, int is_attr));
NOINLINE(static VALUE vm_setivar_slowpath_ivar(VALUE obj, ID id, VALUE val, const rb_iseq_t *iseq, IVC ic));
NOINLINE(static VALUE vm_setivar_slowpath_attr(VALUE obj, ID id, VALUE val, const struct rb_callcache *cc));
static VALUE
vm_setivar_slowpath(VALUE obj, ID id, VALUE val, const rb_iseq_t *iseq, IVC ic, const struct rb_callcache *cc, int is_attr)
{
#if OPT_IC_FOR_IVAR
RB_DEBUG_COUNTER_INC(ivar_set_ic_miss);
if (BUILTIN_TYPE(obj) == T_OBJECT) {
rb_check_frozen(obj);
attr_index_t index = rb_obj_ivar_set(obj, id, val);
shape_id_t next_shape_id = ROBJECT_SHAPE_ID(obj);
if (next_shape_id != OBJ_TOO_COMPLEX_SHAPE_ID) {
populate_cache(index, next_shape_id, id, iseq, ic, cc, is_attr);
}
RB_DEBUG_COUNTER_INC(ivar_set_obj_miss);
return val;
}
#endif
return rb_ivar_set(obj, id, val);
}
static VALUE
vm_setivar_slowpath_ivar(VALUE obj, ID id, VALUE val, const rb_iseq_t *iseq, IVC ic)
{
return vm_setivar_slowpath(obj, id, val, iseq, ic, NULL, false);
}
static VALUE
vm_setivar_slowpath_attr(VALUE obj, ID id, VALUE val, const struct rb_callcache *cc)
{
return vm_setivar_slowpath(obj, id, val, NULL, NULL, cc, true);
}
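/* Cached write path for generic ivars (objects that are not T_OBJECT and
 * keep their ivars in a gen_ivtbl). Returns Qundef when the cached shape
 * transition does not apply, so the caller falls back to the slow path. */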
NOINLINE(static VALUE vm_setivar_default(VALUE obj, ID id, VALUE val, shape_id_t dest_shape_id, attr_index_t index));
static VALUE
vm_setivar_default(VALUE obj, ID id, VALUE val, shape_id_t dest_shape_id, attr_index_t index)
{
#if SHAPE_IN_BASIC_FLAGS
shape_id_t shape_id = RBASIC_SHAPE_ID(obj);
#else
shape_id_t shape_id = rb_generic_shape_id(obj);
#endif
struct gen_ivtbl *ivtbl = 0;
// Cache hit case
if (shape_id == dest_shape_id) {
RUBY_ASSERT(dest_shape_id != INVALID_SHAPE_ID && shape_id != INVALID_SHAPE_ID);
}
else if (dest_shape_id != INVALID_SHAPE_ID) {
rb_shape_t *shape = rb_shape_get_shape_by_id(shape_id);
rb_shape_t *dest_shape = rb_shape_get_shape_by_id(dest_shape_id);
if (shape_id == dest_shape->parent_id && dest_shape->edge_name == id && shape->capacity == dest_shape->capacity) {
RUBY_ASSERT(index < dest_shape->capacity);
}
else {
return Qundef;
}
}
else {
return Qundef;
}
rb_gen_ivtbl_get(obj, 0, &ivtbl);
if (shape_id != dest_shape_id) {
#if SHAPE_IN_BASIC_FLAGS
RBASIC_SET_SHAPE_ID(obj, dest_shape_id);
#else
ivtbl->shape_id = dest_shape_id;
#endif
}
RB_OBJ_WRITE(obj, &ivtbl->as.shape.ivptr[index], val);
RB_DEBUG_COUNTER_INC(ivar_set_ic_hit);
return val;
}
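/* Cached write path for T_OBJECT ivars. Handles a shape cache hit and the
 * cached one-step shape transition (same capacity, matching edge name);
 * anything else returns Qundef and is handled by the caller. */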
static inline VALUE
vm_setivar(VALUE obj, ID id, VALUE val, shape_id_t dest_shape_id, attr_index_t index)
{
#if OPT_IC_FOR_IVAR
switch (BUILTIN_TYPE(obj)) {
case T_OBJECT:
{
VM_ASSERT(!rb_ractor_shareable_p(obj) || rb_obj_frozen_p(obj));
shape_id_t shape_id = ROBJECT_SHAPE_ID(obj);
RUBY_ASSERT(dest_shape_id != OBJ_TOO_COMPLEX_SHAPE_ID);
if (LIKELY(shape_id == dest_shape_id)) {
RUBY_ASSERT(dest_shape_id != INVALID_SHAPE_ID && shape_id != INVALID_SHAPE_ID);
VM_ASSERT(!rb_ractor_shareable_p(obj));
}
else if (dest_shape_id != INVALID_SHAPE_ID) {
rb_shape_t *shape = rb_shape_get_shape_by_id(shape_id);
rb_shape_t *dest_shape = rb_shape_get_shape_by_id(dest_shape_id);
shape_id_t source_shape_id = dest_shape->parent_id;
if (shape_id == source_shape_id && dest_shape->edge_name == id && shape->capacity == dest_shape->capacity) {
RUBY_ASSERT(dest_shape_id != INVALID_SHAPE_ID && shape_id != INVALID_SHAPE_ID);
ROBJECT_SET_SHAPE_ID(obj, dest_shape_id);
RUBY_ASSERT(rb_shape_get_next_iv_shape(rb_shape_get_shape_by_id(source_shape_id), id) == dest_shape);
RUBY_ASSERT(index < dest_shape->capacity);
}
else {
break;
}
}
else {
break;
}
VALUE *ptr = ROBJECT_IVPTR(obj);
RUBY_ASSERT(!rb_shape_obj_too_complex(obj));
RB_OBJ_WRITE(obj, &ptr[index], val);
RB_DEBUG_COUNTER_INC(ivar_set_ic_hit);
RB_DEBUG_COUNTER_INC(ivar_set_obj_hit);
return val;
}
break;
case T_CLASS:
case T_MODULE:
RB_DEBUG_COUNTER_INC(ivar_set_ic_miss_noobject);
default:
break;
}
return Qundef;
#endif /* OPT_IC_FOR_IVAR */
}
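/* Refresh the class variable inline cache after a lookup: find the class
 * that actually defines the cvar, stamp its cache entry with the current
 * global cvar state and cref, and remember it in the ICVARC. */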
static VALUE
update_classvariable_cache(const rb_iseq_t *iseq, VALUE klass, ID id, const rb_cref_t * cref, ICVARC ic)
{
VALUE defined_class = 0;
VALUE cvar_value = rb_cvar_find(klass, id, &defined_class);
if (RB_TYPE_P(defined_class, T_ICLASS)) {
defined_class = RBASIC(defined_class)->klass;
}
struct rb_id_table *rb_cvc_tbl = RCLASS_CVC_TBL(defined_class);
if (!rb_cvc_tbl) {
rb_bug("the cvc table should be set");
}
VALUE ent_data;
if (!rb_id_table_lookup(rb_cvc_tbl, id, &ent_data)) {
rb_bug("should have cvar cache entry");
}
struct rb_cvar_class_tbl_entry *ent = (void *)ent_data;
ent->global_cvar_state = GET_GLOBAL_CVAR_STATE();
ent->cref = cref;
ic->entry = ent;
RUBY_ASSERT(BUILTIN_TYPE((VALUE)cref) == T_IMEMO && IMEMO_TYPE_P(cref, imemo_cref));
RB_OBJ_WRITTEN(iseq, Qundef, ent->cref);
RB_OBJ_WRITTEN(iseq, Qundef, ent->class_value);
RB_OBJ_WRITTEN(ent->class_value, Qundef, ent->cref);
return cvar_value;
}
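/* Class variable read. On a valid inline cache hit (same cref, same global
 * cvar state, main ractor) the value is read straight from the cached
 * class; otherwise fall back to the full lookup and update the cache. */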
static inline VALUE
vm_getclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *reg_cfp, ID id, ICVARC ic)
{
const rb_cref_t *cref;
cref = vm_get_cref(GET_EP());
if (ic->entry && ic->entry->global_cvar_state == GET_GLOBAL_CVAR_STATE() && ic->entry->cref == cref && LIKELY(rb_ractor_main_p())) {
RB_DEBUG_COUNTER_INC(cvar_read_inline_hit);
VALUE v = rb_ivar_lookup(ic->entry->class_value, id, Qundef);
RUBY_ASSERT(!UNDEF_P(v));
return v;
}
VALUE klass = vm_get_cvar_base(cref, reg_cfp, 1);
return update_classvariable_cache(iseq, klass, id, cref, ic);
}
VALUE
rb_vm_getclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *cfp, ID id, ICVARC ic)
{
return vm_getclassvariable(iseq, cfp, id, ic);
}
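/* Class variable write, mirroring vm_getclassvariable: write through the
 * cached class on an inline cache hit, otherwise resolve the cvar base and
 * refresh the cache. */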
static inline void
vm_setclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *reg_cfp, ID id, VALUE val, ICVARC ic)
{
const rb_cref_t *cref;
cref = vm_get_cref(GET_EP());
if (ic->entry && ic->entry->global_cvar_state == GET_GLOBAL_CVAR_STATE() && ic->entry->cref == cref && LIKELY(rb_ractor_main_p())) {
RB_DEBUG_COUNTER_INC(cvar_write_inline_hit);
rb_class_ivar_set(ic->entry->class_value, id, val);
return;
}
VALUE klass = vm_get_cvar_base(cref, reg_cfp, 1);
rb_cvar_set(klass, id, val);
update_classvariable_cache(iseq, klass, id, cref, ic);
}
void
rb_vm_setclassvariable(const rb_iseq_t *iseq, const rb_control_frame_t *cfp, ID id, VALUE val, ICVARC ic)
{
vm_setclassvariable(iseq, cfp, id, val, ic);
}
static inline VALUE
vm_getinstancevariable(const rb_iseq_t *iseq, VALUE obj, ID id, IVC ic)
{
return vm_getivar(obj, id, iseq, ic, NULL, FALSE, Qnil);
}
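/* Write an instance variable through the inline cache: special constants
 * are frozen and raise; vm_setivar() tries the cached shape/index first;
 * generic (non T_OBJECT/T_CLASS/T_MODULE) objects then try the default
 * ivar path; remaining cases fall back to vm_setivar_slowpath_ivar(). */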
static inline void
vm_setinstancevariable(const rb_iseq_t *iseq, VALUE obj, ID id, VALUE val, IVC ic)
{
if (RB_SPECIAL_CONST_P(obj)) {
rb_error_frozen_object(obj);
return;
}
shape_id_t dest_shape_id;
attr_index_t index;
vm_ic_atomic_shape_and_index(ic, &dest_shape_id, &index);
if (UNLIKELY(UNDEF_P(vm_setivar(obj, id, val, dest_shape_id, index)))) {
switch (BUILTIN_TYPE(obj)) {
case T_OBJECT:
case T_CLASS:
case T_MODULE:
break;
default:
if (!UNDEF_P(vm_setivar_default(obj, id, val, dest_shape_id, index))) {
return;
}
}
vm_setivar_slowpath_ivar(obj, id, val, iseq, ic);
}
}
void
rb_vm_setinstancevariable(const rb_iseq_t *iseq, VALUE obj, ID id, VALUE val, IVC ic)
{
vm_setinstancevariable(iseq, obj, id, val, ic);
}
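/* `throw` with state 0 re-raises an in-flight throw object: pick the tag
 * state from the kind of `err` (FATAL for a Fixnum, THROW for a Symbol,
 * the recorded state for THROW_DATA, RAISE otherwise) and pass it on. */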
static VALUE
vm_throw_continue(const rb_execution_context_t *ec, VALUE err)
{
/* continue throw */
if (FIXNUM_P(err)) {
ec->tag->state = RUBY_TAG_FATAL;
}
else if (SYMBOL_P(err)) {
ec->tag->state = TAG_THROW;
}
else if (THROW_DATA_P(err)) {
ec->tag->state = THROW_DATA_STATE((struct vm_throw_data *)err);
}
else {
ec->tag->state = TAG_RAISE;
}
return err;
}
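/* Resolve an explicit throw (break/retry/return): locate the control frame
 * to escape to and wrap the thrown object in a THROW_DATA imemo.  Orphaned
 * breaks and unexpected returns raise LocalJumpError via
 * rb_vm_localjump_error(). */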
static VALUE
vm_throw_start(const rb_execution_context_t *ec, rb_control_frame_t *const reg_cfp, enum ruby_tag_type state,
const int flag, const VALUE throwobj)
{
const rb_control_frame_t *escape_cfp = NULL;
const rb_control_frame_t * const eocfp = RUBY_VM_END_CONTROL_FRAME(ec); /* end of control frame pointer */
if (flag != 0) {
/* do nothing */
}
else if (state == TAG_BREAK) {
int is_orphan = 1;
const VALUE *ep = GET_EP();
const rb_iseq_t *base_iseq = GET_ISEQ();
escape_cfp = reg_cfp;
while (ISEQ_BODY(base_iseq)->type != ISEQ_TYPE_BLOCK) {
if (ISEQ_BODY(escape_cfp->iseq)->type == ISEQ_TYPE_CLASS) {
escape_cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(escape_cfp);
ep = escape_cfp->ep;
base_iseq = escape_cfp->iseq;
}
else {
ep = VM_ENV_PREV_EP(ep);
base_iseq = ISEQ_BODY(base_iseq)->parent_iseq;
escape_cfp = rb_vm_search_cf_from_ep(ec, escape_cfp, ep);
VM_ASSERT(escape_cfp->iseq == base_iseq);
}
}
if (VM_FRAME_LAMBDA_P(escape_cfp)) {
/* lambda{... break ...} */
is_orphan = 0;
state = TAG_RETURN;
}
else {
ep = VM_ENV_PREV_EP(ep);
while (escape_cfp < eocfp) {
if (escape_cfp->ep == ep) {
const rb_iseq_t *const iseq = escape_cfp->iseq;
const VALUE epc = escape_cfp->pc - ISEQ_BODY(iseq)->iseq_encoded;
const struct iseq_catch_table *const ct = ISEQ_BODY(iseq)->catch_table;
unsigned int i;
if (!ct) break;
for (i=0; i < ct->size; i++) {
const struct iseq_catch_table_entry *const entry =
UNALIGNED_MEMBER_PTR(ct, entries[i]);
if (entry->type == CATCH_TYPE_BREAK &&
entry->iseq == base_iseq &&
entry->start < epc && entry->end >= epc) {
if (entry->cont == epc) { /* found! */
is_orphan = 0;
}
break;
}
}
break;
}
escape_cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(escape_cfp);
}
}
if (is_orphan) {
rb_vm_localjump_error("break from proc-closure", throwobj, TAG_BREAK);
}
}
else if (state == TAG_RETRY) {
const VALUE *ep = VM_ENV_PREV_EP(GET_EP());
escape_cfp = rb_vm_search_cf_from_ep(ec, reg_cfp, ep);
}
else if (state == TAG_RETURN) {
const VALUE *current_ep = GET_EP();
const VALUE *target_ep = NULL, *target_lep, *ep = current_ep;
int in_class_frame = 0;
int toplevel = 1;
escape_cfp = reg_cfp;
// find target_lep, target_ep
while (!VM_ENV_LOCAL_P(ep)) {
if (VM_ENV_FLAGS(ep, VM_FRAME_FLAG_LAMBDA) && target_ep == NULL) {
target_ep = ep;
}
ep = VM_ENV_PREV_EP(ep);
}
target_lep = ep;
while (escape_cfp < eocfp) {
const VALUE *lep = VM_CF_LEP(escape_cfp);
if (!target_lep) {
target_lep = lep;
}
if (lep == target_lep &&
VM_FRAME_RUBYFRAME_P(escape_cfp) &&
ISEQ_BODY(escape_cfp->iseq)->type == ISEQ_TYPE_CLASS) {
in_class_frame = 1;
target_lep = 0;
}
if (lep == target_lep) {
if (VM_FRAME_LAMBDA_P(escape_cfp)) {
toplevel = 0;
if (in_class_frame) {
/* lambda {class A; ... return ...; end} */
goto valid_return;
}
else {
const VALUE *tep = current_ep;
while (target_lep != tep) {
if (escape_cfp->ep == tep) {
/* in lambda */
if (tep == target_ep) {
goto valid_return;
}
else {
goto unexpected_return;
}
}
tep = VM_ENV_PREV_EP(tep);
}
}
}
else if (VM_FRAME_RUBYFRAME_P(escape_cfp)) {
switch (ISEQ_BODY(escape_cfp->iseq)->type) {
case ISEQ_TYPE_TOP:
case ISEQ_TYPE_MAIN:
if (toplevel) {
if (in_class_frame) goto unexpected_return;
if (target_ep == NULL) {
goto valid_return;
}
else {
goto unexpected_return;
}
}
break;
case ISEQ_TYPE_EVAL: {
const rb_iseq_t *is = escape_cfp->iseq;
enum rb_iseq_type t = ISEQ_BODY(is)->type;
while (t == ISEQ_TYPE_RESCUE || t == ISEQ_TYPE_ENSURE || t == ISEQ_TYPE_EVAL) {
if (!(is = ISEQ_BODY(is)->parent_iseq)) break;
t = ISEQ_BODY(is)->type;
}
toplevel = t == ISEQ_TYPE_TOP || t == ISEQ_TYPE_MAIN;
break;
}
case ISEQ_TYPE_CLASS:
toplevel = 0;
break;
default:
break;
}
}
}
if (escape_cfp->ep == target_lep && ISEQ_BODY(escape_cfp->iseq)->type == ISEQ_TYPE_METHOD) {
if (target_ep == NULL) {
goto valid_return;
}
else {
goto unexpected_return;
}
}
escape_cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(escape_cfp);
}
unexpected_return:;
rb_vm_localjump_error("unexpected return", throwobj, TAG_RETURN);
valid_return:;
/* do nothing */
}
else {
rb_bug("isns(throw): unsupported throw type");
}
ec->tag->state = state;
return (VALUE)THROW_DATA_NEW(throwobj, escape_cfp, state);
}
static VALUE
vm_throw(const rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
rb_num_t throw_state, VALUE throwobj)
{
const int state = (int)(throw_state & VM_THROW_STATE_MASK);
const int flag = (int)(throw_state & VM_THROW_NO_ESCAPE_FLAG);
if (state != 0) {
return vm_throw_start(ec, reg_cfp, state, flag, throwobj);
}
else {
return vm_throw_continue(ec, throwobj);
}
}
VALUE
rb_vm_throw(const rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, rb_num_t throw_state, VALUE throwobj)
{
return vm_throw(ec, reg_cfp, throw_state, throwobj);
}
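/* Expand `ary` onto the VM stack for multiple assignment.  `num` is the
 * number of required values; flag bit 0x01 also pushes a splat (rest)
 * array and flag bit 0x02 pushes in post-argument order (from the tail).
 * Missing elements are padded with nil. */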
static inline void
vm_expandarray(struct rb_control_frame_struct *cfp, VALUE ary, rb_num_t num, int flag)
{
int is_splat = flag & 0x01;
const VALUE *ptr;
rb_num_t len;
const VALUE obj = ary;
if (!RB_TYPE_P(ary, T_ARRAY) && NIL_P(ary = rb_check_array_type(ary))) {
ary = obj;
ptr = &ary;
len = 1;
}
else {
ptr = RARRAY_CONST_PTR(ary);
len = (rb_num_t)RARRAY_LEN(ary);
}
if (num + is_splat == 0) {
        /* nothing to push */
}
else if (flag & 0x02) {
        /* post: ..., nil, ary[-1], ..., ary[0..-num] # top */
rb_num_t i = 0, j;
if (len < num) {
for (i = 0; i < num - len; i++) {
*cfp->sp++ = Qnil;
}
}
for (j = 0; i < num; i++, j++) {
VALUE v = ptr[len - j - 1];
*cfp->sp++ = v;
}
if (is_splat) {
*cfp->sp++ = rb_ary_new4(len - j, ptr);
}
}
else {
/* normal: ary[num..-1], ary[num-2], ary[num-3], ..., ary[0] # top */
if (is_splat) {
if (num > len) {
*cfp->sp++ = rb_ary_new();
}
else {
*cfp->sp++ = rb_ary_new4(len - num, ptr + num);
}
}
if (num > len) {
rb_num_t i = 0;
for (; i < num - len; i++) {
*cfp->sp++ = Qnil;
}
for (rb_num_t j = 0; i < num; i++, j++) {
*cfp->sp++ = ptr[len - j - 1];
}
}
else {
for (rb_num_t j = 0; j < num; j++) {
*cfp->sp++ = ptr[num - j - 1];
}
}
}
RB_GC_GUARD(ary);
}
static VALUE vm_call_general(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling);
static VALUE vm_mtbl_dump(VALUE klass, ID target_mid);
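/* Per-class call-cache entries (ccs): each class maps a method id to a
 * small array of call caches, one per distinct call-site shape
 * (argc, flag).  vm_ccs_create() installs an empty list in the class's
 * cc table and vm_ccs_push() appends to it, doubling capacity as needed. */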
static struct rb_class_cc_entries *
vm_ccs_create(VALUE klass, struct rb_id_table *cc_tbl, ID mid, const rb_callable_method_entry_t *cme)
{
struct rb_class_cc_entries *ccs = ALLOC(struct rb_class_cc_entries);
#if VM_CHECK_MODE > 0
ccs->debug_sig = ~(VALUE)ccs;
#endif
ccs->capa = 0;
ccs->len = 0;
ccs->cme = cme;
METHOD_ENTRY_CACHED_SET((rb_callable_method_entry_t *)cme);
ccs->entries = NULL;
rb_id_table_insert(cc_tbl, mid, (VALUE)ccs);
RB_OBJ_WRITTEN(klass, Qundef, cme);
return ccs;
}
static void
vm_ccs_push(VALUE klass, struct rb_class_cc_entries *ccs, const struct rb_callinfo *ci, const struct rb_callcache *cc)
{
if (! vm_cc_markable(cc)) {
return;
}
if (UNLIKELY(ccs->len == ccs->capa)) {
if (ccs->capa == 0) {
ccs->capa = 1;
ccs->entries = ALLOC_N(struct rb_class_cc_entries_entry, ccs->capa);
}
else {
ccs->capa *= 2;
REALLOC_N(ccs->entries, struct rb_class_cc_entries_entry, ccs->capa);
}
}
VM_ASSERT(ccs->len < ccs->capa);
const int pos = ccs->len++;
ccs->entries[pos].argc = vm_ci_argc(ci);
ccs->entries[pos].flag = vm_ci_flag(ci);
RB_OBJ_WRITE(klass, &ccs->entries[pos].cc, cc);
if (RB_DEBUG_COUNTER_SETMAX(ccs_maxlen, ccs->len)) {
// for tuning
// vm_mtbl_dump(klass, 0);
}
}
#if VM_CHECK_MODE > 0
void
rb_vm_ccs_dump(struct rb_class_cc_entries *ccs)
{
ruby_debug_printf("ccs:%p (%d,%d)\n", (void *)ccs, ccs->len, ccs->capa);
for (int i=0; i<ccs->len; i++) {
ruby_debug_printf("CCS CI ID:flag:%x argc:%u\n",
ccs->entries[i].flag,
ccs->entries[i].argc);
rp(ccs->entries[i].cc);
}
}
static int
vm_ccs_verify(struct rb_class_cc_entries *ccs, ID mid, VALUE klass)
{
VM_ASSERT(vm_ccs_p(ccs));
VM_ASSERT(ccs->len <= ccs->capa);
for (int i=0; i<ccs->len; i++) {
const struct rb_callcache *cc = ccs->entries[i].cc;
VM_ASSERT(IMEMO_TYPE_P(cc, imemo_callcache));
VM_ASSERT(vm_cc_class_check(cc, klass));
VM_ASSERT(vm_cc_check_cme(cc, ccs->cme));
VM_ASSERT(!vm_cc_super_p(cc));
VM_ASSERT(!vm_cc_refinement_p(cc));
}
return TRUE;
}
#endif
const rb_callable_method_entry_t *rb_check_overloaded_cme(const rb_callable_method_entry_t *cme, const struct rb_callinfo * const ci);
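/* Slow-path method search: look up the class's ccs for the method id and
 * reuse a cached call cache whose (argc, flag) matches the call site;
 * otherwise resolve the callable method entry, build a new call cache and
 * record it.  An invalidated cme discards the whole ccs; an undefined or
 * missing method yields the shared empty call cache. */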
static const struct rb_callcache *
vm_search_cc(const VALUE klass, const struct rb_callinfo * const ci)
{
const ID mid = vm_ci_mid(ci);
struct rb_id_table *cc_tbl = RCLASS_CC_TBL(klass);
struct rb_class_cc_entries *ccs = NULL;
VALUE ccs_data;
if (cc_tbl) {
// CCS data is keyed on method id, so we don't need the method id
// for doing comparisons in the `for` loop below.
if (rb_id_table_lookup(cc_tbl, mid, &ccs_data)) {
ccs = (struct rb_class_cc_entries *)ccs_data;
const int ccs_len = ccs->len;
if (UNLIKELY(METHOD_ENTRY_INVALIDATED(ccs->cme))) {
rb_vm_ccs_free(ccs);
rb_id_table_delete(cc_tbl, mid);
ccs = NULL;
}
else {
VM_ASSERT(vm_ccs_verify(ccs, mid, klass));
// We already know the method id is correct because we had
// to look up the ccs_data by method id. All we need to
// compare is argc and flag
unsigned int argc = vm_ci_argc(ci);
unsigned int flag = vm_ci_flag(ci);
for (int i=0; i<ccs_len; i++) {
unsigned int ccs_ci_argc = ccs->entries[i].argc;
unsigned int ccs_ci_flag = ccs->entries[i].flag;
const struct rb_callcache *ccs_cc = ccs->entries[i].cc;
VM_ASSERT(IMEMO_TYPE_P(ccs_cc, imemo_callcache));
if (ccs_ci_argc == argc && ccs_ci_flag == flag) {
RB_DEBUG_COUNTER_INC(cc_found_in_ccs);
VM_ASSERT(vm_cc_cme(ccs_cc)->called_id == mid);
VM_ASSERT(ccs_cc->klass == klass);
VM_ASSERT(!METHOD_ENTRY_INVALIDATED(vm_cc_cme(ccs_cc)));
return ccs_cc;
}
}
}
}
}
else {
cc_tbl = RCLASS_CC_TBL(klass) = rb_id_table_create(2);
}
RB_DEBUG_COUNTER_INC(cc_not_found_in_ccs);
const rb_callable_method_entry_t *cme;
if (ccs) {
cme = ccs->cme;
cme = UNDEFINED_METHOD_ENTRY_P(cme) ? NULL : cme;
VM_ASSERT(cme == rb_callable_method_entry(klass, mid));
}
else {
cme = rb_callable_method_entry(klass, mid);
}
VM_ASSERT(cme == NULL || IMEMO_TYPE_P(cme, imemo_ment));
if (cme == NULL) {
// undef or not found: can't cache the information
VM_ASSERT(vm_cc_cme(&vm_empty_cc) == NULL);
return &vm_empty_cc;
}
VM_ASSERT(cme == rb_callable_method_entry(klass, mid));
METHOD_ENTRY_CACHED_SET((struct rb_callable_method_entry_struct *)cme);
if (ccs == NULL) {
VM_ASSERT(cc_tbl != NULL);
if (LIKELY(rb_id_table_lookup(cc_tbl, mid, &ccs_data))) {
// rb_callable_method_entry() prepares ccs.
ccs = (struct rb_class_cc_entries *)ccs_data;
}
else {
// TODO: required?
ccs = vm_ccs_create(klass, cc_tbl, mid, cme);
}
}
cme = rb_check_overloaded_cme(cme, ci);
const struct rb_callcache *cc = vm_cc_new(klass, cme, vm_call_general, cc_type_normal);
vm_ccs_push(klass, ccs, ci, cc);
VM_ASSERT(vm_cc_cme(cc) != NULL);
VM_ASSERT(cme->called_id == mid);
VM_ASSERT(vm_cc_cme(cc)->called_id == mid);
return cc;
}
const struct rb_callcache *
rb_vm_search_method_slowpath(const struct rb_callinfo *ci, VALUE klass)
{
const struct rb_callcache *cc;
VM_ASSERT_TYPE2(klass, T_CLASS, T_ICLASS);
RB_VM_LOCK_ENTER();
{
cc = vm_search_cc(klass, ci);
VM_ASSERT(cc);
VM_ASSERT(IMEMO_TYPE_P(cc, imemo_callcache));
VM_ASSERT(cc == vm_cc_empty() || cc->klass == klass);
VM_ASSERT(cc == vm_cc_empty() || callable_method_entry_p(vm_cc_cme(cc)));
VM_ASSERT(cc == vm_cc_empty() || !METHOD_ENTRY_INVALIDATED(vm_cc_cme(cc)));
VM_ASSERT(cc == vm_cc_empty() || vm_cc_cme(cc)->called_id == vm_ci_mid(ci));
}
RB_VM_LOCK_LEAVE();
return cc;
}
static const struct rb_callcache *
vm_search_method_slowpath0(VALUE cd_owner, struct rb_call_data *cd, VALUE klass)
{
#if USE_DEBUG_COUNTER
const struct rb_callcache *old_cc = cd->cc;
#endif
const struct rb_callcache *cc = rb_vm_search_method_slowpath(cd->ci, klass);
#if OPT_INLINE_METHOD_CACHE
cd->cc = cc;
const struct rb_callcache *empty_cc = &vm_empty_cc;
if (cd_owner && cc != empty_cc) {
RB_OBJ_WRITTEN(cd_owner, Qundef, cc);
}
#if USE_DEBUG_COUNTER
if (!old_cc || old_cc == empty_cc) {
// empty
RB_DEBUG_COUNTER_INC(mc_inline_miss_empty);
}
else if (old_cc == cc) {
RB_DEBUG_COUNTER_INC(mc_inline_miss_same_cc);
}
else if (vm_cc_cme(old_cc) == vm_cc_cme(cc)) {
RB_DEBUG_COUNTER_INC(mc_inline_miss_same_cme);
}
else if (vm_cc_cme(old_cc) && vm_cc_cme(cc) &&
vm_cc_cme(old_cc)->def == vm_cc_cme(cc)->def) {
RB_DEBUG_COUNTER_INC(mc_inline_miss_same_def);
}
else {
RB_DEBUG_COUNTER_INC(mc_inline_miss_diff);
}
#endif
#endif // OPT_INLINE_METHOD_CACHE
VM_ASSERT(vm_cc_cme(cc) == NULL ||
vm_cc_cme(cc)->called_id == vm_ci_mid(cd->ci));
return cc;
}
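/* Fast-path method search: trust the call site's inline cache when the
 * cached class matches the receiver's class and the cached method entry
 * has not been invalidated; otherwise fall back to the slow path above. */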
ALWAYS_INLINE(static const struct rb_callcache *vm_search_method_fastpath(VALUE cd_owner, struct rb_call_data *cd, VALUE klass));
static const struct rb_callcache *
vm_search_method_fastpath(VALUE cd_owner, struct rb_call_data *cd, VALUE klass)
{
const struct rb_callcache *cc = cd->cc;
#if OPT_INLINE_METHOD_CACHE
if (LIKELY(vm_cc_class_check(cc, klass))) {
if (LIKELY(!METHOD_ENTRY_INVALIDATED(vm_cc_cme(cc)))) {
VM_ASSERT(callable_method_entry_p(vm_cc_cme(cc)));
RB_DEBUG_COUNTER_INC(mc_inline_hit);
VM_ASSERT(vm_cc_cme(cc) == NULL || // not found
(vm_ci_flag(cd->ci) & VM_CALL_SUPER) || // search_super w/ define_method
vm_cc_cme(cc)->called_id == vm_ci_mid(cd->ci)); // cme->called_id == ci->mid
return cc;
}
RB_DEBUG_COUNTER_INC(mc_inline_miss_invalidated);
}
else {
RB_DEBUG_COUNTER_INC(mc_inline_miss_klass);
}
#endif
return vm_search_method_slowpath0(cd_owner, cd, klass);
}
static const struct rb_callcache *
vm_search_method(VALUE cd_owner, struct rb_call_data *cd, VALUE recv)
{
VALUE klass = CLASS_OF(recv);
VM_ASSERT(klass != Qfalse);
VM_ASSERT(RBASIC_CLASS(klass) == 0 || rb_obj_is_kind_of(klass, rb_cClass));
return vm_search_method_fastpath(cd_owner, cd, klass);
}
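/* cfunc_type: with the transparent_union attribute, check_cfunc() can be
 * handed C functions of any supported arity without casts; otherwise it
 * falls back to the traditional ANYARGS function-pointer cast. */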
#if __has_attribute(transparent_union)
typedef union {
VALUE (*anyargs)(ANYARGS);
VALUE (*f00)(VALUE);
VALUE (*f01)(VALUE, VALUE);
VALUE (*f02)(VALUE, VALUE, VALUE);
VALUE (*f03)(VALUE, VALUE, VALUE, VALUE);
VALUE (*f04)(VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f05)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f06)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f07)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f08)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f09)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f10)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f11)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f12)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f13)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f14)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*f15)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE);
VALUE (*fm1)(int, union { VALUE *x; const VALUE *y; } __attribute__((__transparent_union__)), VALUE);
} __attribute__((__transparent_union__)) cfunc_type;
# define make_cfunc_type(f) (cfunc_type){.anyargs = (VALUE (*)(ANYARGS))(f)}
#else
typedef VALUE (*cfunc_type)(ANYARGS);
# define make_cfunc_type(f) (cfunc_type)(f)
#endif
static inline int
check_cfunc(const rb_callable_method_entry_t *me, cfunc_type func)
{
if (! me) {
return false;
}
else {
VM_ASSERT(IMEMO_TYPE_P(me, imemo_ment));
VM_ASSERT(callable_method_entry_p(me));
VM_ASSERT(me->def);
if (me->def->type != VM_METHOD_TYPE_CFUNC) {
return false;
}
else {
#if __has_attribute(transparent_union)
return me->def->body.cfunc.func == func.anyargs;
#else
return me->def->body.cfunc.func == func;
#endif
}
}
}
static inline int
vm_method_cfunc_is(const rb_iseq_t *iseq, CALL_DATA cd, VALUE recv, cfunc_type func)
{
VM_ASSERT(iseq != NULL);
const struct rb_callcache *cc = vm_search_method((VALUE)iseq, cd, recv);
return check_cfunc(vm_cc_cme(cc), func);
}
#define check_cfunc(me, func) check_cfunc(me, make_cfunc_type(func))
#define vm_method_cfunc_is(iseq, cd, recv, func) vm_method_cfunc_is(iseq, cd, recv, make_cfunc_type(func))
#define EQ_UNREDEFINED_P(t) BASIC_OP_UNREDEFINED_P(BOP_EQ, t##_REDEFINED_OP_FLAG)
static inline bool
FIXNUM_2_P(VALUE a, VALUE b)
{
/* FIXNUM_P(a) && FIXNUM_P(b)
* == ((a & 1) && (b & 1))
* == a & b & 1 */
SIGNED_VALUE x = a;
SIGNED_VALUE y = b;
SIGNED_VALUE z = x & y & 1;
return z == 1;
}
static inline bool
FLONUM_2_P(VALUE a, VALUE b)
{
#if USE_FLONUM
/* FLONUM_P(a) && FLONUM_P(b)
* == ((a & 3) == 2) && ((b & 3) == 2)
     * == !(((a ^ 2) | (b ^ 2)) & 3)
*/
SIGNED_VALUE x = a;
SIGNED_VALUE y = b;
SIGNED_VALUE z = ((x ^ 2) | (y ^ 2)) & 3;
return !z;
#else
return false;
#endif
}
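/* Specialized `==`: compare by identity for Fixnum/Flonum/static Symbol
 * pairs and use the dedicated Float and String comparisons, but only while
 * the corresponding BOP_EQ is not redefined.  Returns Qundef when no fast
 * path applies so the caller can fall back to a real method call. */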
static VALUE
opt_equality_specialized(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) && EQ_UNREDEFINED_P(INTEGER)) {
goto compare_by_identity;
}
else if (FLONUM_2_P(recv, obj) && EQ_UNREDEFINED_P(FLOAT)) {
goto compare_by_identity;
}
else if (STATIC_SYM_P(recv) && STATIC_SYM_P(obj) && EQ_UNREDEFINED_P(SYMBOL)) {
goto compare_by_identity;
}
else if (SPECIAL_CONST_P(recv)) {
        // other special constants: no fast path; fall through to Qundef
}
else if (RBASIC_CLASS(recv) == rb_cFloat && RB_FLOAT_TYPE_P(obj) && EQ_UNREDEFINED_P(FLOAT)) {
double a = RFLOAT_VALUE(recv);
double b = RFLOAT_VALUE(obj);
#if MSC_VERSION_BEFORE(1300)
if (isnan(a)) {
return Qfalse;
}
else if (isnan(b)) {
return Qfalse;
}
else
#endif
return RBOOL(a == b);
}
else if (RBASIC_CLASS(recv) == rb_cString && EQ_UNREDEFINED_P(STRING)) {
if (recv == obj) {
return Qtrue;
}
else if (RB_TYPE_P(obj, T_STRING)) {
return rb_str_eql_internal(obj, recv);
}
}
return Qundef;
compare_by_identity:
return RBOOL(recv == obj);
}
static VALUE
opt_equality(const rb_iseq_t *cd_owner, VALUE recv, VALUE obj, CALL_DATA cd)
{
VM_ASSERT(cd_owner != NULL);
VALUE val = opt_equality_specialized(recv, obj);
if (!UNDEF_P(val)) return val;
if (!vm_method_cfunc_is(cd_owner, cd, recv, rb_obj_equal)) {
return Qundef;
}
else {
return RBOOL(recv == obj);
}
}
#undef EQ_UNREDEFINED_P
static inline const struct rb_callcache *gccct_method_search(rb_execution_context_t *ec, VALUE recv, ID mid, const struct rb_callinfo *ci); // vm_eval.c
NOINLINE(static VALUE opt_equality_by_mid_slowpath(VALUE recv, VALUE obj, ID mid));
static VALUE
opt_equality_by_mid_slowpath(VALUE recv, VALUE obj, ID mid)
{
const struct rb_callcache *cc = gccct_method_search(GET_EC(), recv, mid, &VM_CI_ON_STACK(mid, 0, 1, NULL));
if (cc && check_cfunc(vm_cc_cme(cc), rb_obj_equal)) {
return RBOOL(recv == obj);
}
else {
return Qundef;
}
}
static VALUE
opt_equality_by_mid(VALUE recv, VALUE obj, ID mid)
{
VALUE val = opt_equality_specialized(recv, obj);
if (!UNDEF_P(val)) {
return val;
}
else {
return opt_equality_by_mid_slowpath(recv, obj, mid);
}
}
VALUE
rb_equal_opt(VALUE obj1, VALUE obj2)
{
return opt_equality_by_mid(obj1, obj2, idEq);
}
VALUE
rb_eql_opt(VALUE obj1, VALUE obj2)
{
return opt_equality_by_mid(obj1, obj2, idEqlP);
}
extern VALUE rb_vm_call0(rb_execution_context_t *ec, VALUE, ID, int, const VALUE*, const rb_callable_method_entry_t *, int kw_splat);
extern VALUE rb_vm_call_with_refinements(rb_execution_context_t *, VALUE, ID, int, const VALUE *, int);
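/* checkmatch semantics: `when` uses the pattern as-is, `rescue` requires a
 * class or module, and both `rescue` and `case` dispatch #=== with
 * refinements taken into account. */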
static VALUE
check_match(rb_execution_context_t *ec, VALUE pattern, VALUE target, enum vm_check_match_type type)
{
switch (type) {
case VM_CHECKMATCH_TYPE_WHEN:
return pattern;
case VM_CHECKMATCH_TYPE_RESCUE:
if (!rb_obj_is_kind_of(pattern, rb_cModule)) {
rb_raise(rb_eTypeError, "class or module required for rescue clause");
}
/* fall through */
case VM_CHECKMATCH_TYPE_CASE: {
return rb_vm_call_with_refinements(ec, pattern, idEqq, 1, &target, RB_NO_KEYWORDS);
}
default:
rb_bug("check_match: unreachable");
}
}
#if MSC_VERSION_BEFORE(1300)
#define CHECK_CMP_NAN(a, b) if (isnan(a) || isnan(b)) return Qfalse;
#else
#define CHECK_CMP_NAN(a, b) /* do nothing */
#endif
static inline VALUE
double_cmp_lt(double a, double b)
{
CHECK_CMP_NAN(a, b);
return RBOOL(a < b);
}
static inline VALUE
double_cmp_le(double a, double b)
{
CHECK_CMP_NAN(a, b);
return RBOOL(a <= b);
}
static inline VALUE
double_cmp_gt(double a, double b)
{
CHECK_CMP_NAN(a, b);
return RBOOL(a > b);
}
static inline VALUE
double_cmp_ge(double a, double b)
{
CHECK_CMP_NAN(a, b);
return RBOOL(a >= b);
}
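/* Base pointer of a Ruby frame: the first local slot, computed from the
 * previous frame's SP plus the local table size and the env data words.
 * Frames taking forwardable arguments (`...`) and method/bmethod frames
 * (which also push `self`) need an extra adjustment.  Returns NULL when
 * the frame is not a Ruby (iseq) frame. */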
// Copied by vm_dump.c
static inline VALUE *
vm_base_ptr(const rb_control_frame_t *cfp)
{
const rb_control_frame_t *prev_cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
if (cfp->iseq && VM_FRAME_RUBYFRAME_P(cfp)) {
VALUE *bp = prev_cfp->sp + ISEQ_BODY(cfp->iseq)->local_table_size + VM_ENV_DATA_SIZE;
if (ISEQ_BODY(cfp->iseq)->param.flags.forwardable && VM_ENV_LOCAL_P(cfp->ep)) {
int lts = ISEQ_BODY(cfp->iseq)->local_table_size;
int params = ISEQ_BODY(cfp->iseq)->param.size;
CALL_INFO ci = (CALL_INFO)cfp->ep[-(VM_ENV_DATA_SIZE + (lts - params))]; // skip EP stuff, CI should be last local
bp += vm_ci_argc(ci);
}
if (ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_METHOD || VM_FRAME_BMETHOD_P(cfp)) {
/* adjust `self' */
bp += 1;
}
#if VM_DEBUG_BP_CHECK
if (bp != cfp->bp_check) {
ruby_debug_printf("bp_check: %ld, bp: %ld\n",
(long)(cfp->bp_check - GET_EC()->vm_stack),
(long)(bp - GET_EC()->vm_stack));
rb_bug("vm_base_ptr: unreachable");
}
#endif
return bp;
}
else {
return NULL;
}
}
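/* Exported wrapper around vm_base_ptr() so callers outside this file
 * (the JITs, for example) can compute a frame's base pointer. */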
VALUE *
rb_vm_base_ptr(const rb_control_frame_t *cfp)
{
return vm_base_ptr(cfp);
}
/* method call processes with call_info */
#include "vm_args.c"
static inline VALUE vm_call_iseq_setup_2(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling, int opt_pc, int param_size, int local_size);
ALWAYS_INLINE(static VALUE vm_call_iseq_setup_normal(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling, const rb_callable_method_entry_t *me, int opt_pc, int param_size, int local_size));
static inline VALUE vm_call_iseq_setup_tailcall(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling, int opt_pc);
static VALUE vm_call_super_method(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling);
static VALUE vm_call_method_nome(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling);
static VALUE vm_call_method_each_type(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling);
static inline VALUE vm_call_method(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling);
static vm_call_handler vm_call_iseq_setup_func(const struct rb_callinfo *ci, const int param_size, const int local_size);
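/* The "_0start" call handlers enter the method body at opt_pc == 0,
 * i.e. no optional-parameter adjustment of the entry point is needed. */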
static VALUE
vm_call_iseq_setup_tailcall_0start(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_iseq_setup_tailcall_0start);
return vm_call_iseq_setup_tailcall(ec, cfp, calling, 0);
}
static VALUE
vm_call_iseq_setup_normal_0start(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_iseq_setup_0start);
const struct rb_callcache *cc = calling->cc;
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
int param = ISEQ_BODY(iseq)->param.size;
int local = ISEQ_BODY(iseq)->local_table_size;
return vm_call_iseq_setup_normal(ec, cfp, calling, vm_cc_cme(cc), 0, param, local);
}
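/* A "simple" iseq declares only required positional parameters: no
 * optional, rest, post, keyword, keyword-rest or block parameters, no
 * `**nil`, and no `...` forwarding. Such methods can take the fast
 * argument-setup path, using the caller's arguments on the stack as-is. */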
bool
rb_simple_iseq_p(const rb_iseq_t *iseq)
{
return ISEQ_BODY(iseq)->param.flags.has_opt == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_rest == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_post == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_kw == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_kwrest == FALSE &&
ISEQ_BODY(iseq)->param.flags.accepts_no_kwarg == FALSE &&
ISEQ_BODY(iseq)->param.flags.forwardable == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_block == FALSE;
}
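/* True if the iseq takes only positional parameters, at least one of
 * which is optional (no rest/post/kw/kwrest/block/forwarding). */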
bool
rb_iseq_only_optparam_p(const rb_iseq_t *iseq)
{
return ISEQ_BODY(iseq)->param.flags.has_opt == TRUE &&
ISEQ_BODY(iseq)->param.flags.has_rest == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_post == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_kw == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_kwrest == FALSE &&
ISEQ_BODY(iseq)->param.flags.accepts_no_kwarg == FALSE &&
ISEQ_BODY(iseq)->param.flags.forwardable == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_block == FALSE;
}
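/* True if the iseq takes required positional parameters plus keyword
 * parameters only (no optional/rest/post/kwrest/block/forwarding). */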
bool
rb_iseq_only_kwparam_p(const rb_iseq_t *iseq)
{
return ISEQ_BODY(iseq)->param.flags.has_opt == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_rest == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_post == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_kw == TRUE &&
ISEQ_BODY(iseq)->param.flags.has_kwrest == FALSE &&
ISEQ_BODY(iseq)->param.flags.forwardable == FALSE &&
ISEQ_BODY(iseq)->param.flags.has_block == FALSE;
}
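/* Sentinel values for the max_args parameter below: passing a value <=
 * ALLOW_HEAP_ARGV allows vm_caller_setup_arg_splat() to collect a large
 * splat into a hidden heap array (calling->heap_argv) instead of the VM
 * stack. The _KEEP_KWSPLAT variant additionally asks for a trailing
 * keyword splat to be kept as keywords when that heap array is used. */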
#define ALLOW_HEAP_ARGV (-2)
#define ALLOW_HEAP_ARGV_KEEP_KWSPLAT (-3)
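/* Expand the splatted array `ary` for the current call. Its elements are
 * normally copied onto the VM stack after the arguments already pushed;
 * if max_args allows heap argv and the total would exceed
 * VM_ARGC_STACK_MAX, all arguments are moved into a hidden heap array
 * (calling->heap_argv) instead, to avoid a SystemStackError. Returns true
 * if copying was truncated because a non-negative max_args limit was hit
 * (the non-lambda block case), telling the caller to stop further setup. */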
static inline bool
vm_caller_setup_arg_splat(rb_control_frame_t *cfp, struct rb_calling_info *calling, VALUE ary, int max_args)
{
vm_check_canary(GET_EC(), cfp->sp);
bool ret = false;
if (!NIL_P(ary)) {
const VALUE *ptr = RARRAY_CONST_PTR(ary);
long len = RARRAY_LEN(ary);
int argc = calling->argc;
if (UNLIKELY(max_args <= ALLOW_HEAP_ARGV && len + argc > VM_ARGC_STACK_MAX)) {
/* Avoid SystemStackError when splatting large arrays by storing arguments in
* a temporary array, instead of trying to keep arguments on the VM stack.
*/
VALUE *argv = cfp->sp - argc;
VALUE argv_ary = rb_ary_hidden_new(len + argc + 1);
rb_ary_cat(argv_ary, argv, argc);
rb_ary_cat(argv_ary, ptr, len);
cfp->sp -= argc - 1;
cfp->sp[-1] = argv_ary;
calling->argc = 1;
calling->heap_argv = argv_ary;
RB_GC_GUARD(ary);
}
else {
long i;
if (max_args >= 0 && len + argc > max_args) {
/* If only a given max_args is allowed, copy up to max args.
* Used by vm_callee_setup_block_arg for non-lambda blocks,
* where additional arguments are ignored.
*
* Also, copy up to one more argument than the maximum,
* in case it is an empty keyword hash that will be removed.
*/
calling->argc += len - (max_args - argc + 1);
len = max_args - argc + 1;
ret = true;
}
else {
/* Unset heap_argv if it was set originally. This can happen when
* forwarding modified arguments, where heap_argv was used
* originally but the forwarded method does not support
* heap_argv in all cases.
*/
calling->heap_argv = 0;
}
CHECK_VM_STACK_OVERFLOW(cfp, len);
for (i = 0; i < len; i++) {
*cfp->sp++ = ptr[i];
}
calling->argc += i;
}
}
return ret;
}
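/* Gather the trailing kw_len keyword values from the VM stack into a newly
 * allocated hash, leave that hash as the single trailing argument, and mark
 * the calling info as carrying a keyword splat. */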
static inline void
vm_caller_setup_arg_kw(rb_control_frame_t *cfp, struct rb_calling_info *calling, const struct rb_callinfo *ci)
{
const VALUE *const passed_keywords = vm_ci_kwarg(ci)->keywords;
const int kw_len = vm_ci_kwarg(ci)->keyword_len;
const VALUE h = rb_hash_new_with_size(kw_len);
VALUE *sp = cfp->sp;
int i;
for (i=0; i<kw_len; i++) {
rb_hash_aset(h, passed_keywords[i], (sp - kw_len)[i]);
}
(sp-kw_len)[0] = h;
cfp->sp -= kw_len - 1;
calling->argc -= kw_len - 1;
calling->kw_splat = 1;
}
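/* Normalize a keyword-splat argument: a non-hash, non-nil value is converted
 * with to_hash and duped; an existing hash is duped unless it was passed as
 * a mutable keyword splat or is empty, so the callee cannot mutate the
 * caller's hash. nil is returned unchanged. */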
static inline VALUE
vm_caller_setup_keyword_hash(const struct rb_callinfo *ci, VALUE keyword_hash)
{
if (UNLIKELY(!RB_TYPE_P(keyword_hash, T_HASH))) {
if (keyword_hash != Qnil) {
/* Convert a non-hash keyword splat to a new hash */
keyword_hash = rb_hash_dup(rb_to_hash_type(keyword_hash));
}
}
else if (!IS_ARGS_KW_SPLAT_MUT(ci) && !RHASH_EMPTY_P(keyword_hash)) {
/* Convert a hash keyword splat to a new hash unless
* a mutable keyword splat was passed.
* Skip allocating new hash for empty keyword splat, as empty
* keyword splat will be ignored by both callers.
*/
keyword_hash = rb_hash_dup(keyword_hash);
}
return keyword_hash;
}
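/* Caller-side argument setup: expand an array splat and/or keyword splat at
 * the call site onto the VM stack (or into a hidden heap array for very
 * large splats) before callee-specific argument setup runs. max_args either
 * bounds how many arguments are kept (non-lambda blocks) or selects the
 * heap-argv behaviour via the ALLOW_HEAP_ARGV sentinels. */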
static inline void
CALLER_SETUP_ARG(struct rb_control_frame_struct *restrict cfp,
struct rb_calling_info *restrict calling,
const struct rb_callinfo *restrict ci, int max_args)
{
if (UNLIKELY(IS_ARGS_SPLAT(ci))) {
if (IS_ARGS_KW_SPLAT(ci)) {
// f(*a, **kw)
VM_ASSERT(calling->kw_splat == 1);
cfp->sp -= 2;
calling->argc -= 2;
VALUE ary = cfp->sp[0];
VALUE kwh = vm_caller_setup_keyword_hash(ci, cfp->sp[1]);
// splat a
if (vm_caller_setup_arg_splat(cfp, calling, ary, max_args)) return;
// put kw
if (kwh != Qnil && !RHASH_EMPTY_P(kwh)) {
if (UNLIKELY(calling->heap_argv)) {
rb_ary_push(calling->heap_argv, kwh);
((struct RHash *)kwh)->basic.flags |= RHASH_PASS_AS_KEYWORDS;
if (max_args != ALLOW_HEAP_ARGV_KEEP_KWSPLAT) {
calling->kw_splat = 0;
}
}
else {
cfp->sp[0] = kwh;
cfp->sp++;
calling->argc++;
VM_ASSERT(calling->kw_splat == 1);
}
}
else {
calling->kw_splat = 0;
}
}
else {
// f(*a)
VM_ASSERT(calling->kw_splat == 0);
cfp->sp -= 1;
calling->argc -= 1;
VALUE ary = cfp->sp[0];
if (vm_caller_setup_arg_splat(cfp, calling, ary, max_args)) {
goto check_keyword;
}
// check the last argument
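            /* If the array just splatted ends with a hash flagged with
             * RHASH_PASS_AS_KEYWORDS (a ruby2_keywords hash), it should be
             * passed as keywords rather than as a trailing positional hash:
             * an empty one is simply dropped, otherwise a duplicate is
             * passed with kw_splat set. */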
VALUE last_hash, argv_ary;
if (UNLIKELY(argv_ary = calling->heap_argv)) {
if (!IS_ARGS_KEYWORD(ci) &&
RARRAY_LEN(argv_ary) > 0 &&
RB_TYPE_P((last_hash = rb_ary_last(0, NULL, argv_ary)), T_HASH) &&
(((struct RHash *)last_hash)->basic.flags & RHASH_PASS_AS_KEYWORDS)) {
rb_ary_pop(argv_ary);
if (!RHASH_EMPTY_P(last_hash)) {
rb_ary_push(argv_ary, rb_hash_dup(last_hash));
calling->kw_splat = 1;
}
}
}
else {
check_keyword:
if (!IS_ARGS_KEYWORD(ci) &&
calling->argc > 0 &&
RB_TYPE_P((last_hash = cfp->sp[-1]), T_HASH) &&
(((struct RHash *)last_hash)->basic.flags & RHASH_PASS_AS_KEYWORDS)) {
if (RHASH_EMPTY_P(last_hash)) {
calling->argc--;
cfp->sp -= 1;
}
else {
cfp->sp[-1] = rb_hash_dup(last_hash);
calling->kw_splat = 1;
}
}
}
}
}
else if (UNLIKELY(IS_ARGS_KW_SPLAT(ci))) {
// f(**kw)
VM_ASSERT(calling->kw_splat == 1);
VALUE kwh = vm_caller_setup_keyword_hash(ci, cfp->sp[-1]);
if (kwh == Qnil || RHASH_EMPTY_P(kwh)) {
cfp->sp--;
calling->argc--;
calling->kw_splat = 0;
}
else {
cfp->sp[-1] = kwh;
}
}
else if (UNLIKELY(IS_ARGS_KEYWORD(ci))) {
// f(k1:1, k2:2)
VM_ASSERT(calling->kw_splat == 0);
/* This converts VM_CALL_KWARG style to VM_CALL_KW_SPLAT style
* by creating a keyword hash.
* So, vm_ci_flag(ci) & VM_CALL_KWARG is now inconsistent.
*/
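        /* For example, a call written as f(k1: 1, k2: 2) reaches the callee
         * as a single trailing keyword hash, as if it had been written
         * f(**{k1: 1, k2: 2}). */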
vm_caller_setup_arg_kw(cfp, calling, ci);
}
}
#define USE_OPT_HIST 0
#if USE_OPT_HIST
#define OPT_HIST_MAX 64
static int opt_hist[OPT_HIST_MAX+1];
__attribute__((destructor))
static void
opt_hist_show_results_at_exit(void)
{
for (int i=0; i<OPT_HIST_MAX; i++) {
ruby_debug_printf("opt_hist\t%d\t%d\n", i, opt_hist[i]);
}
}
#endif
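/* To collect the histogram, rebuild with USE_OPT_HIST set to 1: each call
 * through the fast optional-argument paths records how many optional
 * arguments were supplied, and the counts are printed at process exit by
 * the destructor above. */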
static VALUE
vm_call_iseq_setup_normal_opt_start(rb_execution_context_t *ec, rb_control_frame_t *cfp,
struct rb_calling_info *calling)
{
const struct rb_callcache *cc = calling->cc;
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
const int lead_num = ISEQ_BODY(iseq)->param.lead_num;
const int opt = calling->argc - lead_num;
const int opt_num = ISEQ_BODY(iseq)->param.opt_num;
const int opt_pc = (int)ISEQ_BODY(iseq)->param.opt_table[opt];
const int param = ISEQ_BODY(iseq)->param.size;
const int local = ISEQ_BODY(iseq)->local_table_size;
const int delta = opt_num - opt;
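    /* Example: for `def m(a, b = 1, c = 2)` called as m(0, 1): lead_num is 1
     * and argc is 2, so opt == 1 optional argument was supplied; opt_num == 2
     * gives delta == 1 missing optional.  Execution starts at opt_table[1],
     * the entry point that fills in the default for c, and the frame is set
     * up with param.size - delta argument slots coming from the stack. */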
RB_DEBUG_COUNTER_INC(ccf_iseq_opt);
#if USE_OPT_HIST
if (opt_pc < OPT_HIST_MAX) {
opt_hist[opt]++;
}
else {
opt_hist[OPT_HIST_MAX]++;
}
#endif
return vm_call_iseq_setup_normal(ec, cfp, calling, vm_cc_cme(cc), opt_pc, param - delta, local);
}
static VALUE
vm_call_iseq_setup_tailcall_opt_start(rb_execution_context_t *ec, rb_control_frame_t *cfp,
struct rb_calling_info *calling)
{
const struct rb_callcache *cc = calling->cc;
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
const int lead_num = ISEQ_BODY(iseq)->param.lead_num;
const int opt = calling->argc - lead_num;
const int opt_pc = (int)ISEQ_BODY(iseq)->param.opt_table[opt];
RB_DEBUG_COUNTER_INC(ccf_iseq_opt);
#if USE_OPT_HIST
if (opt_pc < OPT_HIST_MAX) {
opt_hist[opt]++;
}
else {
opt_hist[OPT_HIST_MAX]++;
}
#endif
return vm_call_iseq_setup_tailcall(ec, cfp, calling, opt_pc);
}
static void
args_setup_kw_parameters(rb_execution_context_t *const ec, const rb_iseq_t *const iseq,
VALUE *const passed_values, const int passed_keyword_len, const VALUE *const passed_keywords,
VALUE *const locals);
static VALUE
vm_call_iseq_forwardable(rb_execution_context_t *ec, rb_control_frame_t *cfp,
struct rb_calling_info *calling)
{
const struct rb_callcache *cc = calling->cc;
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
int param_size = ISEQ_BODY(iseq)->param.size;
int local_size = ISEQ_BODY(iseq)->local_table_size;
// Setting up local size and param size
VM_ASSERT(ISEQ_BODY(iseq)->param.flags.forwardable);
local_size = local_size + vm_ci_argc(calling->cd->ci);
param_size = param_size + vm_ci_argc(calling->cd->ci);
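    /* Stash the caller's callinfo on the stack so that the forwardable
     * callee (typically a method declared with `...`) can recover the
     * original argument count and keyword layout when forwarding them. */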
cfp->sp[0] = (VALUE)calling->cd->ci;
return vm_call_iseq_setup_normal(ec, cfp, calling, vm_cc_cme(cc), 0, param_size, local_size);
}
static VALUE
vm_call_iseq_setup_kwparm_kwarg(rb_execution_context_t *ec, rb_control_frame_t *cfp,
struct rb_calling_info *calling)
{
const struct rb_callinfo *ci = calling->cd->ci;
const struct rb_callcache *cc = calling->cc;
VM_ASSERT(vm_ci_flag(ci) & VM_CALL_KWARG);
RB_DEBUG_COUNTER_INC(ccf_iseq_kw1);
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
const struct rb_iseq_param_keyword *kw_param = ISEQ_BODY(iseq)->param.keyword;
const struct rb_callinfo_kwarg *kw_arg = vm_ci_kwarg(ci);
const int ci_kw_len = kw_arg->keyword_len;
const VALUE * const ci_keywords = kw_arg->keywords;
VALUE *argv = cfp->sp - calling->argc;
VALUE *const klocals = argv + kw_param->bits_start - kw_param->num;
const int lead_num = ISEQ_BODY(iseq)->param.lead_num;
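    /* The passed keyword values sit on the stack right after the lead
     * arguments, i.e. in the same region klocals points into, so copy them
     * to a temporary buffer before args_setup_kw_parameters() rearranges
     * them into the keyword locals in parameter order. */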
VALUE * const ci_kws = ALLOCA_N(VALUE, ci_kw_len);
MEMCPY(ci_kws, argv + lead_num, VALUE, ci_kw_len);
args_setup_kw_parameters(ec, iseq, ci_kws, ci_kw_len, ci_keywords, klocals);
int param = ISEQ_BODY(iseq)->param.size;
int local = ISEQ_BODY(iseq)->local_table_size;
return vm_call_iseq_setup_normal(ec, cfp, calling, vm_cc_cme(cc), 0, param, local);
}
static VALUE
vm_call_iseq_setup_kwparm_nokwarg(rb_execution_context_t *ec, rb_control_frame_t *cfp,
struct rb_calling_info *calling)
{
const struct rb_callinfo *MAYBE_UNUSED(ci) = calling->cd->ci;
const struct rb_callcache *cc = calling->cc;
VM_ASSERT((vm_ci_flag(ci) & VM_CALL_KWARG) == 0);
RB_DEBUG_COUNTER_INC(ccf_iseq_kw2);
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
const struct rb_iseq_param_keyword *kw_param = ISEQ_BODY(iseq)->param.keyword;
VALUE * const argv = cfp->sp - calling->argc;
VALUE * const klocals = argv + kw_param->bits_start - kw_param->num;
int i;
for (i=0; i<kw_param->num; i++) {
klocals[i] = kw_param->default_values[i];
}
klocals[i] = INT2FIX(0); // kw specify flag
// NOTE:
// nobody checks this value, but it should be cleared because it can
// point to an invalid VALUE (T_NONE objects, raw pointers, and so on).
int param = ISEQ_BODY(iseq)->param.size;
int local = ISEQ_BODY(iseq)->local_table_size;
return vm_call_iseq_setup_normal(ec, cfp, calling, vm_cc_cme(cc), 0, param, local);
}
static VALUE builtin_invoker0(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr);
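// Invoke a method whose body is a single no-argument leaf builtin
// directly, without pushing a new VM frame: pop the receiver and
// arguments off the stack, then call the builtin's C function through
// builtin_invoker0.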
static VALUE
vm_call_single_noarg_leaf_builtin(rb_execution_context_t *ec, rb_control_frame_t *cfp,
struct rb_calling_info *calling)
{
const struct rb_builtin_function *bf = calling->cc->aux_.bf;
cfp->sp -= (calling->argc + 1);
rb_insn_func_t func_ptr = (rb_insn_func_t)(uintptr_t)bf->func_ptr;
return builtin_invoker0(ec, calling->recv, NULL, func_ptr);
}
VALUE rb_gen_method_name(VALUE owner, VALUE name); // in vm_backtrace.c
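// Warn when a block is passed to a method whose iseq does not use a block.
// A per-VM table (vm->unused_block_warning_table) deduplicates warnings;
// in strict mode the dedup key mixes the call site pc with the method
// definition pointer.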
static void
warn_unused_block(const rb_callable_method_entry_t *cme, const rb_iseq_t *iseq, void *pc)
{
rb_vm_t *vm = GET_VM();
st_table *dup_check_table = vm->unused_block_warning_table;
st_data_t key;
union {
VALUE v;
unsigned char b[SIZEOF_VALUE];
} k1 = {
.v = (VALUE)pc,
}, k2 = {
.v = (VALUE)cme->def,
};
// relax check
if (!vm->unused_block_warning_strict) {
key = (st_data_t)cme->def->original_id;
if (st_lookup(dup_check_table, key, NULL)) {
return;
}
}
// strict check
// make unique key from pc and me->def pointer
key = 0;
for (int i=0; i<SIZEOF_VALUE; i++) {
// fprintf(stderr, "k1:%3d k2:%3d\n", k1.b[i], k2.b[SIZEOF_VALUE-1-i]);
key |= (st_data_t)(k1.b[i] ^ k2.b[SIZEOF_VALUE-1-i]) << (8 * i);
}
if (0) {
fprintf(stderr, "SIZEOF_VALUE:%d\n", SIZEOF_VALUE);
fprintf(stderr, "pc:%p def:%p\n", pc, (void *)cme->def);
fprintf(stderr, "key:%p\n", (void *)key);
}
// duplication check
if (st_insert(dup_check_table, key, 1)) {
// already shown
}
else {
VALUE m_loc = rb_method_entry_location((const rb_method_entry_t *)cme);
VALUE name = rb_gen_method_name(cme->defined_class, ISEQ_BODY(iseq)->location.base_label);
if (!NIL_P(m_loc)) {
rb_warning("the block passed to '%"PRIsVALUE"' defined at %"PRIsVALUE":%"PRIsVALUE" may be ignored",
name, RARRAY_AREF(m_loc, 0), RARRAY_AREF(m_loc, 1));
}
else {
rb_warning("the block may be ignored because '%"PRIsVALUE"' does not use a block", name);
}
}
}
static inline int
vm_callee_setup_arg(rb_execution_context_t *ec, struct rb_calling_info *calling,
const rb_iseq_t *iseq, VALUE *argv, int param_size, int local_size)
{
const struct rb_callinfo *ci = calling->cd->ci;
const struct rb_callcache *cc = calling->cc;
VM_ASSERT((vm_ci_argc(ci), 1));
VM_ASSERT(vm_cc_cme(cc) != NULL);
if (UNLIKELY(!ISEQ_BODY(iseq)->param.flags.use_block &&
calling->block_handler != VM_BLOCK_HANDLER_NONE &&
!(vm_ci_flag(calling->cd->ci) & (VM_CALL_OPT_SEND | VM_CALL_SUPER)))) {
warn_unused_block(vm_cc_cme(cc), iseq, (void *)ec->cfp->pc);
}
if (LIKELY(!(vm_ci_flag(ci) & VM_CALL_KW_SPLAT))) {
if (LIKELY(rb_simple_iseq_p(iseq))) {
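// Simple iseq: only mandatory lead parameters, so the argument count
// must match lead_num exactly and no further argument setup is needed.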
rb_control_frame_t *cfp = ec->cfp;
int lead_num = ISEQ_BODY(iseq)->param.lead_num;
CALLER_SETUP_ARG(cfp, calling, ci, lead_num);
if (calling->argc != lead_num) {
argument_arity_error(ec, iseq, calling->argc, lead_num, lead_num);
}
//VM_ASSERT(ci == calling->cd->ci);
VM_ASSERT(cc == calling->cc);
if (vm_call_iseq_optimizable_p(ci, cc)) {
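// When the whole method body is a single no-argument leaf builtin and
// C-call tracing is off, skip the iseq frame on later calls and invoke
// the builtin directly (vm_call_single_noarg_leaf_builtin).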
if ((iseq->body->builtin_attrs & BUILTIN_ATTR_SINGLE_NOARG_LEAF) &&
!(ruby_vm_event_flags & (RUBY_EVENT_C_CALL | RUBY_EVENT_C_RETURN))) {
VM_ASSERT(iseq->body->builtin_attrs & BUILTIN_ATTR_LEAF);
vm_cc_bf_set(cc, (void *)iseq->body->iseq_encoded[1]);
CC_SET_FASTPATH(cc, vm_call_single_noarg_leaf_builtin, true);
}
else {
CC_SET_FASTPATH(cc, vm_call_iseq_setup_func(ci, param_size, local_size), true);
}
}
return 0;
}
else if (rb_iseq_only_optparam_p(iseq)) {
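// Only lead + optional positional parameters: check the arity range,
// fill the unsupplied optionals with Qnil, and start execution at the
// opt_table entry matching the number of optionals actually given.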
rb_control_frame_t *cfp = ec->cfp;
const int lead_num = ISEQ_BODY(iseq)->param.lead_num;
const int opt_num = ISEQ_BODY(iseq)->param.opt_num;
CALLER_SETUP_ARG(cfp, calling, ci, lead_num + opt_num);
const int argc = calling->argc;
const int opt = argc - lead_num;
if (opt < 0 || opt > opt_num) {
argument_arity_error(ec, iseq, argc, lead_num, lead_num + opt_num);
}
if (LIKELY(!(vm_ci_flag(ci) & VM_CALL_TAILCALL))) {
CC_SET_FASTPATH(cc, vm_call_iseq_setup_normal_opt_start,
!IS_ARGS_SPLAT(ci) && !IS_ARGS_KEYWORD(ci) &&
vm_call_cacheable(ci, cc));
}
else {
CC_SET_FASTPATH(cc, vm_call_iseq_setup_tailcall_opt_start,
!IS_ARGS_SPLAT(ci) && !IS_ARGS_KEYWORD(ci) &&
vm_call_cacheable(ci, cc));
}
/* initialize opt vars for self-references */
VM_ASSERT((int)ISEQ_BODY(iseq)->param.size == lead_num + opt_num);
for (int i=argc; i<lead_num + opt_num; i++) {
argv[i] = Qnil;
}
return (int)ISEQ_BODY(iseq)->param.opt_table[opt];
}
else if (rb_iseq_only_kwparam_p(iseq) && !IS_ARGS_SPLAT(ci)) {
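// Only lead + keyword parameters and no splat: use the special
// dispatchers, one for calls that pass keyword arguments and one for
// calls that pass none.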
const int lead_num = ISEQ_BODY(iseq)->param.lead_num;
const int argc = calling->argc;
const struct rb_iseq_param_keyword *kw_param = ISEQ_BODY(iseq)->param.keyword;
if (vm_ci_flag(ci) & VM_CALL_KWARG) {
const struct rb_callinfo_kwarg *kw_arg = vm_ci_kwarg(ci);
if (argc - kw_arg->keyword_len == lead_num) {
const int ci_kw_len = kw_arg->keyword_len;
const VALUE * const ci_keywords = kw_arg->keywords;
VALUE * const ci_kws = ALLOCA_N(VALUE, ci_kw_len);
MEMCPY(ci_kws, argv + lead_num, VALUE, ci_kw_len);
VALUE *const klocals = argv + kw_param->bits_start - kw_param->num;
args_setup_kw_parameters(ec, iseq, ci_kws, ci_kw_len, ci_keywords, klocals);
CC_SET_FASTPATH(cc, vm_call_iseq_setup_kwparm_kwarg,
vm_call_cacheable(ci, cc));
return 0;
}
}
else if (argc == lead_num) {
/* no kwarg */
VALUE *const klocals = argv + kw_param->bits_start - kw_param->num;
args_setup_kw_parameters(ec, iseq, NULL, 0, NULL, klocals);
if (klocals[kw_param->num] == INT2FIX(0)) {
/* copy from default_values */
CC_SET_FASTPATH(cc, vm_call_iseq_setup_kwparm_nokwarg,
vm_call_cacheable(ci, cc));
}
return 0;
}
}
}
// Called iseq is using ... param
// def foo(...) # <- iseq for foo will have "forwardable"
//
// We want to set the `...` local to the caller's CI
// foo(1, 2) # <- the ci for this should end up as `...`
//
// So hopefully the stack looks like:
//
// => 1
// => 2
// => *
// => **
// => &
// => ... # <- points at `foo`s CI
// => cref_or_me
// => specval
// => type
//
if (ISEQ_BODY(iseq)->param.flags.forwardable) {
bool can_fastpath = true;
if ((vm_ci_flag(ci) & VM_CALL_FORWARDING)) {
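// This call site itself forwards `...`: reuse its caller's CI when the
// argument counts match, otherwise build a fresh runtime CI.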
struct rb_forwarding_call_data * forward_cd = (struct rb_forwarding_call_data *)calling->cd;
if (vm_ci_argc(ci) != vm_ci_argc(forward_cd->caller_ci)) {
ci = vm_ci_new_runtime(
vm_ci_mid(ci),
vm_ci_flag(ci),
vm_ci_argc(ci),
vm_ci_kwarg(ci));
} else {
ci = forward_cd->caller_ci;
}
can_fastpath = false;
}
    // C functions calling iseqs will stack-allocate a CI,
    // so we need to convert it to a heap-allocated one.
if (!vm_ci_markable(ci)) {
ci = vm_ci_new_runtime(
vm_ci_mid(ci),
vm_ci_flag(ci),
vm_ci_argc(ci),
vm_ci_kwarg(ci));
can_fastpath = false;
}
argv[param_size - 1] = (VALUE)ci;
CC_SET_FASTPATH(cc, vm_call_iseq_forwardable, can_fastpath);
return 0;
}
return setup_parameters_complex(ec, iseq, calling, ci, argv, arg_setup_method);
}
static void
vm_adjust_stack_forwarding(const struct rb_execution_context_struct *ec, struct rb_control_frame_struct *cfp, int argc, VALUE splat)
{
    // This case is when the caller is itself using a `...` parameter,
    // for example `bar(...)`. The call info will have VM_CALL_FORWARDING set,
    // and the caller's caller's CI will be on the stack.
//
// For example:
//
// def bar(a, b); a + b; end
// def foo(...); bar(...); end
// foo(1, 2) # <- this CI will be on the stack when we call `bar(...)`
//
// Stack layout will be:
//
// > 1
// > 2
// > CI for foo(1, 2)
// > cref_or_me
// > specval
// > type
// > receiver
// > CI for foo(1, 2), via `getlocal ...`
// > ( SP points here )
const VALUE * lep = VM_CF_LEP(cfp);
const rb_iseq_t *iseq;
// If we're in an escaped environment (lambda for example), get the iseq
// from the captured env.
if (VM_ENV_FLAGS(lep, VM_ENV_FLAG_ESCAPED)) {
rb_env_t * env = (rb_env_t *)lep[VM_ENV_DATA_INDEX_ENV];
iseq = env->iseq;
}
else { // Otherwise use the lep to find the caller
iseq = rb_vm_search_cf_from_ep(ec, cfp, lep)->iseq;
}
// Our local storage is below the args we need to copy
int local_size = ISEQ_BODY(iseq)->local_table_size + argc;
const VALUE * from = lep - (local_size + VM_ENV_DATA_SIZE - 1); // 2 for EP values
VALUE * to = cfp->sp - 1; // clobber the CI
if (RTEST(splat)) {
to -= 1; // clobber the splat array
CHECK_VM_STACK_OVERFLOW0(cfp, to, RARRAY_LEN(splat));
MEMCPY(to, RARRAY_CONST_PTR(splat), VALUE, RARRAY_LEN(splat));
to += RARRAY_LEN(splat);
}
CHECK_VM_STACK_OVERFLOW0(cfp, to, argc);
MEMCPY(to, from, VALUE, argc);
cfp->sp = to + argc;
// Stack layout should now be:
//
// > 1
// > 2
// > CI for foo(1, 2)
// > cref_or_me
// > specval
// > type
// > receiver
// > 1
// > 2
// > ( SP points here )
}
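// Standard (non-forwardable) iseq method call: set up the arguments in place
// on the stack, then push the callee frame via vm_call_iseq_setup_2.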
static VALUE
vm_call_iseq_setup(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_iseq_setup);
const struct rb_callcache *cc = calling->cc;
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
int param_size = ISEQ_BODY(iseq)->param.size;
int local_size = ISEQ_BODY(iseq)->local_table_size;
RUBY_ASSERT(!ISEQ_BODY(iseq)->param.flags.forwardable);
const int opt_pc = vm_callee_setup_arg(ec, calling, iseq, cfp->sp - calling->argc, param_size, local_size);
return vm_call_iseq_setup_2(ec, cfp, calling, opt_pc, param_size, local_size);
}
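// Variant of vm_call_iseq_setup for "forwardable" iseqs, i.e. methods declared
// with a `...` parameter. Their frame also keeps the incoming arguments, so the
// param/local sizes are grown by the call site's argc before the usual
// argument setup runs.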
static VALUE
vm_call_iseq_fwd_setup(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_iseq_setup);
const struct rb_callcache *cc = calling->cc;
const rb_iseq_t *iseq = def_iseq_ptr(vm_cc_cme(cc)->def);
int param_size = ISEQ_BODY(iseq)->param.size;
int local_size = ISEQ_BODY(iseq)->local_table_size;
RUBY_ASSERT(ISEQ_BODY(iseq)->param.flags.forwardable);
    // Grow the local and param sizes by the call site's argc: the incoming
    // arguments stay on this frame in addition to the declared locals.
local_size = local_size + vm_ci_argc(calling->cd->ci);
param_size = param_size + vm_ci_argc(calling->cd->ci);
const int opt_pc = vm_callee_setup_arg(ec, calling, iseq, cfp->sp - calling->argc, param_size, local_size);
return vm_call_iseq_setup_2(ec, cfp, calling, opt_pc, param_size, local_size);
}
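// Common tail of the iseq setup paths above: push a normal frame, or take the
// tailcall path when the call site has VM_CALL_TAILCALL set.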
static inline VALUE
vm_call_iseq_setup_2(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling,
int opt_pc, int param_size, int local_size)
{
const struct rb_callinfo *ci = calling->cd->ci;
const struct rb_callcache *cc = calling->cc;
if (LIKELY(!(vm_ci_flag(ci) & VM_CALL_TAILCALL))) {
return vm_call_iseq_setup_normal(ec, cfp, calling, vm_cc_cme(cc), opt_pc, param_size, local_size);
}
else {
return vm_call_iseq_setup_tailcall(ec, cfp, calling, opt_pc);
}
}
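// Push a method frame for an iseq call. argv already holds the prepared
// arguments; sp is placed just past the parameters and vm_push_frame
// initializes the remaining (local_size - param_size) locals to nil.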
static inline VALUE
vm_call_iseq_setup_normal(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling, const rb_callable_method_entry_t *me,
int opt_pc, int param_size, int local_size)
{
const rb_iseq_t *iseq = def_iseq_ptr(me->def);
VALUE *argv = cfp->sp - calling->argc;
VALUE *sp = argv + param_size;
cfp->sp = argv - 1 /* recv */;
vm_push_frame(ec, iseq, VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL, calling->recv,
calling->block_handler, (VALUE)me,
ISEQ_BODY(iseq)->iseq_encoded + opt_pc, sp,
local_size - param_size,
ISEQ_BODY(iseq)->stack_max);
return Qundef;
}
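// Tailcall variant: pop the current frame and reuse its stack space, copying
// the receiver and arguments down to the caller's sp before pushing the callee
// frame, so a chain of tailcalls does not grow the VM stack. Tailcall call
// sites are only emitted when the iseq was compiled with the (experimental)
// tailcall option; one way to enable it from Ruby (exact options may vary by
// version):
//
//   RubyVM::InstructionSequence.compile(src, nil, nil, 1,
//     tailcall_optimization: true, trace_instruction: false)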
static inline VALUE
vm_call_iseq_setup_tailcall(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling, int opt_pc)
{
const struct rb_callcache *cc = calling->cc;
unsigned int i;
VALUE *argv = cfp->sp - calling->argc;
const rb_callable_method_entry_t *me = vm_cc_cme(cc);
const rb_iseq_t *iseq = def_iseq_ptr(me->def);
VALUE *src_argv = argv;
VALUE *sp_orig, *sp;
VALUE finish_flag = VM_FRAME_FINISHED_P(cfp) ? VM_FRAME_FLAG_FINISH : 0;
if (VM_BH_FROM_CFP_P(calling->block_handler, cfp)) {
struct rb_captured_block *dst_captured = VM_CFP_TO_CAPTURED_BLOCK(RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp));
const struct rb_captured_block *src_captured = VM_BH_TO_CAPT_BLOCK(calling->block_handler);
dst_captured->code.val = src_captured->code.val;
if (VM_BH_ISEQ_BLOCK_P(calling->block_handler)) {
calling->block_handler = VM_BH_FROM_ISEQ_BLOCK(dst_captured);
}
else {
calling->block_handler = VM_BH_FROM_IFUNC_BLOCK(dst_captured);
}
}
vm_pop_frame(ec, cfp, cfp->ep);
cfp = ec->cfp;
sp_orig = sp = cfp->sp;
/* push self */
sp[0] = calling->recv;
sp++;
/* copy arguments */
for (i=0; i < ISEQ_BODY(iseq)->param.size; i++) {
*sp++ = src_argv[i];
}
vm_push_frame(ec, iseq, VM_FRAME_MAGIC_METHOD | VM_ENV_FLAG_LOCAL | finish_flag,
calling->recv, calling->block_handler, (VALUE)me,
ISEQ_BODY(iseq)->iseq_encoded + opt_pc, sp,
ISEQ_BODY(iseq)->local_table_size - ISEQ_BODY(iseq)->param.size,
ISEQ_BODY(iseq)->stack_max);
cfp->sp = sp_orig;
return Qundef;
}
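// Raise Ractor::UnsafeError when a ractor-unsafe C method is invoked from a
// ractor other than the main one.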
static void
ractor_unsafe_check(void)
{
if (!rb_ractor_main_p()) {
rb_raise(rb_eRactorUnsafeError, "ractor unsafe method called from not main ractor");
}
}
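// The call_cfunc_* helpers below invoke a C-implemented method by casting the
// stored function pointer to the signature matching the method's declared
// arity: -2 receives (recv, args_as_Array), -1 receives (argc, argv, recv),
// and 0..15 receive the receiver plus that many positional arguments. For
// example, a method registered with rb_define_method(klass, "name", func, 2)
// is dispatched through call_cfunc_2. Each helper first rejects calls from
// non-main ractors via ractor_unsafe_check.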
static VALUE
call_cfunc_m2(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE) = (VALUE(*)(VALUE, VALUE))func;
return (*f)(recv, rb_ary_new4(argc, argv));
}
static VALUE
call_cfunc_m1(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(int, const VALUE *, VALUE) = (VALUE(*)(int, const VALUE *, VALUE))func;
return (*f)(argc, argv, recv);
}
static VALUE
call_cfunc_0(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE) = (VALUE(*)(VALUE))func;
return (*f)(recv);
}
static VALUE
call_cfunc_1(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE) = (VALUE(*)(VALUE, VALUE))func;
return (*f)(recv, argv[0]);
}
static VALUE
call_cfunc_2(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1]);
}
static VALUE
call_cfunc_3(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2]);
}
static VALUE
call_cfunc_4(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3]);
}
static VALUE
call_cfunc_5(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4]);
}
static VALUE
call_cfunc_6(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5]);
}
static VALUE
call_cfunc_7(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
}
static VALUE
call_cfunc_8(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]);
}
static VALUE
call_cfunc_9(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]);
}
static VALUE
call_cfunc_10(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9]);
}
static VALUE
call_cfunc_11(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10]);
}
static VALUE
call_cfunc_12(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11]);
}
static VALUE
call_cfunc_13(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12]);
}
static VALUE
call_cfunc_14(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13]);
}
static VALUE
call_cfunc_15(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
ractor_unsafe_check();
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13], argv[14]);
}
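// The ractor_safe_call_cfunc_* helpers mirror the ones above but skip the
// main-ractor check; they are used when the C method has been marked
// ractor-safe (e.g. via rb_ext_ractor_safe).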
static VALUE
ractor_safe_call_cfunc_m2(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE) = (VALUE(*)(VALUE, VALUE))func;
return (*f)(recv, rb_ary_new4(argc, argv));
}
static VALUE
ractor_safe_call_cfunc_m1(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(int, const VALUE *, VALUE) = (VALUE(*)(int, const VALUE *, VALUE))func;
return (*f)(argc, argv, recv);
}
static VALUE
ractor_safe_call_cfunc_0(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE) = (VALUE(*)(VALUE))func;
return (*f)(recv);
}
static VALUE
ractor_safe_call_cfunc_1(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE) = (VALUE(*)(VALUE, VALUE))func;
return (*f)(recv, argv[0]);
}
static VALUE
ractor_safe_call_cfunc_2(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1]);
}
static VALUE
ractor_safe_call_cfunc_3(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2]);
}
static VALUE
ractor_safe_call_cfunc_4(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3]);
}
static VALUE
ractor_safe_call_cfunc_5(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4]);
}
static VALUE
ractor_safe_call_cfunc_6(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5]);
}
static VALUE
ractor_safe_call_cfunc_7(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
}
static VALUE
ractor_safe_call_cfunc_8(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]);
}
static VALUE
ractor_safe_call_cfunc_9(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]);
}
static VALUE
ractor_safe_call_cfunc_10(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9]);
}
static VALUE
ractor_safe_call_cfunc_11(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10]);
}
static VALUE
ractor_safe_call_cfunc_12(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11]);
}
static VALUE
ractor_safe_call_cfunc_13(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12]);
}
static VALUE
ractor_safe_call_cfunc_14(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13]);
}
static VALUE
ractor_safe_call_cfunc_15(VALUE recv, int argc, const VALUE *argv, VALUE (*func)(ANYARGS))
{
VALUE(*f)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE) = (VALUE(*)(VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE, VALUE))func;
return (*f)(recv, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13], argv[14]);
}
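// Check that the frame pushed for a cfunc is still the top of the control
// frame stack (reg_cfp == ec->cfp + 1). A mismatch is tolerated only while a
// stack-overflow raise is pending, in which case the flag is cleared here.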
static inline int
vm_cfp_consistent_p(rb_execution_context_t *ec, const rb_control_frame_t *reg_cfp)
{
const int ov_flags = RAISED_STACKOVERFLOW;
if (LIKELY(reg_cfp == ec->cfp + 1)) return TRUE;
if (rb_ec_raised_p(ec, ov_flags)) {
rb_ec_raised_reset(ec, ov_flags);
return TRUE;
}
return FALSE;
}
#define CHECK_CFP_CONSISTENCY(func) \
(LIKELY(vm_cfp_consistent_p(ec, reg_cfp)) ? (void)0 : \
rb_bug(func ": cfp consistency error (%p, %p)", (void *)reg_cfp, (void *)(ec->cfp+1)))
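// Return the cfunc body of a callable method entry. With
// VM_DEBUG_VERIFY_METHOD_CACHE, assert first that the entry really is a
// CFUNC (or NOTIMPLEMENTED) definition.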
static inline
const rb_method_cfunc_t *
vm_method_cfunc_entry(const rb_callable_method_entry_t *me)
{
#if VM_DEBUG_VERIFY_METHOD_CACHE
switch (me->def->type) {
case VM_METHOD_TYPE_CFUNC:
case VM_METHOD_TYPE_NOTIMPLEMENTED:
break;
# define METHOD_BUG(t) case VM_METHOD_TYPE_##t: rb_bug("wrong method type: " #t)
METHOD_BUG(ISEQ);
METHOD_BUG(ATTRSET);
METHOD_BUG(IVAR);
METHOD_BUG(BMETHOD);
METHOD_BUG(ZSUPER);
METHOD_BUG(UNDEF);
METHOD_BUG(OPTIMIZED);
METHOD_BUG(MISSING);
METHOD_BUG(REFINED);
METHOD_BUG(ALIAS);
# undef METHOD_BUG
default:
rb_bug("wrong method type: %d", me->def->type);
}
#endif
return UNALIGNED_MEMBER_PTR(me->def, body.cfunc);
}
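// Common tail of every cfunc call path: fire the C_CALL hooks, push a CFUNC
// frame, check arity, invoke the function through cfunc->invoker, then pop
// the frame and fire the C_RETURN hooks. argv and stack_bottom are prepared
// by the callers below.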
static VALUE
vm_call_cfunc_with_frame_(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling,
int argc, VALUE *argv, VALUE *stack_bottom)
{
RB_DEBUG_COUNTER_INC(ccf_cfunc_with_frame);
const struct rb_callinfo *ci = calling->cd->ci;
const struct rb_callcache *cc = calling->cc;
VALUE val;
const rb_callable_method_entry_t *me = vm_cc_cme(cc);
const rb_method_cfunc_t *cfunc = vm_method_cfunc_entry(me);
VALUE recv = calling->recv;
VALUE block_handler = calling->block_handler;
VALUE frame_type = VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL;
if (UNLIKELY(calling->kw_splat)) {
frame_type |= VM_FRAME_FLAG_CFRAME_KW;
}
VM_ASSERT(reg_cfp == ec->cfp);
RUBY_DTRACE_CMETHOD_ENTRY_HOOK(ec, me->owner, me->def->original_id);
EXEC_EVENT_HOOK(ec, RUBY_EVENT_C_CALL, recv, me->def->original_id, vm_ci_mid(ci), me->owner, Qundef);
vm_push_frame(ec, NULL, frame_type, recv,
block_handler, (VALUE)me,
0, ec->cfp->sp, 0, 0);
int len = cfunc->argc;
if (len >= 0) rb_check_arity(argc, len, len);
reg_cfp->sp = stack_bottom;
val = (*cfunc->invoker)(recv, argc, argv, cfunc->func);
CHECK_CFP_CONSISTENCY("vm_call_cfunc");
rb_vm_pop_frame(ec);
VM_ASSERT(ec->cfp->sp == stack_bottom);
EXEC_EVENT_HOOK(ec, RUBY_EVENT_C_RETURN, recv, me->def->original_id, vm_ci_mid(ci), me->owner, val);
RUBY_DTRACE_CMETHOD_RETURN_HOOK(ec, me->owner, me->def->original_id);
return val;
}
// Push a C method frame for a given cme. This is called when JIT code skipped
// pushing a frame but the C method reached a point where a frame is needed.
void
rb_vm_push_cfunc_frame(const rb_callable_method_entry_t *cme, int recv_idx)
{
VM_ASSERT(cme->def->type == VM_METHOD_TYPE_CFUNC);
rb_execution_context_t *ec = GET_EC();
VALUE *sp = ec->cfp->sp;
VALUE recv = *(sp - recv_idx - 1);
VALUE frame_type = VM_FRAME_MAGIC_CFUNC | VM_FRAME_FLAG_CFRAME | VM_ENV_FLAG_LOCAL;
VALUE block_handler = VM_BLOCK_HANDLER_NONE;
#if VM_CHECK_MODE > 0
// Clean up the stack canary since we're about to satisfy the "leaf or lazy push" assumption
*(GET_EC()->cfp->sp) = Qfalse;
#endif
vm_push_frame(ec, NULL, frame_type, recv, block_handler, (VALUE)cme, 0, ec->cfp->sp, 0, 0);
}
// If true, cc->call needs to include `CALLER_SETUP_ARG` (i.e. can't be skipped in fastpath)
bool
rb_splat_or_kwargs_p(const struct rb_callinfo *restrict ci)
{
return IS_ARGS_SPLAT(ci) || IS_ARGS_KW_OR_KW_SPLAT(ci);
}
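// Fast path cached once a call site is known to need no splat or keyword
// handling: the arguments already sit on the VM stack right above the
// receiver slot.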
static VALUE
vm_call_cfunc_with_frame(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
int argc = calling->argc;
VALUE *stack_bottom = reg_cfp->sp - argc - 1;
VALUE *argv = &stack_bottom[1];
return vm_call_cfunc_with_frame_(ec, reg_cfp, calling, argc, argv, stack_bottom);
}
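// Generic cfunc path: run CALLER_SETUP_ARG to expand splats and keywords.
// A large splat arrives as a hidden heap array in calling->heap_argv;
// otherwise the with-frame fast path can be cached for simple call sites.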
static VALUE
vm_call_cfunc_other(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
const struct rb_callinfo *ci = calling->cd->ci;
RB_DEBUG_COUNTER_INC(ccf_cfunc_other);
CALLER_SETUP_ARG(reg_cfp, calling, ci, ALLOW_HEAP_ARGV_KEEP_KWSPLAT);
VALUE argv_ary;
if (UNLIKELY(argv_ary = calling->heap_argv)) {
VM_ASSERT(!IS_ARGS_KEYWORD(ci));
int argc = RARRAY_LENINT(argv_ary);
VALUE *argv = (VALUE *)RARRAY_CONST_PTR(argv_ary);
VALUE *stack_bottom = reg_cfp->sp - 2;
VM_ASSERT(calling->argc == 1);
VM_ASSERT(RB_TYPE_P(argv_ary, T_ARRAY));
VM_ASSERT(RBASIC_CLASS(argv_ary) == 0); // hidden ary
return vm_call_cfunc_with_frame_(ec, reg_cfp, calling, argc, argv, stack_bottom);
}
else {
CC_SET_FASTPATH(calling->cc, vm_call_cfunc_with_frame, !rb_splat_or_kwargs_p(ci) && !calling->kw_splat && !(vm_ci_flag(ci) & VM_CALL_FORWARDING));
return vm_call_cfunc_with_frame(ec, reg_cfp, calling);
}
}
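// Spread the elements of a splatted array (sitting on the VM stack) in place
// of the array itself so the plain with-frame path can be used. Arrays larger
// than VM_ARGC_STACK_MAX fall back to vm_call_cfunc_other.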
static inline VALUE
vm_call_cfunc_array_argv(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling, int stack_offset, int argc_offset)
{
VALUE argv_ary = reg_cfp->sp[-1 - stack_offset];
int argc = RARRAY_LENINT(argv_ary) - argc_offset;
if (UNLIKELY(argc > VM_ARGC_STACK_MAX)) {
return vm_call_cfunc_other(ec, reg_cfp, calling);
}
VALUE *argv = (VALUE *)RARRAY_CONST_PTR(argv_ary);
calling->kw_splat = 0;
int i;
VALUE *stack_bottom = reg_cfp->sp - 2 - stack_offset;
VALUE *sp = stack_bottom;
CHECK_VM_STACK_OVERFLOW(reg_cfp, argc);
for (i = 0; i < argc; i++) {
*++sp = argv[i];
}
reg_cfp->sp = sp+1;
return vm_call_cfunc_with_frame_(ec, reg_cfp, calling, argc, stack_bottom+1, stack_bottom);
}
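// Specialized handler for `f(*a)`: pass the array's elements directly,
// dropping a trailing empty keyword hash, and defer to the generic path when
// the trailing hash carries real keywords.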
static inline VALUE
vm_call_cfunc_only_splat(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_cfunc_only_splat);
VALUE argv_ary = reg_cfp->sp[-1];
int argc = RARRAY_LENINT(argv_ary);
VALUE *argv = (VALUE *)RARRAY_CONST_PTR(argv_ary);
VALUE last_hash;
int argc_offset = 0;
if (UNLIKELY(argc > 0 &&
RB_TYPE_P((last_hash = argv[argc-1]), T_HASH) &&
(((struct RHash *)last_hash)->basic.flags & RHASH_PASS_AS_KEYWORDS))) {
if (!RHASH_EMPTY_P(last_hash)) {
return vm_call_cfunc_other(ec, reg_cfp, calling);
}
argc_offset++;
}
return vm_call_cfunc_array_argv(ec, reg_cfp, calling, 0, argc_offset);
}
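// Specialized handler for `f(*a, **kw)`: usable only when the keyword splat
// is nil or an empty hash, in which case it degenerates to the plain splat
// case.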
static inline VALUE
vm_call_cfunc_only_splat_kw(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_cfunc_only_splat_kw);
VALUE keyword_hash = reg_cfp->sp[-1];
if (keyword_hash == Qnil || (RB_TYPE_P(keyword_hash, T_HASH) && RHASH_EMPTY_P(keyword_hash))) {
return vm_call_cfunc_array_argv(ec, reg_cfp, calling, 1, 0);
}
return vm_call_cfunc_other(ec, reg_cfp, calling);
}
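// Entry point for calling a C-implemented method: pick a specialized splat
// handler when the call shape allows it, otherwise the generic path, and
// record the choice in the call cache.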
static VALUE
vm_call_cfunc(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
const struct rb_callinfo *ci = calling->cd->ci;
RB_DEBUG_COUNTER_INC(ccf_cfunc);
if (IS_ARGS_SPLAT(ci) && !(vm_ci_flag(ci) & VM_CALL_FORWARDING)) {
if (!IS_ARGS_KW_SPLAT(ci) && vm_ci_argc(ci) == 1) {
// f(*a)
CC_SET_FASTPATH(calling->cc, vm_call_cfunc_only_splat, TRUE);
return vm_call_cfunc_only_splat(ec, reg_cfp, calling);
}
if (IS_ARGS_KW_SPLAT(ci) && vm_ci_argc(ci) == 2) {
// f(*a, **kw)
CC_SET_FASTPATH(calling->cc, vm_call_cfunc_only_splat_kw, TRUE);
return vm_call_cfunc_only_splat_kw(ec, reg_cfp, calling);
}
}
CC_SET_FASTPATH(calling->cc, vm_call_cfunc_other, TRUE);
return vm_call_cfunc_other(ec, reg_cfp, calling);
}
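// Attribute reader: drop the receiver from the stack and return its instance
// variable, using the index cached in the call cache when possible.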
static VALUE
vm_call_ivar(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
const struct rb_callcache *cc = calling->cc;
RB_DEBUG_COUNTER_INC(ccf_ivar);
cfp->sp -= 1;
VALUE ivar = vm_getivar(calling->recv, vm_cc_cme(cc)->def->body.attr.id, NULL, NULL, cc, TRUE, Qnil);
return ivar;
}
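// Attribute writer: pop the value and receiver, then store the instance
// variable using the shape and index cached in the call cache, falling back
// to the generic-ivar and slow paths when the cached write misses.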
static VALUE
vm_call_attrset_direct(rb_execution_context_t *ec, rb_control_frame_t *cfp, const struct rb_callcache *cc, VALUE obj)
{
RB_DEBUG_COUNTER_INC(ccf_attrset);
VALUE val = *(cfp->sp - 1);
cfp->sp -= 2;
attr_index_t index = vm_cc_attr_index(cc);
shape_id_t dest_shape_id = vm_cc_attr_index_dest_shape_id(cc);
ID id = vm_cc_cme(cc)->def->body.attr.id;
rb_check_frozen(obj);
VALUE res = vm_setivar(obj, id, val, dest_shape_id, index);
if (UNDEF_P(res)) {
switch (BUILTIN_TYPE(obj)) {
case T_OBJECT:
case T_CLASS:
case T_MODULE:
break;
default:
{
res = vm_setivar_default(obj, id, val, dest_shape_id, index);
if (!UNDEF_P(res)) {
return res;
}
}
}
res = vm_setivar_slowpath_attr(obj, id, val, cc);
}
return res;
}
static VALUE
vm_call_attrset(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
return vm_call_attrset_direct(ec, cfp, calling->cc, calling->recv);
}
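// Invoke the Proc behind a bmethod (a method defined with define_method),
// refusing to run an unshareable Proc that was defined in a different Ractor.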
static inline VALUE
vm_call_bmethod_body(rb_execution_context_t *ec, struct rb_calling_info *calling, const VALUE *argv)
{
rb_proc_t *proc;
VALUE val;
const struct rb_callcache *cc = calling->cc;
const rb_callable_method_entry_t *cme = vm_cc_cme(cc);
VALUE procv = cme->def->body.bmethod.proc;
if (!RB_OBJ_SHAREABLE_P(procv) &&
cme->def->body.bmethod.defined_ractor != rb_ractor_self(rb_ec_ractor_ptr(ec))) {
rb_raise(rb_eRuntimeError, "defined with an un-shareable Proc in a different Ractor");
}
/* control block frame */
GetProcPtr(procv, proc);
val = vm_invoke_bmethod(ec, proc, calling->recv, CALLING_ARGC(calling), argv, calling->kw_splat, calling->block_handler, vm_cc_cme(cc));
return val;
}
static int vm_callee_setup_block_arg(rb_execution_context_t *ec, struct rb_calling_info *calling, const struct rb_callinfo *ci, const rb_iseq_t *iseq, VALUE *argv, const enum arg_setup_type arg_setup_type);
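// Optimized call for bmethods whose Proc is backed by an iseq block: the
// arguments are set up in place on the VM stack and the block frame is pushed
// directly, so no C-level argument copying is needed. Returning Qundef lets
// the interpreter execute the pushed frame.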
static VALUE
vm_call_iseq_bmethod(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_iseq_bmethod);
const struct rb_callcache *cc = calling->cc;
const rb_callable_method_entry_t *cme = vm_cc_cme(cc);
VALUE procv = cme->def->body.bmethod.proc;
if (!RB_OBJ_SHAREABLE_P(procv) &&
cme->def->body.bmethod.defined_ractor != rb_ractor_self(rb_ec_ractor_ptr(ec))) {
rb_raise(rb_eRuntimeError, "defined with an un-shareable Proc in a different Ractor");
}
rb_proc_t *proc;
GetProcPtr(procv, proc);
const struct rb_block *block = &proc->block;
while (vm_block_type(block) == block_type_proc) {
block = vm_proc_block(block->as.proc);
}
VM_ASSERT(vm_block_type(block) == block_type_iseq);
const struct rb_captured_block *captured = &block->as.captured;
const rb_iseq_t *iseq = rb_iseq_check(captured->code.iseq);
VALUE * const argv = cfp->sp - calling->argc;
const int arg_size = ISEQ_BODY(iseq)->param.size;
int opt_pc;
if (vm_ci_flag(calling->cd->ci) & VM_CALL_ARGS_SIMPLE) {
opt_pc = vm_callee_setup_block_arg(ec, calling, calling->cd->ci, iseq, argv, arg_setup_method);
}
else {
opt_pc = setup_parameters_complex(ec, iseq, calling, calling->cd->ci, argv, arg_setup_method);
}
cfp->sp = argv - 1; // -1 for the receiver
vm_push_frame(ec, iseq,
VM_FRAME_MAGIC_BLOCK | VM_FRAME_FLAG_BMETHOD | VM_FRAME_FLAG_LAMBDA,
calling->recv,
VM_GUARDED_PREV_EP(captured->ep),
(VALUE)cme,
ISEQ_BODY(iseq)->iseq_encoded + opt_pc,
argv + arg_size,
ISEQ_BODY(iseq)->local_table_size - arg_size,
ISEQ_BODY(iseq)->stack_max);
return Qundef;
}
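// Generic bmethod call for Procs not backed by an iseq block (symbol or
// ifunc blocks): gather argv from the heap array or copy it off the VM
// stack, then go through vm_call_bmethod_body.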
static VALUE
vm_call_noniseq_bmethod(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_noniseq_bmethod);
VALUE *argv;
int argc;
CALLER_SETUP_ARG(cfp, calling, calling->cd->ci, ALLOW_HEAP_ARGV);
if (UNLIKELY(calling->heap_argv)) {
argv = RARRAY_PTR(calling->heap_argv);
cfp->sp -= 2;
}
else {
argc = calling->argc;
argv = ALLOCA_N(VALUE, argc);
MEMCPY(argv, cfp->sp - argc, VALUE, argc);
cfp->sp += - argc - 1;
}
return vm_call_bmethod_body(ec, calling, argv);
}
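/* Dispatcher for bmethods (methods created with define_method, e.g.
 * `define_method(:foo) { |x| x }`). The proc wrapper is unwrapped once here,
 * and a type-specific fastpath is installed so later calls go directly to the
 * iseq or non-iseq variant. */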
static VALUE
vm_call_bmethod(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_bmethod);
const struct rb_callcache *cc = calling->cc;
const rb_callable_method_entry_t *cme = vm_cc_cme(cc);
VALUE procv = cme->def->body.bmethod.proc;
rb_proc_t *proc;
GetProcPtr(procv, proc);
const struct rb_block *block = &proc->block;
while (vm_block_type(block) == block_type_proc) {
block = vm_proc_block(block->as.proc);
}
if (vm_block_type(block) == block_type_iseq) {
CC_SET_FASTPATH(cc, vm_call_iseq_bmethod, TRUE);
return vm_call_iseq_bmethod(ec, cfp, calling);
}
CC_SET_FASTPATH(cc, vm_call_noniseq_bmethod, TRUE);
return vm_call_noniseq_bmethod(ec, cfp, calling);
}
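/* Walk the superclass chain of current_class and return the class (or
 * iclass) whose owner is target_owner. Falls back to current_class itself,
 * e.g. for module functions. */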
VALUE
rb_find_defined_class_by_owner(VALUE current_class, VALUE target_owner)
{
VALUE klass = current_class;
/* for a prepended Module, start from the class itself rather than its origin iclass */
if (RB_TYPE_P(klass, T_ICLASS) && FL_TEST(klass, RICLASS_IS_ORIGIN) &&
RB_TYPE_P(RBASIC_CLASS(klass), T_CLASS)) {
klass = RBASIC_CLASS(klass);
}
while (RTEST(klass)) {
VALUE owner = RB_TYPE_P(klass, T_ICLASS) ? RBASIC_CLASS(klass) : klass;
if (owner == target_owner) {
return klass;
}
klass = RCLASS_SUPER(klass);
}
return current_class; /* maybe module function */
}
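/* Resolve the original callable method entry behind an alias. If the original
 * entry has no defined_class (it came from a module), complement it against
 * the alias owner; the result is written back into the alias definition when
 * that definition is not shared (reference_count == 1), otherwise a fresh
 * alias definition is created for this entry. */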
static const rb_callable_method_entry_t *
aliased_callable_method_entry(const rb_callable_method_entry_t *me)
{
const rb_method_entry_t *orig_me = me->def->body.alias.original_me;
const rb_callable_method_entry_t *cme;
if (orig_me->defined_class == 0) {
VALUE defined_class = rb_find_defined_class_by_owner(me->defined_class, orig_me->owner);
VM_ASSERT_TYPE(orig_me->owner, T_MODULE);
cme = rb_method_entry_complement_defined_class(orig_me, me->called_id, defined_class);
if (me->def->reference_count == 1) {
RB_OBJ_WRITE(me, &me->def->body.alias.original_me, cme);
}
else {
rb_method_definition_t *def =
rb_method_definition_create(VM_METHOD_TYPE_ALIAS, me->def->original_id);
rb_method_definition_set((rb_method_entry_t *)me, def, (void *)cme);
}
}
else {
cme = (const rb_callable_method_entry_t *)orig_me;
}
VM_ASSERT(callable_method_entry_p(cme));
return cme;
}
const rb_callable_method_entry_t *
rb_aliased_callable_method_entry(const rb_callable_method_entry_t *me)
{
return aliased_callable_method_entry(me);
}
static VALUE
vm_call_alias(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
calling->cc = &VM_CC_ON_STACK(Qundef,
vm_call_general,
{{0}},
aliased_callable_method_entry(vm_cc_cme(calling->cc)));
return vm_call_method_each_type(ec, cfp, calling);
}
static enum method_missing_reason
ci_missing_reason(const struct rb_callinfo *ci)
{
enum method_missing_reason stat = MISSING_NOENTRY;
if (vm_ci_flag(ci) & VM_CALL_VCALL) stat |= MISSING_VCALL;
if (vm_ci_flag(ci) & VM_CALL_FCALL) stat |= MISSING_FCALL;
if (vm_ci_flag(ci) & VM_CALL_SUPER) stat |= MISSING_SUPER;
return stat;
}
static VALUE vm_call_method_missing(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling);
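/* Call a method named by a Symbol or String value, e.g. `obj.send(:foo, 1)`.
 * The name is converted to an ID only if it is already registered; otherwise
 * the call is redirected to method_missing so that no new Symbol is created
 * for a missing method (see [Feature #5112]). */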
static VALUE
vm_call_symbol(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci, VALUE symbol, int flags)
{
ASSUME(calling->argc >= 0);
enum method_missing_reason missing_reason = MISSING_NOENTRY;
int argc = calling->argc;
VALUE recv = calling->recv;
VALUE klass = CLASS_OF(recv);
ID mid = rb_check_id(&symbol);
flags |= VM_CALL_OPT_SEND;
if (UNLIKELY(! mid)) {
mid = idMethodMissing;
missing_reason = ci_missing_reason(ci);
ec->method_missing_reason = missing_reason;
VALUE argv_ary;
if (UNLIKELY(argv_ary = calling->heap_argv)) {
if (rb_method_basic_definition_p(klass, idMethodMissing)) {
rb_ary_unshift(argv_ary, symbol);
/* Inadvertent symbol creation shall be forbidden, see [Feature #5112] */
int priv = vm_ci_flag(ci) & (VM_CALL_FCALL | VM_CALL_VCALL);
VALUE exc = rb_make_no_method_exception(
rb_eNoMethodError, 0, recv, RARRAY_LENINT(argv_ary), RARRAY_CONST_PTR(argv_ary), priv);
rb_exc_raise(exc);
}
rb_ary_unshift(argv_ary, rb_str_intern(symbol));
}
else {
/* E.g. when argc == 2
*
* | | | | TOPN
* | | +------+
* | | +---> | arg1 | 0
* +------+ | +------+
* | arg1 | -+ +-> | arg0 | 1
* +------+ | +------+
* | arg0 | ---+ | sym | 2
* +------+ +------+
* | recv | | recv | 3
* --+------+--------+------+------
*/
int i = argc;
CHECK_VM_STACK_OVERFLOW(reg_cfp, 1);
INC_SP(1);
MEMMOVE(&TOPN(i - 1), &TOPN(i), VALUE, i);
argc = ++calling->argc;
if (rb_method_basic_definition_p(klass, idMethodMissing)) {
/* Inadvertent symbol creation shall be forbidden, see [Feature #5112] */
TOPN(i) = symbol;
int priv = vm_ci_flag(ci) & (VM_CALL_FCALL | VM_CALL_VCALL);
const VALUE *argv = STACK_ADDR_FROM_TOP(argc);
VALUE exc = rb_make_no_method_exception(
rb_eNoMethodError, 0, recv, argc, argv, priv);
rb_exc_raise(exc);
}
else {
TOPN(i) = rb_str_intern(symbol);
}
}
}
struct rb_forwarding_call_data new_fcd = {
.cd = {
.ci = &VM_CI_ON_STACK(mid, flags, argc, vm_ci_kwarg(ci)),
.cc = NULL,
},
.caller_ci = NULL,
};
if (!(vm_ci_flag(ci) & VM_CALL_FORWARDING)) {
calling->cd = &new_fcd.cd;
}
else {
const struct rb_callinfo *caller_ci = ((struct rb_forwarding_call_data *)calling->cd)->caller_ci;
VM_ASSERT((vm_ci_argc(caller_ci), 1));
new_fcd.caller_ci = caller_ci;
calling->cd = (struct rb_call_data *)&new_fcd;
}
calling->cc = &VM_CC_ON_STACK(klass,
vm_call_general,
{ .method_missing_reason = missing_reason },
rb_callable_method_entry_with_refinements(klass, mid, NULL));
if (flags & VM_CALL_FCALL) {
return vm_call_method(ec, reg_cfp, calling);
}
const struct rb_callcache *cc = calling->cc;
VM_ASSERT(callable_method_entry_p(vm_cc_cme(cc)));
if (vm_cc_cme(cc) != NULL) {
switch (METHOD_ENTRY_VISI(vm_cc_cme(cc))) {
case METHOD_VISI_PUBLIC: /* likely */
return vm_call_method_each_type(ec, reg_cfp, calling);
case METHOD_VISI_PRIVATE:
vm_cc_method_missing_reason_set(cc, MISSING_PRIVATE);
break;
case METHOD_VISI_PROTECTED:
vm_cc_method_missing_reason_set(cc, MISSING_PROTECTED);
break;
default:
VM_UNREACHABLE(vm_call_method);
}
return vm_call_method_missing(ec, reg_cfp, calling);
}
return vm_call_method_nome(ec, reg_cfp, calling);
}
static VALUE
vm_call_opt_send0(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling, int flags)
{
const struct rb_callinfo *ci = calling->cd->ci;
int i;
VALUE sym;
i = calling->argc - 1;
if (calling->argc == 0) {
rb_raise(rb_eArgError, "no method name given");
}
sym = TOPN(i);
/* E.g. when i == 2
*
* | | | | TOPN
* +------+ | |
* | arg1 | ---+ | | 0
* +------+ | +------+
* | arg0 | -+ +-> | arg1 | 1
* +------+ | +------+
* | sym | +---> | arg0 | 2
* +------+ +------+
* | recv | | recv | 3
* --+------+--------+------+------
*/
/* shift arguments */
if (i > 0) {
MEMMOVE(&TOPN(i), &TOPN(i-1), VALUE, i);
}
calling->argc -= 1;
DEC_SP(1);
return vm_call_symbol(ec, reg_cfp, calling, ci, sym, flags);
}
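/* Slow path for send: argument setup may spill a large splat into a heap
 * array (heap_argv). In that case the method name is shifted off that array
 * instead of the VM stack and the call is re-flagged as an args-splat call,
 * turning a trailing keyword hash into a pass-as-keywords hash if needed. */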
static VALUE
vm_call_opt_send_complex(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_opt_send_complex);
const struct rb_callinfo *ci = calling->cd->ci;
int flags = VM_CALL_FCALL;
VALUE sym;
VALUE argv_ary;
CALLER_SETUP_ARG(reg_cfp, calling, ci, ALLOW_HEAP_ARGV);
if (UNLIKELY(argv_ary = calling->heap_argv)) {
sym = rb_ary_shift(argv_ary);
flags |= VM_CALL_ARGS_SPLAT;
if (calling->kw_splat) {
VALUE last_hash = rb_ary_last(0, NULL, argv_ary);
((struct RHash *)last_hash)->basic.flags |= RHASH_PASS_AS_KEYWORDS;
calling->kw_splat = 0;
}
return vm_call_symbol(ec, reg_cfp, calling, ci, sym, flags);
}
if (calling->kw_splat) flags |= VM_CALL_KW_SPLAT;
return vm_call_opt_send0(ec, reg_cfp, calling, flags);
}
static VALUE
vm_call_opt_send_simple(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_opt_send_simple);
return vm_call_opt_send0(ec, reg_cfp, calling, vm_ci_flag(calling->cd->ci) | VM_CALL_FCALL);
}
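/* Entry point for the optimized send method. Forwarding calls and certain
 * splat/keyword-argument shapes take the complex path above; everything else
 * uses the simple path, which only shifts the method name off the stack. */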
static VALUE
vm_call_opt_send(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_opt_send);
const struct rb_callinfo *ci = calling->cd->ci;
int flags = vm_ci_flag(ci);
if (UNLIKELY((flags & VM_CALL_FORWARDING) || (!(flags & VM_CALL_ARGS_SIMPLE) &&
((calling->argc == 1 && (flags & (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT))) ||
(calling->argc == 2 && (flags & VM_CALL_ARGS_SPLAT) && (flags & VM_CALL_KW_SPLAT)) ||
((flags & VM_CALL_KWARG) && (vm_ci_kwarg(ci)->keyword_len == calling->argc)))))) {
CC_SET_FASTPATH(calling->cc, vm_call_opt_send_complex, TRUE);
return vm_call_opt_send_complex(ec, reg_cfp, calling);
}
CC_SET_FASTPATH(calling->cc, vm_call_opt_send_simple, TRUE);
return vm_call_opt_send_simple(ec, reg_cfp, calling);
}
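/* Rewrite the current call as recv.method_missing(:mid, ...): the original
 * method name is pushed in front of the arguments as a Symbol and the call is
 * re-dispatched through an on-stack call info/call cache. */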
static VALUE
vm_call_method_missing_body(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling,
const struct rb_callinfo *orig_ci, enum method_missing_reason reason)
{
RB_DEBUG_COUNTER_INC(ccf_method_missing);
VALUE *argv = STACK_ADDR_FROM_TOP(calling->argc);
unsigned int argc, flag;
flag = VM_CALL_FCALL | VM_CALL_OPT_SEND | vm_ci_flag(orig_ci);
argc = ++calling->argc;
/* shift arguments: m(a, b, c) #=> method_missing(:m, a, b, c) */
CHECK_VM_STACK_OVERFLOW(reg_cfp, 1);
vm_check_canary(ec, reg_cfp->sp);
if (argc > 1) {
MEMMOVE(argv+1, argv, VALUE, argc-1);
}
argv[0] = ID2SYM(vm_ci_mid(orig_ci));
INC_SP(1);
ec->method_missing_reason = reason;
struct rb_forwarding_call_data new_fcd = {
.cd = {
.ci = &VM_CI_ON_STACK(idMethodMissing, flag, argc, vm_ci_kwarg(orig_ci)),
.cc = NULL,
},
.caller_ci = NULL,
};
if (!(flag & VM_CALL_FORWARDING)) {
calling->cd = &new_fcd.cd;
}
else {
const struct rb_callinfo *caller_ci = ((struct rb_forwarding_call_data *)calling->cd)->caller_ci;
VM_ASSERT((vm_ci_argc(caller_ci), 1));
new_fcd.caller_ci = caller_ci;
calling->cd = (struct rb_call_data *)&new_fcd;
}
calling->cc = &VM_CC_ON_STACK(Qundef, vm_call_general, {{ 0 }},
rb_callable_method_entry_without_refinements(CLASS_OF(calling->recv), idMethodMissing, NULL));
return vm_call_method(ec, reg_cfp, calling);
}
static VALUE
vm_call_method_missing(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
return vm_call_method_missing_body(ec, reg_cfp, calling, calling->cd->ci, vm_cc_cmethod_missing_reason(calling->cc));
}
static const rb_callable_method_entry_t *refined_method_callable_without_refinement(const rb_callable_method_entry_t *me);
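/* Re-resolve the method starting at the superclass of klass (used for zsuper
 * method entries); a still-refined entry is unwrapped to its original method
 * before dispatch. */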
static VALUE
vm_call_zsuper(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling, VALUE klass)
{
klass = RCLASS_SUPER(klass);
const rb_callable_method_entry_t *cme = klass ? rb_callable_method_entry(klass, vm_ci_mid(calling->cd->ci)) : NULL;
if (cme == NULL) {
return vm_call_method_nome(ec, cfp, calling);
}
if (cme->def->type == VM_METHOD_TYPE_REFINED &&
cme->def->body.refined.orig_me) {
cme = refined_method_callable_without_refinement(cme);
}
calling->cc = &VM_CC_ON_STACK(Qundef, vm_call_general, {{ 0 }}, cme);
return vm_call_method_each_type(ec, cfp, calling);
}
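/* Look up the refinement module that refines klass in the given refinements
 * hash (taken from a cref); returns Qnil when klass is not refined there. */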
static inline VALUE
find_refinement(VALUE refinements, VALUE klass)
{
if (NIL_P(refinements)) {
return Qnil;
}
return rb_hash_lookup(refinements, klass);
}
PUREFUNC(static rb_control_frame_t * current_method_entry(const rb_execution_context_t *ec, rb_control_frame_t *cfp));
static rb_control_frame_t *
current_method_entry(const rb_execution_context_t *ec, rb_control_frame_t *cfp)
{
rb_control_frame_t *top_cfp = cfp;
if (cfp->iseq && ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_BLOCK) {
const rb_iseq_t *local_iseq = ISEQ_BODY(cfp->iseq)->local_iseq;
do {
cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp);
if (RUBY_VM_CONTROL_FRAME_STACK_OVERFLOW_P(ec, cfp)) {
/* TODO: orphan block */
return top_cfp;
}
} while (cfp->iseq != local_iseq);
}
return cfp;
}
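/* Strip the refinement wrapper from a refined method entry and return the
 * original callable entry, or NULL if the original method is undefined. */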
static const rb_callable_method_entry_t *
refined_method_callable_without_refinement(const rb_callable_method_entry_t *me)
{
const rb_method_entry_t *orig_me = me->def->body.refined.orig_me;
const rb_callable_method_entry_t *cme;
if (orig_me->defined_class == 0) {
cme = NULL;
rb_notimplement();
}
else {
cme = (const rb_callable_method_entry_t *)orig_me;
}
VM_ASSERT(callable_method_entry_p(cme));
if (UNDEFINED_METHOD_ENTRY_P(cme)) {
cme = NULL;
}
return cme;
}
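/* Scan the lexical cref chain for an active refinement of the callee's owner
 * that defines mid. For super calls, the refinement defining the currently
 * running method is skipped so it is not invoked again. When no refinement
 * applies, fall back to the refined class's original method or, failing that,
 * to the method found in its superclass. */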
static const rb_callable_method_entry_t *
search_refined_method(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
ID mid = vm_ci_mid(calling->cd->ci);
const rb_cref_t *cref = vm_get_cref(cfp->ep);
const struct rb_callcache * const cc = calling->cc;
const rb_callable_method_entry_t *cme = vm_cc_cme(cc);
for (; cref; cref = CREF_NEXT(cref)) {
const VALUE refinement = find_refinement(CREF_REFINEMENTS(cref), vm_cc_cme(cc)->owner);
if (NIL_P(refinement)) continue;
const rb_callable_method_entry_t *const ref_me =
rb_callable_method_entry(refinement, mid);
if (ref_me) {
if (vm_cc_call(cc) == vm_call_super_method) {
const rb_control_frame_t *top_cfp = current_method_entry(ec, cfp);
const rb_callable_method_entry_t *top_me = rb_vm_frame_method_entry(top_cfp);
if (top_me && rb_method_definition_eq(ref_me->def, top_me->def)) {
continue;
}
}
if (cme->def->type != VM_METHOD_TYPE_REFINED ||
cme->def != ref_me->def) {
cme = ref_me;
}
if (ref_me->def->type != VM_METHOD_TYPE_REFINED) {
return cme;
}
}
else {
return NULL;
}
}
if (vm_cc_cme(cc)->def->body.refined.orig_me) {
return refined_method_callable_without_refinement(vm_cc_cme(cc));
}
else {
VALUE klass = RCLASS_SUPER(vm_cc_cme(cc)->defined_class);
const rb_callable_method_entry_t *cme = klass ? rb_callable_method_entry(klass, mid) : NULL;
return cme;
}
}
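/* Dispatch through the refinement resolved by search_refined_method. When a
 * writable inline cache is available (cd->cc), a refinement call cache is
 * stored there so later calls skip the search; otherwise a temporary on-stack
 * call cache is used. */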
static VALUE
vm_call_refined(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
const rb_callable_method_entry_t *ref_cme = search_refined_method(ec, cfp, calling);
if (ref_cme) {
if (calling->cd->cc) {
const struct rb_callcache *cc = calling->cc = vm_cc_new(vm_cc_cme(calling->cc)->defined_class, ref_cme, vm_call_general, cc_type_refinement);
RB_OBJ_WRITE(cfp->iseq, &calling->cd->cc, cc);
return vm_call_method(ec, cfp, calling);
}
else {
struct rb_callcache *ref_cc = &VM_CC_ON_STACK(Qundef, vm_call_general, {{ 0 }}, ref_cme);
calling->cc = ref_cc;
return vm_call_method(ec, cfp, calling);
}
}
else {
return vm_call_method_nome(ec, cfp, calling);
}
}
static inline VALUE vm_invoke_block(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling, const struct rb_callinfo *ci, bool is_lambda, VALUE block_handler);
NOINLINE(static VALUE
vm_invoke_block_opt_call(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci, VALUE block_handler));
static VALUE
vm_invoke_block_opt_call(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci, VALUE block_handler)
{
int argc = calling->argc;
/* remove self */
if (argc > 0) MEMMOVE(&TOPN(argc), &TOPN(argc-1), VALUE, argc);
DEC_SP(1);
return vm_invoke_block(ec, reg_cfp, calling, ci, false, block_handler);
}
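/* Optimized Proc#call (e.g. `pr.call(x)`): reuses vm_invoke_block_opt_call
 * above, which drops the receiver (the Proc itself) from the stack and
 * invokes it as a block. */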
static VALUE
vm_call_opt_call(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_opt_call);
const struct rb_callinfo *ci = calling->cd->ci;
VALUE procval = calling->recv;
return vm_invoke_block_opt_call(ec, reg_cfp, calling, ci, VM_BH_FROM_PROC(procval));
}
static VALUE
vm_call_opt_block_call(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_opt_block_call);
VALUE block_handler = VM_ENV_BLOCK_HANDLER(VM_CF_LEP(reg_cfp));
const struct rb_callinfo *ci = calling->cd->ci;
if (BASIC_OP_UNREDEFINED_P(BOP_CALL, PROC_REDEFINED_OP_FLAG)) {
return vm_invoke_block_opt_call(ec, reg_cfp, calling, ci, block_handler);
}
else {
calling->recv = rb_vm_bh_to_procval(ec, block_handler);
calling->cc = rb_vm_search_method_slowpath(ci, CLASS_OF(calling->recv));
return vm_call_general(ec, reg_cfp, calling);
}
}
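/* Optimized Struct member reader (e.g. `point.x` for a Struct member): the
 * member index is recorded in the optimized method definition, so the read is
 * a direct RSTRUCT slot access. */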
static VALUE
vm_call_opt_struct_aref0(rb_execution_context_t *ec, struct rb_calling_info *calling)
{
VALUE recv = calling->recv;
VM_ASSERT(RB_TYPE_P(recv, T_STRUCT));
VM_ASSERT(vm_cc_cme(calling->cc)->def->type == VM_METHOD_TYPE_OPTIMIZED);
VM_ASSERT(vm_cc_cme(calling->cc)->def->body.optimized.type == OPTIMIZED_METHOD_TYPE_STRUCT_AREF);
const unsigned int off = vm_cc_cme(calling->cc)->def->body.optimized.index;
return internal_RSTRUCT_GET(recv, off);
}
static VALUE
vm_call_opt_struct_aref(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_opt_struct_aref);
VALUE ret = vm_call_opt_struct_aref0(ec, calling);
reg_cfp->sp -= 1;
return ret;
}
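/* Optimized Struct member writer: checks that the receiver is not frozen,
 * then stores the value into the recorded member slot. */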
static VALUE
vm_call_opt_struct_aset0(rb_execution_context_t *ec, struct rb_calling_info *calling, VALUE val)
{
VALUE recv = calling->recv;
VM_ASSERT(RB_TYPE_P(recv, T_STRUCT));
VM_ASSERT(vm_cc_cme(calling->cc)->def->type == VM_METHOD_TYPE_OPTIMIZED);
VM_ASSERT(vm_cc_cme(calling->cc)->def->body.optimized.type == OPTIMIZED_METHOD_TYPE_STRUCT_ASET);
rb_check_frozen(recv);
const unsigned int off = vm_cc_cme(calling->cc)->def->body.optimized.index;
internal_RSTRUCT_SET(recv, off, val);
return val;
}
static VALUE
vm_call_opt_struct_aset(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_opt_struct_aset);
VALUE ret = vm_call_opt_struct_aset0(ec, calling, *(reg_cfp->sp - 1));
reg_cfp->sp -= 2;
return ret;
}
NOINLINE(static VALUE vm_call_optimized(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling,
const struct rb_callinfo *ci, const struct rb_callcache *cc));
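/* Run an optimized attribute-style call. With C-call tracing enabled the call
 * is wrapped in c_call/c_return event hooks; otherwise the no-hook setup
 * (typically installing a fastpath) runs and the call is made directly. */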
#define VM_CALL_METHOD_ATTR(var, func, nohook) \
if (UNLIKELY(ruby_vm_event_flags & (RUBY_EVENT_C_CALL | RUBY_EVENT_C_RETURN))) { \
EXEC_EVENT_HOOK(ec, RUBY_EVENT_C_CALL, calling->recv, vm_cc_cme(cc)->def->original_id, \
vm_ci_mid(ci), vm_cc_cme(cc)->owner, Qundef); \
var = func; \
EXEC_EVENT_HOOK(ec, RUBY_EVENT_C_RETURN, calling->recv, vm_cc_cme(cc)->def->original_id, \
vm_ci_mid(ci), vm_cc_cme(cc)->owner, (var)); \
} \
else { \
nohook; \
var = func; \
}
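/* Dispatch VM_METHOD_TYPE_OPTIMIZED methods by their optimized sub-type:
 * send, Proc#call, block call, and Struct member reads/writes. */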
static VALUE
vm_call_optimized(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling,
const struct rb_callinfo *ci, const struct rb_callcache *cc)
{
switch (vm_cc_cme(cc)->def->body.optimized.type) {
case OPTIMIZED_METHOD_TYPE_SEND:
CC_SET_FASTPATH(cc, vm_call_opt_send, TRUE);
return vm_call_opt_send(ec, cfp, calling);
case OPTIMIZED_METHOD_TYPE_CALL:
CC_SET_FASTPATH(cc, vm_call_opt_call, TRUE);
return vm_call_opt_call(ec, cfp, calling);
case OPTIMIZED_METHOD_TYPE_BLOCK_CALL:
CC_SET_FASTPATH(cc, vm_call_opt_block_call, TRUE);
return vm_call_opt_block_call(ec, cfp, calling);
case OPTIMIZED_METHOD_TYPE_STRUCT_AREF: {
CALLER_SETUP_ARG(cfp, calling, ci, 0);
rb_check_arity(calling->argc, 0, 0);
VALUE v;
VM_CALL_METHOD_ATTR(v,
vm_call_opt_struct_aref(ec, cfp, calling),
set_vm_cc_ivar(cc); \
CC_SET_FASTPATH(cc, vm_call_opt_struct_aref, (vm_ci_flag(ci) & VM_CALL_ARGS_SIMPLE)))
return v;
}
case OPTIMIZED_METHOD_TYPE_STRUCT_ASET: {
CALLER_SETUP_ARG(cfp, calling, ci, 1);
rb_check_arity(calling->argc, 1, 1);
VALUE v;
VM_CALL_METHOD_ATTR(v,
vm_call_opt_struct_aset(ec, cfp, calling),
set_vm_cc_ivar(cc); \
CC_SET_FASTPATH(cc, vm_call_opt_struct_aset, (vm_ci_flag(ci) & VM_CALL_ARGS_SIMPLE)))
return v;
}
default:
rb_bug("vm_call_method: unsupported optimized method type (%d)", vm_cc_cme(cc)->def->body.optimized.type);
}
}
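/* Dispatch a call according to the method definition type (iseq, cfunc,
 * attrset/ivar accessors, method_missing, bmethod, alias, optimized, zsuper,
 * refined).  Whenever possible the type-specific handler is also cached as
 * the call-cache fastpath, so subsequent calls through the same call site
 * can skip this switch. */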
static VALUE
vm_call_method_each_type(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
const struct rb_callinfo *ci = calling->cd->ci;
const struct rb_callcache *cc = calling->cc;
const rb_callable_method_entry_t *cme = vm_cc_cme(cc);
VALUE v;
VM_ASSERT(! METHOD_ENTRY_INVALIDATED(cme));
switch (cme->def->type) {
case VM_METHOD_TYPE_ISEQ:
if (ISEQ_BODY(def_iseq_ptr(cme->def))->param.flags.forwardable) {
CC_SET_FASTPATH(cc, vm_call_iseq_fwd_setup, TRUE);
return vm_call_iseq_fwd_setup(ec, cfp, calling);
}
else {
CC_SET_FASTPATH(cc, vm_call_iseq_setup, TRUE);
return vm_call_iseq_setup(ec, cfp, calling);
}
case VM_METHOD_TYPE_NOTIMPLEMENTED:
case VM_METHOD_TYPE_CFUNC:
CC_SET_FASTPATH(cc, vm_call_cfunc, TRUE);
return vm_call_cfunc(ec, cfp, calling);
case VM_METHOD_TYPE_ATTRSET:
CALLER_SETUP_ARG(cfp, calling, ci, 1);
rb_check_arity(calling->argc, 1, 1);
const unsigned int aset_mask = (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT | VM_CALL_KWARG | VM_CALL_FORWARDING);
if (vm_cc_markable(cc)) {
vm_cc_attr_index_initialize(cc, INVALID_SHAPE_ID);
VM_CALL_METHOD_ATTR(v,
vm_call_attrset_direct(ec, cfp, cc, calling->recv),
CC_SET_FASTPATH(cc, vm_call_attrset, !(vm_ci_flag(ci) & aset_mask)));
}
else {
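/* This cc is not markable (such as the shared empty call cache), so it cannot
 * record the attr index/shape.  Build a temporary unmarkable call cache on
 * the C stack and dispatch through that instead. */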
cc = &((struct rb_callcache) {
.flags = T_IMEMO |
(imemo_callcache << FL_USHIFT) |
VM_CALLCACHE_UNMARKABLE |
VM_CALLCACHE_ON_STACK,
.klass = cc->klass,
.cme_ = cc->cme_,
.call_ = cc->call_,
.aux_ = {
.attr = {
.value = INVALID_SHAPE_ID << SHAPE_FLAG_SHIFT,
}
},
});
VM_CALL_METHOD_ATTR(v,
vm_call_attrset_direct(ec, cfp, cc, calling->recv),
CC_SET_FASTPATH(cc, vm_call_attrset, !(vm_ci_flag(ci) & aset_mask)));
}
return v;
case VM_METHOD_TYPE_IVAR:
CALLER_SETUP_ARG(cfp, calling, ci, 0);
rb_check_arity(calling->argc, 0, 0);
vm_cc_attr_index_initialize(cc, INVALID_SHAPE_ID);
const unsigned int ivar_mask = (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT | VM_CALL_FORWARDING);
VM_CALL_METHOD_ATTR(v,
vm_call_ivar(ec, cfp, calling),
CC_SET_FASTPATH(cc, vm_call_ivar, !(vm_ci_flag(ci) & ivar_mask)));
return v;
case VM_METHOD_TYPE_MISSING:
vm_cc_method_missing_reason_set(cc, 0);
CC_SET_FASTPATH(cc, vm_call_method_missing, TRUE);
return vm_call_method_missing(ec, cfp, calling);
case VM_METHOD_TYPE_BMETHOD:
CC_SET_FASTPATH(cc, vm_call_bmethod, TRUE);
return vm_call_bmethod(ec, cfp, calling);
case VM_METHOD_TYPE_ALIAS:
CC_SET_FASTPATH(cc, vm_call_alias, TRUE);
return vm_call_alias(ec, cfp, calling);
case VM_METHOD_TYPE_OPTIMIZED:
return vm_call_optimized(ec, cfp, calling, ci, cc);
case VM_METHOD_TYPE_UNDEF:
break;
case VM_METHOD_TYPE_ZSUPER:
return vm_call_zsuper(ec, cfp, calling, RCLASS_ORIGIN(vm_cc_cme(cc)->defined_class));
case VM_METHOD_TYPE_REFINED:
// CC_SET_FASTPATH(cc, vm_call_refined, TRUE);
// should not set FASTPATH since vm_call_refined assumes cc->call is vm_call_super_method on invokesuper.
return vm_call_refined(ec, cfp, calling);
}
rb_bug("vm_call_method: unsupported method type (%d)", vm_cc_cme(cc)->def->type);
}
NORETURN(static void vm_raise_method_missing(rb_execution_context_t *ec, int argc, const VALUE *argv, VALUE obj, int call_status));
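/* "nome" = no method entry: called when method lookup found nothing for this
 * call site.  Normally dispatches to method_missing; if the missing method is
 * method_missing itself, raise the NoMethodError/NameError directly. */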
static VALUE
vm_call_method_nome(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
/* method missing */
const struct rb_callinfo *ci = calling->cd->ci;
const int stat = ci_missing_reason(ci);
if (vm_ci_mid(ci) == idMethodMissing) {
Generalize cfunc large array splat fix to fix many additional cases raising SystemStackError Originally, when 2e7bceb34ea858649e1f975a934ce1894d1f06a6 fixed cfuncs to no longer use the VM stack for large array splats, it was thought to have fully fixed Bug #4040, since the issue was fixed for methods defined in Ruby (iseqs) back in Ruby 2.2. After additional research, I determined that same issue affects almost all types of method calls, not just iseq and cfunc calls. There were two main types of remaining issues, important cases (where large array splat should work) and pedantic cases (where large array splat raised SystemStackError instead of ArgumentError). Important cases: ```ruby define_method(:a){|*a|} a(*1380888.times) def b(*a); end send(:b, *1380888.times) :b.to_proc.call(self, *1380888.times) def d; yield(*1380888.times) end d(&method(:b)) def self.method_missing(*a); end not_a_method(*1380888.times) ``` Pedantic cases: ```ruby def a; end a(*1380888.times) def b(_); end b(*1380888.times) def c(_=nil); end c(*1380888.times) c = Class.new do attr_accessor :a alias b a= end.new c.a(*1380888.times) c.b(*1380888.times) c = Struct.new(:a) do alias b a= end.new c.a(*1380888.times) c.b(*1380888.times) ``` This patch fixes all usage of CALLER_SETUP_ARG with splatting a large number of arguments, and required similar fixes to use a temporary hidden array in three other cases where the VM would use the VM stack for handling a large number of arguments. However, it is possible there may be additional cases where splatting a large number of arguments still causes a SystemStackError. This has a measurable performance impact, as it requires additional checks for a large number of arguments in many additional cases. This change is fairly invasive, as there were many different VM functions that needed to be modified to support this. To avoid too much API change, I modified struct rb_calling_info to add a heap_argv member for storing the array, so I would not have to thread it through many functions. This struct is always stack allocated, which helps ensure sure GC doesn't collect it early. Because of how invasive the changes are, and how rarely large arrays are actually splatted in Ruby code, the existing test/spec suites are not great at testing for correct behavior. To try to find and fix all issues, I tested this in CI with VM_ARGC_STACK_MAX to -1, ensuring that a temporary array is used for all array splat method calls. This was very helpful in finding breaking cases, especially ones involving flagged keyword hashes. Fixes [Bug #4040] Co-authored-by: Jimmy Miller <jimmy.miller@shopify.com>
2023-03-07 02:58:58 +03:00
if (UNLIKELY(calling->heap_argv)) {
vm_raise_method_missing(ec, RARRAY_LENINT(calling->heap_argv), RARRAY_CONST_PTR(calling->heap_argv), calling->recv, stat);
}
else {
rb_control_frame_t *reg_cfp = cfp;
VALUE *argv = STACK_ADDR_FROM_TOP(calling->argc);
vm_raise_method_missing(ec, calling->argc, argv, calling->recv, stat);
}
}
else {
return vm_call_method_missing_body(ec, cfp, calling, ci, stat);
}
}
/* Protected method calls and super invocations need to check that the receiver
* (self for super) inherits the module on which the method is defined.
 * In the case of refinements, it should consider the original class, not the
 * refinement.
*/
static VALUE
vm_defined_class_for_protected_call(const rb_callable_method_entry_t *me)
{
VALUE defined_class = me->defined_class;
VALUE refined_class = RCLASS_REFINED_CLASS(defined_class);
return NIL_P(refined_class) ? defined_class : refined_class;
}
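/* Main entry for method dispatch once a call cache is resolved: enforce the
 * visibility recorded in the callable method entry (public/private/protected)
 * and continue to vm_call_method_each_type(), falling back to method_missing
 * on visibility violations.  A NULL method entry goes through
 * vm_call_method_nome(). */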
static inline VALUE
vm_call_method(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct rb_calling_info *calling)
{
const struct rb_callinfo *ci = calling->cd->ci;
const struct rb_callcache *cc = calling->cc;
VM_ASSERT(callable_method_entry_p(vm_cc_cme(cc)));
if (vm_cc_cme(cc) != NULL) {
switch (METHOD_ENTRY_VISI(vm_cc_cme(cc))) {
case METHOD_VISI_PUBLIC: /* likely */
return vm_call_method_each_type(ec, cfp, calling);
case METHOD_VISI_PRIVATE:
if (!(vm_ci_flag(ci) & VM_CALL_FCALL)) {
enum method_missing_reason stat = MISSING_PRIVATE;
if (vm_ci_flag(ci) & VM_CALL_VCALL) stat |= MISSING_VCALL;
vm_cc_method_missing_reason_set(cc, stat);
CC_SET_FASTPATH(cc, vm_call_method_missing, TRUE);
return vm_call_method_missing(ec, cfp, calling);
}
return vm_call_method_each_type(ec, cfp, calling);
case METHOD_VISI_PROTECTED:
if (!(vm_ci_flag(ci) & (VM_CALL_OPT_SEND | VM_CALL_FCALL))) {
VALUE defined_class = vm_defined_class_for_protected_call(vm_cc_cme(cc));
if (!rb_obj_is_kind_of(cfp->self, defined_class)) {
vm_cc_method_missing_reason_set(cc, MISSING_PROTECTED);
return vm_call_method_missing(ec, cfp, calling);
}
else {
/* caching method info to dummy cc */
VM_ASSERT(vm_cc_cme(cc) != NULL);
struct rb_callcache cc_on_stack = *cc;
FL_SET_RAW((VALUE)&cc_on_stack, VM_CALLCACHE_UNMARKABLE);
calling->cc = &cc_on_stack;
return vm_call_method_each_type(ec, cfp, calling);
}
}
return vm_call_method_each_type(ec, cfp, calling);
default:
rb_bug("unreachable");
}
}
else {
return vm_call_method_nome(ec, cfp, calling);
}
}
static VALUE
vm_call_general(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_general);
return vm_call_method(ec, reg_cfp, calling);
}
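/* Reset the handler of a call cache to the generic vm_call_general dispatch,
 * discarding any specialized fastpath that may have been cached. */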
void
rb_vm_cc_general(const struct rb_callcache *cc)
{
VM_ASSERT(IMEMO_TYPE_P(cc, imemo_callcache));
VM_ASSERT(cc != vm_cc_empty());
*(vm_call_handler *)&cc->call_ = vm_call_general;
}
static VALUE
vm_call_super_method(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, struct rb_calling_info *calling)
{
RB_DEBUG_COUNTER_INC(ccf_super_method);
// This check is introduced to keep this function distinct from `vm_call_general`, because some
// compilers (VC, we found) can merge identical functions so that their addresses become the same.
// The address of `vm_call_super_method` is used in `search_refined_method`, so it must stay different.
if (ec == NULL) rb_bug("unreachable");
/* this check is required to distinguish this function from other functions. */
VM_ASSERT(vm_cc_call(calling->cc) == vm_call_super_method);
return vm_call_method(ec, reg_cfp, calling);
}
/* super */
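/* Find the class from which method lookup for `super` should continue.  If
 * klass is the iclass of a module included into a refinement, use the
 * refinement module instead; then take the origin (to account for prepended
 * modules) and return its superclass. */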
static inline VALUE
vm_search_normal_superclass(VALUE klass)
{
if (BUILTIN_TYPE(klass) == T_ICLASS &&
RB_TYPE_P(RBASIC(klass)->klass, T_MODULE) &&
FL_TEST_RAW(RBASIC(klass)->klass, RMODULE_IS_REFINEMENT)) {
klass = RBASIC(klass)->klass;
}
klass = RCLASS_ORIGIN(klass);
return RCLASS_SUPER(klass);
}
NORETURN(static void vm_super_outside(void));
static void
vm_super_outside(void)
{
rb_raise(rb_eNoMethodError, "super called outside of method");
}
static const struct rb_callcache *
empty_cc_for_super(void)
{
return &vm_empty_cc_for_super;
}
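/* Resolve the callable method entry that `super` should invoke: verify we are
 * inside a method, that self is still compatible with the class the current
 * method is defined on, and that an implicit (zsuper) argument pass is not
 * attempted from a method defined by define_method(). */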
static const struct rb_callcache *
vm_search_super_method(const rb_control_frame_t *reg_cfp, struct rb_call_data *cd, VALUE recv)
{
VALUE current_defined_class;
const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(reg_cfp);
if (!me) {
vm_super_outside();
}
current_defined_class = vm_defined_class_for_protected_call(me);
if (BUILTIN_TYPE(current_defined_class) != T_MODULE &&
reg_cfp->iseq != method_entry_iseqptr(me) &&
!rb_obj_is_kind_of(recv, current_defined_class)) {
VALUE m = RB_TYPE_P(current_defined_class, T_ICLASS) ?
RCLASS_INCLUDER(current_defined_class) : current_defined_class;
if (m) { /* not bound UnboundMethod */
rb_raise(rb_eTypeError,
"self has wrong type to call super in this context: "
"%"PRIsVALUE" (expected %"PRIsVALUE")",
rb_obj_class(recv), m);
}
}
if (me->def->type == VM_METHOD_TYPE_BMETHOD && (vm_ci_flag(cd->ci) & VM_CALL_ZSUPER)) {
rb_raise(rb_eRuntimeError,
"implicit argument passing of super from method defined"
" by define_method() is not supported."
" Specify all arguments explicitly.");
}
ID mid = me->def->original_id;
if (!vm_ci_markable(cd->ci)) {
VM_FORCE_WRITE((const VALUE *)&cd->ci->mid, (VALUE)mid);
}
else {
// update iseq. really? (TODO)
cd->ci = vm_ci_new_runtime(mid,
vm_ci_flag(cd->ci),
vm_ci_argc(cd->ci),
vm_ci_kwarg(cd->ci));
RB_OBJ_WRITTEN(reg_cfp->iseq, Qundef, cd->ci);
}
const struct rb_callcache *cc;
VALUE klass = vm_search_normal_superclass(me->defined_class);
if (!klass) {
/* bound instance method of module */
cc = vm_cc_new(klass, NULL, vm_call_method_missing, cc_type_super);
RB_OBJ_WRITE(reg_cfp->iseq, &cd->cc, cc);
}
else {
cc = vm_search_method_fastpath((VALUE)reg_cfp->iseq, cd, klass);
const rb_callable_method_entry_t *cached_cme = vm_cc_cme(cc);
// define_method can cache for a different method id
if (cached_cme == NULL) {
// empty_cc_for_super is not a markable object
cd->cc = empty_cc_for_super();
}
else if (cached_cme->called_id != mid) {
const rb_callable_method_entry_t *cme = rb_callable_method_entry(klass, mid);
if (cme) {
cc = vm_cc_new(klass, cme, vm_call_super_method, cc_type_super);
RB_OBJ_WRITE(reg_cfp->iseq, &cd->cc, cc);
}
else {
cd->cc = cc = empty_cc_for_super();
}
}
else {
switch (cached_cme->def->type) {
// vm_call_refined (search_refined_method) assumes cc->call is vm_call_super_method on invokesuper
case VM_METHOD_TYPE_REFINED:
// cc->klass is the superclass of the receiver's class, so checking cc->klass alone is not enough to invalidate the IVC for the receiver's class.
case VM_METHOD_TYPE_ATTRSET:
case VM_METHOD_TYPE_IVAR:
vm_cc_call_set(cc, vm_call_super_method); // invalidate fastpath
break;
default:
break; // use fastpath
}
}
}
VM_ASSERT((vm_cc_cme(cc), true));
return cc;
}
/* yield */
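/* Returns nonzero when the given proc object is a lambda (proc->is_lambda),
 * and 0 for plain blocks or when no proc is given. */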
static inline int
block_proc_is_lambda(const VALUE procval)
{
rb_proc_t *proc;
if (procval) {
GetProcPtr(procval, proc);
return proc->is_lambda;
}
else {
return 0;
}
}
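/* Yield to a C-defined (ifunc) block. For plain blocks the first positional
 * argument is passed as `arg` (Qnil when argc == 0); an IFUNC frame is pushed
 * (marked as a bmethod frame when `me` is given), the C function is invoked
 * with the full argv, and the frame is popped again. */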
static VALUE
vm_yield_with_cfunc(rb_execution_context_t *ec,
const struct rb_captured_block *captured,
VALUE self, int argc, const VALUE *argv, int kw_splat, VALUE block_handler,
const rb_callable_method_entry_t *me)
{
int is_lambda = FALSE; /* TODO */
VALUE val, arg, blockarg;
int frame_flag;
const struct vm_ifunc *ifunc = captured->code.ifunc;
if (is_lambda) {
arg = rb_ary_new4(argc, argv);
}
else if (argc == 0) {
arg = Qnil;
}
else {
arg = argv[0];
}
blockarg = rb_vm_bh_to_procval(ec, block_handler);
frame_flag = VM_FRAME_MAGIC_IFUNC | VM_FRAME_FLAG_CFRAME | (me ? VM_FRAME_FLAG_BMETHOD : 0);
if (kw_splat) {
frame_flag |= VM_FRAME_FLAG_CFRAME_KW;
}
vm_push_frame(ec, (const rb_iseq_t *)captured->code.ifunc,
frame_flag,
self,
VM_GUARDED_PREV_EP(captured->ep),
(VALUE)me,
0, ec->cfp->sp, 0, 0);
val = (*ifunc->func)(arg, (VALUE)ifunc->data, argc, argv, blockarg);
rb_vm_pop_frame(ec);
return val;
}
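/* Exported wrapper: yields to a C-defined block with no keyword splat, no
 * block argument, and no method entry. */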
VALUE
rb_vm_yield_with_cfunc(rb_execution_context_t *ec, const struct rb_captured_block *captured, int argc, const VALUE *argv)
{
return vm_yield_with_cfunc(ec, captured, captured->self, argc, argv, 0, VM_BLOCK_HANDLER_NONE, NULL);
}
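/* Yield to a Symbol block handler; behaves like calling sym.to_proc with the
 * given arguments, implemented via rb_sym_proc_call(). */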
static VALUE
vm_yield_with_symbol(rb_execution_context_t *ec, VALUE symbol, int argc, const VALUE *argv, int kw_splat, VALUE block_handler)
{
return rb_sym_proc_call(SYM2ID(symbol), argc, argv, kw_splat, rb_vm_bh_to_procval(ec, block_handler));
}
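/* When a block that takes multiple required parameters is yielded a single
 * Array, the array is splatted across the leading parameters, e.g.
 *
 *   [[1, 2]].each { |a, b| p [a, b] }  #=> [1, 2]
 *
 * The two helpers below perform the Array check and the copy into argv. */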
static inline int
vm_callee_setup_block_arg_arg0_splat(rb_control_frame_t *cfp, const rb_iseq_t *iseq, VALUE *argv, VALUE ary)
{
int i;
long len = RARRAY_LEN(ary);
CHECK_VM_STACK_OVERFLOW(cfp, ISEQ_BODY(iseq)->param.lead_num);
for (i=0; i<len && i<ISEQ_BODY(iseq)->param.lead_num; i++) {
argv[i] = RARRAY_AREF(ary, i);
}
return i;
}
static inline VALUE
vm_callee_setup_block_arg_arg0_check(VALUE *argv)
{
VALUE ary, arg0 = argv[0];
ary = rb_check_array_type(arg0);
#if 0
argv[0] = arg0;
#else
VM_ASSERT(argv[0] == arg0);
#endif
return ary;
}
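/* Fit arguments for a block or lambda invocation. For "simple" iseqs (only
 * required positional parameters) the arguments are adjusted in place: a lone
 * Array argument may be splatted as above, and for plain blocks missing
 * parameters are padded with nil while extra ones are truncated; lambdas and
 * methods raise an ArgumentError on arity mismatch instead. Everything else is
 * delegated to setup_parameters_complex(). */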
static int
vm_callee_setup_block_arg(rb_execution_context_t *ec, struct rb_calling_info *calling, const struct rb_callinfo *ci, const rb_iseq_t *iseq, VALUE *argv, const enum arg_setup_type arg_setup_type)
{
if (rb_simple_iseq_p(iseq)) {
rb_control_frame_t *cfp = ec->cfp;
VALUE arg0;
CALLER_SETUP_ARG(cfp, calling, ci, ISEQ_BODY(iseq)->param.lead_num);
if (arg_setup_type == arg_setup_block &&
calling->argc == 1 &&
ISEQ_BODY(iseq)->param.flags.has_lead &&
!ISEQ_BODY(iseq)->param.flags.ambiguous_param0 &&
!NIL_P(arg0 = vm_callee_setup_block_arg_arg0_check(argv))) {
calling->argc = vm_callee_setup_block_arg_arg0_splat(cfp, iseq, argv, arg0);
}
if (calling->argc != ISEQ_BODY(iseq)->param.lead_num) {
if (arg_setup_type == arg_setup_block) {
if (calling->argc < ISEQ_BODY(iseq)->param.lead_num) {
int i;
CHECK_VM_STACK_OVERFLOW(cfp, ISEQ_BODY(iseq)->param.lead_num);
for (i=calling->argc; i<ISEQ_BODY(iseq)->param.lead_num; i++) argv[i] = Qnil;
calling->argc = ISEQ_BODY(iseq)->param.lead_num; /* missing parameters were padded with nil above */
}
else if (calling->argc > ISEQ_BODY(iseq)->param.lead_num) {
calling->argc = ISEQ_BODY(iseq)->param.lead_num; /* simply truncate arguments */
}
}
else {
argument_arity_error(ec, iseq, calling->argc, ISEQ_BODY(iseq)->param.lead_num, ISEQ_BODY(iseq)->param.lead_num);
}
}
return 0;
}
else {
return setup_parameters_complex(ec, iseq, calling, ci, argv, arg_setup_type);
}
}
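/* Used when a block is invoked from C (e.g. rb_yield): builds a
 * stack-allocated rb_calling_info and an on-stack callinfo carrying `flags`,
 * then reuses vm_callee_setup_block_arg() for the actual argument fitting. */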
static int
vm_yield_setup_args(rb_execution_context_t *ec, const rb_iseq_t *iseq, const int argc, VALUE *argv, int flags, VALUE block_handler, enum arg_setup_type arg_setup_type)
{
struct rb_calling_info calling_entry, *calling;
calling = &calling_entry;
calling->argc = argc;
calling->block_handler = block_handler;
calling->kw_splat = (flags & VM_CALL_KW_SPLAT) ? 1 : 0;
calling->recv = Qundef;
calling->heap_argv = 0;
struct rb_callinfo dummy_ci = VM_CI_ON_STACK(0, flags, 0, 0);
return vm_callee_setup_block_arg(ec, calling, &dummy_ci, iseq, argv, arg_setup_type);
}
/* ruby iseq -> ruby block */
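/* Fit the arguments already on the VM stack, then push a BLOCK frame (with
 * VM_FRAME_FLAG_LAMBDA for lambdas). Returning Qundef signals, by the VM's
 * usual convention, that a new frame was pushed and the block body will be
 * executed by the interpreter rather than by a C function. */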
static VALUE
vm_invoke_iseq_block(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci,
bool is_lambda, VALUE block_handler)
{
const struct rb_captured_block *captured = VM_BH_TO_ISEQ_BLOCK(block_handler);
const rb_iseq_t *iseq = rb_iseq_check(captured->code.iseq);
const int arg_size = ISEQ_BODY(iseq)->param.size;
VALUE * const rsp = GET_SP() - calling->argc;
VALUE * const argv = rsp;
int opt_pc = vm_callee_setup_block_arg(ec, calling, ci, iseq, argv, is_lambda ? arg_setup_method : arg_setup_block);
SET_SP(rsp);
vm_push_frame(ec, iseq,
VM_FRAME_MAGIC_BLOCK | (is_lambda ? VM_FRAME_FLAG_LAMBDA : 0),
captured->self,
VM_GUARDED_PREV_EP(captured->ep), 0,
ISEQ_BODY(iseq)->iseq_encoded + opt_pc,
rsp + arg_size,
ISEQ_BODY(iseq)->local_table_size - arg_size, ISEQ_BODY(iseq)->stack_max);
return Qundef;
}
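/* Invoke a Symbol block handler (&:sym): the first block argument becomes the
 * receiver and the named method is called on it with the remaining arguments,
 * e.g. %w[a b].map(&:upcase). In the common simple-argument cases the receiver
 * is popped directly off the VM stack; otherwise the arguments are normalized
 * first, possibly into a heap-allocated argv for very large splats. */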
static VALUE
vm_invoke_symbol_block(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci,
MAYBE_UNUSED(bool is_lambda), VALUE block_handler)
{
VALUE symbol = VM_BH_TO_SYMBOL(block_handler);
int flags = vm_ci_flag(ci);
if (UNLIKELY(!(flags & VM_CALL_ARGS_SIMPLE) &&
((calling->argc == 0) ||
(calling->argc == 1 && (flags & (VM_CALL_ARGS_SPLAT | VM_CALL_KW_SPLAT))) ||
(calling->argc == 2 && (flags & VM_CALL_ARGS_SPLAT) && (flags & VM_CALL_KW_SPLAT)) ||
((flags & VM_CALL_KWARG) && (vm_ci_kwarg(ci)->keyword_len == calling->argc))))) {
CALLER_SETUP_ARG(reg_cfp, calling, ci, ALLOW_HEAP_ARGV);
flags = 0;
if (UNLIKELY(calling->heap_argv)) {
#if VM_ARGC_STACK_MAX < 0
if (RARRAY_LEN(calling->heap_argv) < 1) {
rb_raise(rb_eArgError, "no receiver given");
}
#endif
calling->recv = rb_ary_shift(calling->heap_argv);
// Modify stack to avoid cfp consistency error
reg_cfp->sp++;
reg_cfp->sp[-1] = reg_cfp->sp[-2];
reg_cfp->sp[-2] = calling->recv;
flags |= VM_CALL_ARGS_SPLAT;
}
else {
if (calling->argc < 1) {
rb_raise(rb_eArgError, "no receiver given");
}
calling->recv = TOPN(--calling->argc);
}
if (calling->kw_splat) {
flags |= VM_CALL_KW_SPLAT;
}
}
else {
if (calling->argc < 1) {
rb_raise(rb_eArgError, "no receiver given");
}
calling->recv = TOPN(--calling->argc);
}
return vm_call_symbol(ec, reg_cfp, calling, ci, symbol, flags);
}
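/* Invoke an ifunc block handler: normalize the caller's arguments (a large
 * splat may be kept in a heap-allocated argv), call the C function through
 * vm_yield_with_cfunc(), then pop the consumed arguments. */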
static VALUE
vm_invoke_ifunc_block(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci,
MAYBE_UNUSED(bool is_lambda), VALUE block_handler)
{
VALUE val;
int argc;
const struct rb_captured_block *captured = VM_BH_TO_IFUNC_BLOCK(block_handler);
CALLER_SETUP_ARG(ec->cfp, calling, ci, ALLOW_HEAP_ARGV_KEEP_KWSPLAT);
argc = calling->argc;
val = vm_yield_with_cfunc(ec, captured, captured->self, CALLING_ARGC(calling), calling->heap_argv ? RARRAY_CONST_PTR(calling->heap_argv) : STACK_ADDR_FROM_TOP(argc), calling->kw_splat, calling->block_handler, NULL);
POPN(argc); /* TODO: should put before C/yield? */
return val;
}
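/* Unwrap a Proc into its underlying block handler so callers can dispatch
 * on the concrete block type (iseq, ifunc, symbol or nested proc). */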
static VALUE
vm_proc_to_block_handler(VALUE procval)
{
const struct rb_block *block = vm_proc_block(procval);
switch (vm_block_type(block)) {
case block_type_iseq:
return VM_BH_FROM_ISEQ_BLOCK(&block->as.captured);
case block_type_ifunc:
return VM_BH_FROM_IFUNC_BLOCK(&block->as.captured);
case block_type_symbol:
return VM_BH_FROM_SYMBOL(block->as.symbol);
case block_type_proc:
return VM_BH_FROM_PROC(block->as.proc);
}
VM_UNREACHABLE(vm_yield_with_proc);
return Qundef;
}
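/* A proc-type block handler may wrap yet another Proc; keep unwrapping,
 * refreshing is_lambda from each Proc along the way, until a non-proc
 * handler remains, then dispatch through vm_invoke_block. */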
static VALUE
vm_invoke_proc_block(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci,
bool is_lambda, VALUE block_handler)
{
while (vm_block_handler_type(block_handler) == block_handler_type_proc) {
VALUE proc = VM_BH_TO_PROC(block_handler);
is_lambda = block_proc_is_lambda(proc);
block_handler = vm_proc_to_block_handler(proc);
}
return vm_invoke_block(ec, reg_cfp, calling, ci, is_lambda, block_handler);
}
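/* Central dispatch for block invocation: pick the invoke function that
 * matches the block handler's type and call it. */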
static inline VALUE
vm_invoke_block(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci,
bool is_lambda, VALUE block_handler)
{
VALUE (*func)(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp,
struct rb_calling_info *calling, const struct rb_callinfo *ci,
bool is_lambda, VALUE block_handler);
switch (vm_block_handler_type(block_handler)) {
case block_handler_type_iseq: func = vm_invoke_iseq_block; break;
case block_handler_type_ifunc: func = vm_invoke_ifunc_block; break;
case block_handler_type_proc: func = vm_invoke_proc_block; break;
case block_handler_type_symbol: func = vm_invoke_symbol_block; break;
default: rb_bug("vm_invoke_block: unreachable");
}
return func(ec, reg_cfp, calling, ci, is_lambda, block_handler);
}
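/* Create a Proc whose code is blockiseq, captured against the closest
 * Ruby-level frame; it is a bug to reach this without such a frame. */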
static VALUE
vm_make_proc_with_iseq(const rb_iseq_t *blockiseq)
{
const rb_execution_context_t *ec = GET_EC();
const rb_control_frame_t *cfp = rb_vm_get_ruby_level_next_cfp(ec, ec->cfp);
struct rb_captured_block *captured;
if (cfp == 0) {
rb_bug("vm_make_proc_with_iseq: unreachable");
}
captured = VM_CFP_TO_CAPTURED_BLOCK(cfp);
captured->code.iseq = blockiseq;
return rb_vm_make_proc(ec, captured, rb_cProc);
}
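/* Helpers for the `once` instruction: vm_once_exec wraps the given iseq in
 * a Proc and calls it without arguments; vm_once_clear resets the inline
 * storage entry's running_thread marker. */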
static VALUE
vm_once_exec(VALUE iseq)
{
VALUE proc = vm_make_proc_with_iseq((rb_iseq_t *)iseq);
return rb_proc_call_with_block(proc, 0, 0, Qnil);
}
static VALUE
vm_once_clear(VALUE data)
{
union iseq_inline_storage_entry *is = (union iseq_inline_storage_entry *)data;
is->once.running_thread = NULL;
return Qnil;
}
/* defined insn */
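/* Ask `v` whether it claims to know the method named by `obj` via
 * respond_to_missing?(obj, false); an undefined or falsy reply counts as
 * "not defined". */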
static bool
check_respond_to_missing(VALUE obj, VALUE v)
{
VALUE args[2];
VALUE r;
args[0] = obj; args[1] = Qfalse;
r = rb_check_funcall(v, idRespond_to_missing, 2, args);
if (!UNDEF_P(r) && RTEST(r)) {
return true;
}
else {
return false;
}
}
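/* Core of `defined?`: depending on op_type, test an instance/global/class
 * variable, a constant, a method (honoring visibility from the caller's
 * self), yield-ability, a zsuper target or a regexp back-reference.
 * Returns a C boolean; the `defined` instruction maps a true result to its
 * message operand, the user-visible description string. */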
static bool
vm_defined(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, rb_num_t op_type, VALUE obj, VALUE v)
{
VALUE klass;
enum defined_type type = (enum defined_type)op_type;
switch (type) {
case DEFINED_IVAR:
return rb_ivar_defined(GET_SELF(), SYM2ID(obj));
break;
case DEFINED_GVAR:
return rb_gvar_defined(SYM2ID(obj));
break;
case DEFINED_CVAR: {
const rb_cref_t *cref = vm_get_cref(GET_EP());
klass = vm_get_cvar_base(cref, GET_CFP(), 0);
return rb_cvar_defined(klass, SYM2ID(obj));
break;
}
case DEFINED_CONST:
case DEFINED_CONST_FROM: {
bool allow_nil = type == DEFINED_CONST;
klass = v;
return vm_get_ev_const(ec, klass, SYM2ID(obj), allow_nil, true);
break;
}
case DEFINED_FUNC:
klass = CLASS_OF(v);
return rb_ec_obj_respond_to(ec, v, SYM2ID(obj), TRUE);
break;
case DEFINED_METHOD:{
VALUE klass = CLASS_OF(v);
const rb_method_entry_t *me = rb_method_entry_with_refinements(klass, SYM2ID(obj), NULL);
if (me) {
switch (METHOD_ENTRY_VISI(me)) {
case METHOD_VISI_PRIVATE:
break;
case METHOD_VISI_PROTECTED:
if (!rb_obj_is_kind_of(GET_SELF(), rb_class_real(me->defined_class))) {
break;
}
case METHOD_VISI_PUBLIC:
return true;
break;
default:
rb_bug("vm_defined: unreachable: %u", (unsigned int)METHOD_ENTRY_VISI(me));
}
}
else {
return check_respond_to_missing(obj, v);
}
break;
}
case DEFINED_YIELD:
if (GET_BLOCK_HANDLER() != VM_BLOCK_HANDLER_NONE) {
return true;
}
break;
case DEFINED_ZSUPER:
{
const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(GET_CFP());
if (me) {
VALUE klass = vm_search_normal_superclass(me->defined_class);
if (!klass) return false;
ID id = me->def->original_id;
return rb_method_boundp(klass, id, 0);
}
}
break;
case DEFINED_REF:
return RTEST(vm_backref_defined(ec, GET_LEP(), FIX2INT(obj)));
default:
rb_bug("unimplemented defined? type (VM)");
break;
}
return false;
}
bool
rb_vm_defined(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, rb_num_t op_type, VALUE obj, VALUE v)
{
return vm_defined(ec, reg_cfp, op_type, obj, v);
}
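/* Walk `lv` levels up the environment-pointer chain, e.g. to reach the EP
 * holding a local variable of an enclosing scope. */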
static const VALUE *
vm_get_ep(const VALUE *const reg_ep, rb_num_t lv)
{
rb_num_t i;
const VALUE *ep = reg_ep;
for (i = 0; i < lv; i++) {
ep = GET_PREV_EP(ep);
}
return ep;
}
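/* putspecialobject: return the frozen RubyVM core object, the cref class
 * base (cbase) or the constant-definition base, selected by the operand. */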
static VALUE
vm_get_special_object(const VALUE *const reg_ep,
enum vm_special_object_type type)
{
switch (type) {
case VM_SPECIAL_OBJECT_VMCORE:
return rb_mRubyVMFrozenCore;
case VM_SPECIAL_OBJECT_CBASE:
return vm_get_cbase(reg_ep);
case VM_SPECIAL_OBJECT_CONST_BASE:
return vm_get_const_base(reg_ep);
default:
rb_bug("putspecialobject insn: unknown value_type %d", type);
}
}
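/* concatarray: convert both operands with rb_check_to_array, wrap a
 * non-convertible LHS in a one-element array, dup the LHS when conversion
 * returned the receiver itself (so the result is always a fresh array),
 * then push a non-convertible RHS or concat the converted one. */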
static VALUE
vm_concat_array(VALUE ary1, VALUE ary2st)
{
const VALUE ary2 = ary2st;
VALUE tmp1 = rb_check_to_array(ary1);
VALUE tmp2 = rb_check_to_array(ary2);
if (NIL_P(tmp1)) {
tmp1 = rb_ary_new3(1, ary1);
}
if (tmp1 == ary1) {
tmp1 = rb_ary_dup(ary1);
}
if (NIL_P(tmp2)) {
return rb_ary_push(tmp1, ary2);
} else {
return rb_ary_concat(tmp1, tmp2);
}
}
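/* concattoarray: same idea as concatarray, but ary1 is already a freshly
 * created array, so the RHS is pushed or concatenated into it in place. */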
static VALUE
vm_concat_to_array(VALUE ary1, VALUE ary2st)
{
/* ary1 must be a newly created array */
const VALUE ary2 = ary2st;
VALUE tmp2 = rb_check_to_array(ary2);
if (NIL_P(tmp2)) {
return rb_ary_push(ary1, ary2);
} else {
return rb_ary_concat(ary1, tmp2);
}
}
// The YJIT implementation uses this C function and needs a
// non-static symbol to call, hence the exported wrapper below.
VALUE
rb_vm_concat_array(VALUE ary1, VALUE ary2st)
{
return vm_concat_array(ary1, ary2st);
}
VALUE
rb_vm_concat_to_array(VALUE ary1, VALUE ary2st)
{
return vm_concat_to_array(ary1, ary2st);
}
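/* splatarray: convert `ary` with rb_check_to_array, wrapping objects that
 * do not convert in a one-element array; a truthy `flag` means the caller
 * needs a mutable copy, so the converted array is dup'ed. */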
static VALUE
vm_splat_array(VALUE flag, VALUE ary)
{
VALUE tmp = rb_check_to_array(ary);
if (NIL_P(tmp)) {
return rb_ary_new3(1, ary);
}
else if (RTEST(flag)) {
return rb_ary_dup(tmp);
}
else {
return tmp;
}
}
// The YJIT implementation uses this C function and needs a
// non-static symbol to call, hence the exported wrapper below.
VALUE
rb_vm_splat_array(VALUE flag, VALUE ary)
{
return vm_splat_array(flag, ary);
}
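/* checkmatch: with VM_CHECKMATCH_ARRAY set, try every element of `pattern`
 * and return the first truthy match; otherwise perform one match of
 * `pattern` against `target` using the match type encoded in `flag`. */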
static VALUE
vm_check_match(rb_execution_context_t *ec, VALUE target, VALUE pattern, rb_num_t flag)
{
enum vm_check_match_type type = ((int)flag) & VM_CHECKMATCH_TYPE_MASK;
if (flag & VM_CHECKMATCH_ARRAY) {
long i;
const long n = RARRAY_LEN(pattern);
for (i = 0; i < n; i++) {
VALUE v = RARRAY_AREF(pattern, i);
VALUE c = check_match(ec, v, target, type);
if (RTEST(c)) {
return c;
}
}
return Qfalse;
}
else {
return check_match(ec, pattern, target, type);
}
}
VALUE
rb_vm_check_match(rb_execution_context_t *ec, VALUE target, VALUE pattern, rb_num_t flag)
{
return vm_check_match(ec, target, pattern, flag);
}
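/* checkkeyword: `*(ep - bits)` is either a fixnum bitmap or, past
 * KW_SPECIFIED_BITS_MAX keywords, a hash recording which keyword
 * parameters the caller left unspecified. Returns Qtrue when the keyword
 * at `idx` was supplied, letting the compiled default-value code be
 * skipped. */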
static VALUE
vm_check_keyword(lindex_t bits, lindex_t idx, const VALUE *ep)
{
const VALUE kw_bits = *(ep - bits);
if (FIXNUM_P(kw_bits)) {
unsigned int b = (unsigned int)FIX2ULONG(kw_bits);
if ((idx < KW_SPECIFIED_BITS_MAX) && (b & (0x01 << idx)))
return Qfalse;
}
else {
VM_ASSERT(RB_TYPE_P(kw_bits, T_HASH));
if (rb_hash_has_key(kw_bits, INT2FIX(idx))) return Qfalse;
}
return Qtrue;
}
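/* Fire the matching DTrace method/cmethod entry or return probe for a
 * call/return event, but only when at least one such probe is enabled. */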
static void
vm_dtrace(rb_event_flag_t flag, rb_execution_context_t *ec)
{
if (RUBY_DTRACE_METHOD_ENTRY_ENABLED() ||
RUBY_DTRACE_METHOD_RETURN_ENABLED() ||
RUBY_DTRACE_CMETHOD_ENTRY_ENABLED() ||
RUBY_DTRACE_CMETHOD_RETURN_ENABLED()) {
switch (flag) {
case RUBY_EVENT_CALL:
RUBY_DTRACE_METHOD_ENTRY_HOOK(ec, 0, 0);
return;
case RUBY_EVENT_C_CALL:
RUBY_DTRACE_CMETHOD_ENTRY_HOOK(ec, 0, 0);
return;
case RUBY_EVENT_RETURN:
RUBY_DTRACE_METHOD_RETURN_HOOK(ec, 0, 0);
return;
case RUBY_EVENT_C_RETURN:
RUBY_DTRACE_CMETHOD_RETURN_HOOK(ec, 0, 0);
return;
}
}
}
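/* Fetch the constant `id` directly under `cbase` for a class/module
 * definition.  Returns 0 when it is not defined there; a scoped
 * definition (VM_DEFINECLASS_SCOPED_P) uses the public-only lookup. */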
static VALUE
vm_const_get_under(ID id, rb_num_t flags, VALUE cbase)
{
if (!rb_const_defined_at(cbase, id)) {
return 0;
}
else if (VM_DEFINECLASS_SCOPED_P(flags)) {
return rb_public_const_get_at(cbase, id);
}
else {
return rb_const_get_at(cbase, id);
}
}
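/* Check that an existing constant reopened by a `class` definition really is
 * a class and, when a superclass was given, that it matches the existing one.
 * Returns the class, 0 if `klass` is not a T_CLASS, or raises TypeError on a
 * superclass mismatch. */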
static VALUE
vm_check_if_class(ID id, rb_num_t flags, VALUE super, VALUE klass)
{
if (!RB_TYPE_P(klass, T_CLASS)) {
return 0;
}
else if (VM_DEFINECLASS_HAS_SUPERCLASS_P(flags)) {
VALUE tmp = rb_class_real(RCLASS_SUPER(klass));
if (tmp != super) {
rb_raise(rb_eTypeError,
"superclass mismatch for class %"PRIsVALUE"",
rb_id2str(id));
}
else {
return klass;
}
}
else {
return klass;
}
}
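/* Return `mod` if it is a module, 0 otherwise. */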
static VALUE
vm_check_if_module(ID id, VALUE mod)
{
if (!RB_TYPE_P(mod, T_MODULE)) {
return 0;
}
else {
return mod;
}
}
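/* Register the newly created class/module `c` as the constant `id` under
 * `cbase` and set its class path accordingly. */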
static VALUE
declare_under(ID id, VALUE cbase, VALUE c)
{
rb_set_class_path_string(c, cbase, rb_id2str(id));
rb_const_set(cbase, id, c);
return c;
}
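/* Create a new class named `id` under `cbase`.  The superclass defaults to
 * Object when none was given; the allocator resolved from the ancestry is
 * pinned on the new class and the `inherited` hook is invoked. */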
static VALUE
vm_declare_class(ID id, rb_num_t flags, VALUE cbase, VALUE super)
{
/* new class declaration */
VALUE s = VM_DEFINECLASS_HAS_SUPERCLASS_P(flags) ? super : rb_cObject;
VALUE c = declare_under(id, cbase, rb_define_class_id(id, s));
rb_define_alloc_func(c, rb_get_alloc_func(c));
rb_class_inherited(s, c);
return c;
}
static VALUE
vm_declare_module(ID id, VALUE cbase)
{
/* new module declaration */
return declare_under(id, cbase, rb_module_new());
}
NORETURN(static void unmatched_redefinition(const char *type, VALUE cbase, ID id, VALUE old));
static void
unmatched_redefinition(const char *type, VALUE cbase, ID id, VALUE old)
{
VALUE name = rb_id2str(id);
VALUE message = rb_sprintf("%"PRIsVALUE" is not a %s",
name, type);
VALUE location = rb_const_source_location_at(cbase, id);
if (!NIL_P(location)) {
rb_str_catf(message, "\n%"PRIsVALUE":%"PRIsVALUE":"
" previous definition of %"PRIsVALUE" was here",
rb_ary_entry(location, 0), rb_ary_entry(location, 1), name);
}
rb_exc_raise(rb_exc_new_str(rb_eTypeError, message));
}
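/* Implement `class Foo < Super ... end`: reuse an existing class after
 * validating it, or declare a new one.  Raises TypeError when the given
 * superclass is not a Class or when the constant is already bound to an
 * incompatible definition. */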
static VALUE
vm_define_class(ID id, rb_num_t flags, VALUE cbase, VALUE super)
{
VALUE klass;
if (VM_DEFINECLASS_HAS_SUPERCLASS_P(flags) && !RB_TYPE_P(super, T_CLASS)) {
rb_raise(rb_eTypeError,
"superclass must be an instance of Class (given an instance of %"PRIsVALUE")",
rb_obj_class(super));
}
vm_check_if_namespace(cbase);
/* find klass */
rb_autoload_load(cbase, id);
if ((klass = vm_const_get_under(id, flags, cbase)) != 0) {
if (!vm_check_if_class(id, flags, super, klass))
unmatched_redefinition("class", cbase, id, klass);
return klass;
}
else {
return vm_declare_class(id, flags, cbase, super);
}
}
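/* Implement `module Foo ... end`: reuse an existing module after validating
 * it, or declare a new one under `cbase`. */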
static VALUE
vm_define_module(ID id, rb_num_t flags, VALUE cbase)
{
VALUE mod;
vm_check_if_namespace(cbase);
if ((mod = vm_const_get_under(id, flags, cbase)) != 0) {
if (!vm_check_if_module(id, mod))
unmatched_redefinition("module", cbase, id, mod);
return mod;
}
else {
return vm_declare_module(id, cbase);
}
}
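/* Dispatch a class/module definition on the VM_DEFINECLASS_TYPE encoded in
 * `flags`: open a class, a singleton class, or a module. */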
static VALUE
vm_find_or_create_class_by_id(ID id,
rb_num_t flags,
VALUE cbase,
VALUE super)
{
rb_vm_defineclass_type_t type = VM_DEFINECLASS_TYPE(flags);
switch (type) {
case VM_DEFINECLASS_TYPE_CLASS:
/* classdef returns class scope value */
return vm_define_class(id, flags, cbase, super);
case VM_DEFINECLASS_TYPE_SINGLETON_CLASS:
/* classdef returns class scope value */
return rb_singleton_class(cbase);
case VM_DEFINECLASS_TYPE_MODULE:
/* classdef returns class scope value */
return vm_define_module(id, flags, cbase);
default:
rb_bug("unknown defineclass type: %d", (int)type);
}
}
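/* Default method visibility of the innermost Ruby-level scope; public when
 * the frame has no cref of its own. */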
static rb_method_visibility_t
vm_scope_visibility_get(const rb_execution_context_t *ec)
{
const rb_control_frame_t *cfp = rb_vm_get_ruby_level_next_cfp(ec, ec->cfp);
if (!vm_env_cref_by_cref(cfp->ep)) {
return METHOD_VISI_PUBLIC;
}
else {
return CREF_SCOPE_VISI(vm_ec_cref(ec))->method_visi;
}
}
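/* Whether the current scope is under `module_function`. */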
static int
vm_scope_module_func_check(const rb_execution_context_t *ec)
{
const rb_control_frame_t *cfp = rb_vm_get_ruby_level_next_cfp(ec, ec->cfp);
if (!vm_env_cref_by_cref(cfp->ep)) {
return FALSE;
}
else {
return CREF_SCOPE_VISI(vm_ec_cref(ec))->module_func;
}
}
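/* Define the method `id` with body `iseqval`: on the singleton class of
 * `obj` when `is_singleton`, otherwise on the class of the current cref with
 * the scope's default visibility.  Under `module_function` the definition is
 * also copied onto the singleton class as a public method. */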
static void
vm_define_method(const rb_execution_context_t *ec, VALUE obj, ID id, VALUE iseqval, int is_singleton)
{
VALUE klass;
rb_method_visibility_t visi;
rb_cref_t *cref = vm_ec_cref(ec);
if (is_singleton) {
klass = rb_singleton_class(obj); /* class and frozen checked in this API */
visi = METHOD_VISI_PUBLIC;
}
else {
klass = CREF_CLASS_FOR_DEFINITION(cref);
visi = vm_scope_visibility_get(ec);
}
if (NIL_P(klass)) {
rb_raise(rb_eTypeError, "no class/module to add method");
}
rb_add_method_iseq(klass, id, (const rb_iseq_t *)iseqval, cref, visi);
// Set max_iv_count on klasses based on number of ivar sets that are in the initialize method
if (id == idInitialize && klass != rb_cObject && RB_TYPE_P(klass, T_CLASS) && (rb_get_alloc_func(klass) == rb_class_allocate_instance)) {
RCLASS_EXT(klass)->max_iv_count = rb_estimate_iv_count(klass, (const rb_iseq_t *)iseqval);
}
if (!is_singleton && vm_scope_module_func_check(ec)) {
klass = rb_singleton_class(klass);
rb_add_method_iseq(klass, id, (const rb_iseq_t *)iseqval, cref, METHOD_VISI_PUBLIC);
}
}
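/* Yield to the block handler of the current frame; raises the
 * "no block given (yield)" LocalJumpError when the frame has no block. */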
static VALUE
vm_invokeblock_i(struct rb_execution_context_struct *ec,
struct rb_control_frame_struct *reg_cfp,
struct rb_calling_info *calling)
{
const struct rb_callinfo *ci = calling->cd->ci;
VALUE block_handler = VM_CF_BLOCK_HANDLER(GET_CFP());
if (block_handler == VM_BLOCK_HANDLER_NONE) {
rb_vm_localjump_error("no block given (yield)", Qnil, 0);
}
else {
return vm_invoke_block(ec, GET_CFP(), calling, ci, false, block_handler);
}
}
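/* Strategy vm_sendish() uses to resolve its callee. */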
enum method_explorer_type {
mexp_search_method,
mexp_search_invokeblock,
mexp_search_super,
};
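/* Shared body of the send-family instructions: build the calling info from
 * the operands on the stack, resolve the callee according to
 * `method_explorer` (regular dispatch, super dispatch, or block yield), and
 * invoke it. */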
static inline VALUE
vm_sendish(
struct rb_execution_context_struct *ec,
struct rb_control_frame_struct *reg_cfp,
struct rb_call_data *cd,
VALUE block_handler,
enum method_explorer_type method_explorer
) {
VALUE val = Qundef;
const struct rb_callinfo *ci = cd->ci;
const struct rb_callcache *cc;
int argc = vm_ci_argc(ci);
VALUE recv = TOPN(argc);
struct rb_calling_info calling = {
.block_handler = block_handler,
.kw_splat = IS_ARGS_KW_SPLAT(ci) > 0,
.recv = recv,
.argc = argc,
.cd = cd,
};
switch (method_explorer) {
case mexp_search_method:
calling.cc = cc = vm_search_method_fastpath((VALUE)reg_cfp->iseq, cd, CLASS_OF(recv));
val = vm_cc_call(cc)(ec, GET_CFP(), &calling);
break;
case mexp_search_super:
calling.cc = cc = vm_search_super_method(reg_cfp, cd, recv);
val = vm_cc_call(cc)(ec, GET_CFP(), &calling);
break;
case mexp_search_invokeblock:
val = vm_invokeblock_i(ec, GET_CFP(), &calling);
break;
}
return val;
}
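/* Non-static wrappers around vm_sendish(), kept out-of-line so callers
 * outside the interpreter loop (the JITs, for example) can reuse them:
 * each checks for stack overflow, dispatches the call, and runs any frame
 * the callee merely pushed (val == Qundef) through VM_EXEC() before
 * returning the result. */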
VALUE
rb_vm_send(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, CALL_DATA cd, ISEQ blockiseq)
{
stack_check(ec);
struct rb_forwarding_call_data adjusted_cd;
struct rb_callinfo adjusted_ci;
VALUE bh;
VALUE val;
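/* A callsite flagged VM_CALL_FORWARDING forwards the caller's `...`
 * arguments, so the call data is rebuilt on the C stack
 * (adjusted_cd/adjusted_ci) before dispatch; any newly created markable
 * callcache is written back into cd with a write barrier against the
 * iseq. */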
if (vm_ci_flag(cd->ci) & VM_CALL_FORWARDING) {
bh = vm_caller_setup_fwd_args(GET_EC(), GET_CFP(), cd, blockiseq, false, &adjusted_cd, &adjusted_ci);
val = vm_sendish(ec, GET_CFP(), &adjusted_cd.cd, bh, mexp_search_method);
if (cd->cc != adjusted_cd.cd.cc && vm_cc_markable(adjusted_cd.cd.cc)) {
RB_OBJ_WRITE(GET_ISEQ(), &cd->cc, adjusted_cd.cd.cc);
}
}
else {
bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, false);
val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_method);
}
VM_EXEC(ec, val);
return val;
}
VALUE
rb_vm_opt_send_without_block(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, CALL_DATA cd)
{
stack_check(ec);
VALUE bh = VM_BLOCK_HANDLER_NONE;
VALUE val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_method);
VM_EXEC(ec, val);
return val;
}
VALUE
rb_vm_invokesuper(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, CALL_DATA cd, ISEQ blockiseq)
{
stack_check(ec);
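/* Same `...` forwarding treatment as rb_vm_send(), but the target is
 * resolved along the superclass chain via vm_search_super_method()
 * (mexp_search_super). */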
struct rb_forwarding_call_data adjusted_cd;
struct rb_callinfo adjusted_ci;
VALUE bh;
VALUE val;
if (vm_ci_flag(cd->ci) & VM_CALL_FORWARDING) {
bh = vm_caller_setup_fwd_args(GET_EC(), GET_CFP(), cd, blockiseq, true, &adjusted_cd, &adjusted_ci);
val = vm_sendish(ec, GET_CFP(), &adjusted_cd.cd, bh, mexp_search_super);
if (cd->cc != adjusted_cd.cd.cc && vm_cc_markable(adjusted_cd.cd.cc)) {
RB_OBJ_WRITE(GET_ISEQ(), &cd->cc, adjusted_cd.cd.cc);
}
}
else {
bh = vm_caller_setup_arg_block(ec, GET_CFP(), cd->ci, blockiseq, true);
val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_super);
}
VM_EXEC(ec, val);
return val;
}
VALUE
rb_vm_invokeblock(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, CALL_DATA cd)
{
stack_check(ec);
VALUE bh = VM_BLOCK_HANDLER_NONE;
VALUE val = vm_sendish(ec, GET_CFP(), cd, bh, mexp_search_invokeblock);
VM_EXEC(ec, val);
return val;
}
/* object.c */
VALUE rb_nil_to_s(VALUE);
VALUE rb_true_to_s(VALUE);
VALUE rb_false_to_s(VALUE);
/* numeric.c */
VALUE rb_int_to_s(int argc, VALUE *argv, VALUE x);
VALUE rb_fix_to_s(VALUE);
/* variable.c */
VALUE rb_mod_to_s(VALUE);
VALUE rb_mod_name(VALUE);
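/* Fast path behind the objtostring instruction used by string
 * interpolation: return a string for the receiver directly when the
 * relevant #to_s is known to be unredefined, or Qundef to fall back to a
 * normal method call. */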
static VALUE
vm_objtostring(const rb_iseq_t *iseq, VALUE recv, CALL_DATA cd)
{
int type = TYPE(recv);
if (type == T_STRING) {
return recv;
}
const struct rb_callcache *cc = vm_search_method((VALUE)iseq, cd, recv);
switch (type) {
case T_SYMBOL:
if (check_cfunc(vm_cc_cme(cc), rb_sym_to_s)) {
// rb_sym_to_s() allocates a mutable string, but since we are only
// going to use this string for interpolation, it's fine to use the
// frozen string.
return rb_sym2str(recv);
}
break;
case T_MODULE:
case T_CLASS:
if (check_cfunc(vm_cc_cme(cc), rb_mod_to_s)) {
// rb_mod_to_s() allocates a mutable string, but since we are only
// going to use this string for interpolation, it's fine to use the
// frozen string.
VALUE val = rb_mod_name(recv);
if (NIL_P(val)) {
val = rb_mod_to_s(recv);
}
return val;
}
break;
case T_NIL:
if (check_cfunc(vm_cc_cme(cc), rb_nil_to_s)) {
return rb_nil_to_s(recv);
}
break;
case T_TRUE:
if (check_cfunc(vm_cc_cme(cc), rb_true_to_s)) {
return rb_true_to_s(recv);
}
break;
case T_FALSE:
if (check_cfunc(vm_cc_cme(cc), rb_false_to_s)) {
return rb_false_to_s(recv);
}
break;
case T_FIXNUM:
if (check_cfunc(vm_cc_cme(cc), rb_int_to_s)) {
return rb_fix_to_s(recv);
}
break;
}
return Qundef;
}
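/* Literal .freeze / .-@ fast paths: when the corresponding basic operation
 * is unredefined for Array/Hash/String, the operand is returned as-is;
 * Qundef tells the caller to fall back to an ordinary method call. */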
static VALUE
vm_opt_ary_freeze(VALUE ary, int bop, ID id)
{
if (BASIC_OP_UNREDEFINED_P(bop, ARRAY_REDEFINED_OP_FLAG)) {
return ary;
}
else {
return Qundef;
}
}
static VALUE
vm_opt_hash_freeze(VALUE hash, int bop, ID id)
{
if (BASIC_OP_UNREDEFINED_P(bop, HASH_REDEFINED_OP_FLAG)) {
return hash;
}
else {
return Qundef;
}
}
static VALUE
vm_opt_str_freeze(VALUE str, int bop, ID id)
{
if (BASIC_OP_UNREDEFINED_P(bop, STRING_REDEFINED_OP_FLAG)) {
return str;
}
else {
return Qundef;
}
}
/* this macro is mandatory to use OPTIMIZED_CMP. What a design! */
#define id_cmp idCmp
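/* [x, y, ...].max without materializing the array: when Array#max is
 * unredefined, scan the num operands still sitting on the VM stack and keep
 * the largest (via OPTIMIZED_CMP); otherwise build the array and dispatch
 * #max normally, honoring refinements. */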
static VALUE
vm_opt_newarray_max(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
{
if (BASIC_OP_UNREDEFINED_P(BOP_MAX, ARRAY_REDEFINED_OP_FLAG)) {
if (num == 0) {
return Qnil;
}
else {
VALUE result = *ptr;
rb_snum_t i = num - 1;
while (i-- > 0) {
const VALUE v = *++ptr;
if (OPTIMIZED_CMP(v, result) > 0) {
result = v;
}
}
return result;
}
}
else {
return rb_vm_call_with_refinements(ec, rb_ary_new4(num, ptr), idMax, 0, NULL, RB_NO_KEYWORDS);
}
}
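/* Exported wrapper so the same fast path is reachable from outside this
 * file (JIT-generated code, for example). */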
VALUE
rb_vm_opt_newarray_max(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
{
return vm_opt_newarray_max(ec, num, ptr);
}
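/* Symmetric to vm_opt_newarray_max(): keeps the smallest element instead. */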
static VALUE
vm_opt_newarray_min(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
{
if (BASIC_OP_UNREDEFINED_P(BOP_MIN, ARRAY_REDEFINED_OP_FLAG)) {
if (num == 0) {
return Qnil;
}
else {
VALUE result = *ptr;
rb_snum_t i = num - 1;
while (i-- > 0) {
const VALUE v = *++ptr;
if (OPTIMIZED_CMP(v, result) < 0) {
result = v;
}
}
return result;
}
}
else {
return rb_vm_call_with_refinements(ec, rb_ary_new4(num, ptr), idMin, 0, NULL, RB_NO_KEYWORDS);
}
}
VALUE
rb_vm_opt_newarray_min(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
{
return vm_opt_newarray_min(ec, num, ptr);
}
static VALUE
vm_opt_newarray_hash(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
{
// If Array#hash is _not_ monkeypatched, use the optimized call
if (BASIC_OP_UNREDEFINED_P(BOP_HASH, ARRAY_REDEFINED_OP_FLAG)) {
return rb_ary_hash_values(num, ptr);
}
else {
return rb_vm_call_with_refinements(ec, rb_ary_new4(num, ptr), idHash, 0, NULL, RB_NO_KEYWORDS);
}
}
VALUE
rb_vm_opt_newarray_hash(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr)
{
return vm_opt_newarray_hash(ec, num, ptr);
}
VALUE rb_setup_fake_ary(struct RArray *fake_ary, const VALUE *list, long len);
VALUE rb_ec_pack_ary(rb_execution_context_t *ec, VALUE ary, VALUE fmt, VALUE buffer);
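/* Pack the `num` operands at `ptr` with format `fmt`. While Array#pack is
 * unredefined (BOP_PACK), the operands are wrapped in a stack-allocated fake
 * RArray and packed directly, so no heap Array is created; `buffer` may be
 * Qundef to mean "no buffer: keyword". Otherwise a real Array is built and
 * #pack is dispatched so redefinitions and refinements are honored. */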
static VALUE
vm_opt_newarray_pack_buffer(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr, VALUE fmt, VALUE buffer)
{
if (BASIC_OP_UNREDEFINED_P(BOP_PACK, ARRAY_REDEFINED_OP_FLAG)) {
struct RArray fake_ary;
VALUE ary = rb_setup_fake_ary(&fake_ary, ptr, num);
return rb_ec_pack_ary(ec, ary, fmt, (UNDEF_P(buffer) ? Qnil : buffer));
}
else {
// The opt_newarray_send insn drops the keyword args, so we need to rebuild them.
// Set up the argument array with room for the keyword hash.
VALUE args[2];
args[0] = fmt;
int kw_splat = RB_NO_KEYWORDS;
int argc = 1;
if (!UNDEF_P(buffer)) {
args[1] = rb_hash_new_with_size(1);
rb_hash_aset(args[1], ID2SYM(idBuffer), buffer);
kw_splat = RB_PASS_KEYWORDS;
argc++;
}
return rb_vm_call_with_refinements(ec, rb_ary_new4(num, ptr), idPack, argc, args, kw_splat);
}
}
VALUE
rb_vm_opt_newarray_pack_buffer(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr, VALUE fmt, VALUE buffer)
{
return vm_opt_newarray_pack_buffer(ec, num, ptr, fmt, buffer);
}
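/* Variant used when no buffer: keyword was given; Qundef tells
 * vm_opt_newarray_pack_buffer not to forward a buffer argument. */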
VALUE
rb_vm_opt_newarray_pack(rb_execution_context_t *ec, rb_num_t num, const VALUE *ptr, VALUE fmt)
{
return vm_opt_newarray_pack_buffer(ec, num, ptr, fmt, Qundef);
}
#undef id_cmp
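/* Register the inline cache `ic` under the constant name `id` in the
 * VM-wide constant_cache table (an rb_id_table mapping ID -> st_table of
 * ICs), so that changes to that constant can invalidate every cache that
 * depends on it. */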
static void
vm_track_constant_cache(ID id, void *ic)
{
struct rb_id_table *const_cache = GET_VM()->constant_cache;
VALUE lookup_result;
st_table *ics;
if (rb_id_table_lookup(const_cache, id, &lookup_result)) {
ics = (st_table *)lookup_result;
}
else {
ics = st_init_numtable();
rb_id_table_insert(const_cache, id, (VALUE)ics);
}
st_insert(ics, (st_data_t) ic, (st_data_t) Qtrue);
}
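/* Register `ic` against every name in its constant path. `segments` is the
 * null-terminated ID list stored in the IC; idNULL entries (which mark an
 * absolute, ::-rooted reference) are skipped. Registration happens under
 * the VM lock via vm_track_constant_cache(). */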
static void
vm_ic_track_const_chain(rb_control_frame_t *cfp, IC ic, const ID *segments)
{
RB_VM_LOCK_ENTER();
for (int i = 0; segments[i]; i++) {
ID id = segments[i];
if (id == idNULL) continue;
vm_track_constant_cache(id, ic);
}
RB_VM_LOCK_LEAVE();
}
// For RJIT inlining
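/* A cached constant value may be reused only when it is Ractor-shareable
 * or we are running on the main Ractor, and when the cached CREF (if any)
 * matches the CREF of the current frame. */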
static inline bool
vm_inlined_ic_hit_p(VALUE flags, VALUE value, const rb_cref_t *ic_cref, const VALUE *reg_ep)
{
if ((flags & IMEMO_CONST_CACHE_SHAREABLE) || rb_ractor_main_p()) {
VM_ASSERT(ractor_incidental_shareable_p(flags & IMEMO_CONST_CACHE_SHAREABLE, value));
return (ic_cref == NULL || // no need to check CREF
ic_cref == vm_get_cref(reg_ep));
}
return false;
}
static bool
vm_ic_hit_p(const struct iseq_inline_constant_cache_entry *ice, const VALUE *reg_ep)
{
VM_ASSERT(IMEMO_TYPE_P(ice, imemo_constcache));
return vm_inlined_ic_hit_p(ice->flags, ice->value, ice->ic_cref, reg_ep);
}
// YJIT needs this function to never allocate and never raise
bool
rb_vm_ic_hit_p(IC ic, const VALUE *reg_ep)
{
return ic->entry && vm_ic_hit_p(ic->entry, reg_ep);
}
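/* Repopulate the inline cache after a miss: allocate an imemo constcache
 * entry holding `val` and the current const-key CREF, mark it shareable
 * when the value is Ractor-shareable, and store it into the IC with a
 * write barrier. If a const_missing hook ran during the lookup, the cache
 * is cleared instead. YJIT and RJIT are told the instruction position so
 * they can update their own constant caches. */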
static void
vm_ic_update(const rb_iseq_t *iseq, IC ic, VALUE val, const VALUE *reg_ep, const VALUE *pc)
{
if (ruby_vm_const_missing_count > 0) {
ruby_vm_const_missing_count = 0;
ic->entry = NULL;
return;
}
struct iseq_inline_constant_cache_entry *ice = IMEMO_NEW(struct iseq_inline_constant_cache_entry, imemo_constcache, 0);
RB_OBJ_WRITE(ice, &ice->value, val);
ice->ic_cref = vm_get_const_key_cref(reg_ep);
if (rb_ractor_shareable_p(val)) ice->flags |= IMEMO_CONST_CACHE_SHAREABLE;
RB_OBJ_WRITE(iseq, &ic->entry, ice);
RUBY_ASSERT(pc >= ISEQ_BODY(iseq)->iseq_encoded);
unsigned pos = (unsigned)(pc - ISEQ_BODY(iseq)->iseq_encoded);
rb_yjit_constant_ic_update(iseq, ic, pos);
rb_rjit_constant_ic_update(iseq, ic, pos);
}
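/* Body of the opt_getconstant_path instruction. On a cache hit the cached
 * value is returned directly; on a miss the full lookup along ic->segments
 * is performed, the IC is registered for invalidation, and the cache is
 * refilled. For example, `::Foo::Bar::Baz` compiles to a single
 * opt_getconstant_path insn whose IC carries the whole path. */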
VALUE
rb_vm_opt_getconstant_path(rb_execution_context_t *ec, rb_control_frame_t *const reg_cfp, IC ic)
{
VALUE val;
const ID *segments = ic->segments;
struct iseq_inline_constant_cache_entry *ice = ic->entry;
if (ice && vm_ic_hit_p(ice, GET_EP())) {
val = ice->value;
VM_ASSERT(val == vm_get_ev_const_chain(ec, segments));
}
else {
ruby_vm_constant_cache_misses++;
val = vm_get_ev_const_chain(ec, segments);
vm_ic_track_const_chain(GET_CFP(), ic, segments);
// Undo the PC increment to get the address of this instruction
// INSN_ATTR(width) == 2
vm_ic_update(GET_ISEQ(), ic, val, GET_EP(), GET_PC() - 2);
}
return val;
}
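/* Run the `once` block iseq at most once per ISE. The sentinel pointer 0x1
 * (RUNNING_THREAD_ONCE_DONE) records that execution has finished and the
 * memoized value can be returned. The first thread to arrive stores itself
 * in is->once.running_thread and evaluates the body; re-entry from the same
 * thread (recursive once) simply re-executes the body, while other threads
 * check interrupts and reschedule until the first run completes. */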
static VALUE
vm_once_dispatch(rb_execution_context_t *ec, ISEQ iseq, ISE is)
{
rb_thread_t *th = rb_ec_thread_ptr(ec);
rb_thread_t *const RUNNING_THREAD_ONCE_DONE = (rb_thread_t *)(0x1);
again:
if (is->once.running_thread == RUNNING_THREAD_ONCE_DONE) {
return is->once.value;
}
else if (is->once.running_thread == NULL) {
VALUE val;
is->once.running_thread = th;
val = rb_ensure(vm_once_exec, (VALUE)iseq, vm_once_clear, (VALUE)is);
RB_OBJ_WRITE(ec->cfp->iseq, &is->once.value, val);
/* is->once.running_thread is cleared by vm_once_clear() */
is->once.running_thread = RUNNING_THREAD_ONCE_DONE; /* success */
return val;
}
else if (is->once.running_thread == th) {
/* recursive once */
return vm_once_exec((VALUE)iseq);
}
else {
/* waiting for finish */
RUBY_VM_CHECK_INTS(ec);
rb_thread_schedule();
goto again;
}
}
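/* Fast path for the opt_case_dispatch instruction: the compiler collects the
 * literal `when` values of a case statement into a CDHASH mapping value to
 * branch offset.  A hit returns that offset, a miss returns else_offset, and
 * 0 tells the caller to fall back to testing each clause with #=== in order. */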
static OFFSET
vm_case_dispatch(CDHASH hash, OFFSET else_offset, VALUE key)
{
switch (OBJ_BUILTIN_TYPE(key)) {
case -1:
case T_FLOAT:
case T_SYMBOL:
case T_BIGNUM:
case T_STRING:
if (BASIC_OP_UNREDEFINED_P(BOP_EQQ,
SYMBOL_REDEFINED_OP_FLAG |
INTEGER_REDEFINED_OP_FLAG |
FLOAT_REDEFINED_OP_FLAG |
NIL_REDEFINED_OP_FLAG |
TRUE_REDEFINED_OP_FLAG |
FALSE_REDEFINED_OP_FLAG |
STRING_REDEFINED_OP_FLAG)) {
st_data_t val;
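        /* An integral Float key (e.g. 1.0) is folded to the matching Integer
         * so the hash lookup agrees with #=== semantics: `when 1` should also
         * catch `case 1.0`. */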
if (RB_FLOAT_TYPE_P(key)) {
double kval = RFLOAT_VALUE(key);
if (!isinf(kval) && modf(kval, &kval) == 0.0) {
key = FIXABLE(kval) ? LONG2FIX((long)kval) : rb_dbl2big(kval);
}
}
if (rb_hash_stlike_lookup(hash, key, &val)) {
return FIX2LONG((VALUE)val);
}
else {
return else_offset;
}
}
}
return 0;
}
NORETURN(static void
vm_stack_consistency_error(const rb_execution_context_t *ec,
const rb_control_frame_t *,
const VALUE *));
static void
vm_stack_consistency_error(const rb_execution_context_t *ec,
const rb_control_frame_t *cfp,
const VALUE *bp)
{
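    /* Report sp and bp as stack-slot counts (VM_SP_CNT).  Development builds
     * raise a fatal error that includes the disassembled iseq; release builds
     * abort via rb_bug. */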
const ptrdiff_t nsp = VM_SP_CNT(ec, cfp->sp);
const ptrdiff_t nbp = VM_SP_CNT(ec, bp);
static const char stack_consistency_error[] =
"Stack consistency error (sp: %"PRIdPTRDIFF", bp: %"PRIdPTRDIFF")";
#if defined RUBY_DEVEL
VALUE mesg = rb_sprintf(stack_consistency_error, nsp, nbp);
rb_str_cat_cstr(mesg, "\n");
rb_str_append(mesg, rb_iseq_disasm(cfp->iseq));
rb_exc_fatal(rb_exc_new3(rb_eFatal, mesg));
#else
rb_bug(stack_consistency_error, nsp, nbp);
#endif
}
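/* The vm_opt_* helpers below back the specialized opt_* instructions.  Each
 * returns Qundef when the operand types are not handled or the relevant basic
 * operation has been redefined; the caller then falls back to an ordinary
 * method call. */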
static VALUE
vm_opt_plus(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS, INTEGER_REDEFINED_OP_FLAG)) {
return rb_fix_plus_fix(recv, obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(RFLOAT_VALUE(recv) + RFLOAT_VALUE(obj));
}
else if (RBASIC_CLASS(recv) == rb_cString &&
RBASIC_CLASS(obj) == rb_cString &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS, STRING_REDEFINED_OP_FLAG)) {
return rb_str_opt_plus(recv, obj);
}
else if (RBASIC_CLASS(recv) == rb_cArray &&
RBASIC_CLASS(obj) == rb_cArray &&
BASIC_OP_UNREDEFINED_P(BOP_PLUS, ARRAY_REDEFINED_OP_FLAG)) {
return rb_ary_plus(recv, obj);
}
else {
return Qundef;
}
}
static VALUE
vm_opt_minus(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_MINUS, INTEGER_REDEFINED_OP_FLAG)) {
return rb_fix_minus_fix(recv, obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_MINUS, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(RFLOAT_VALUE(recv) - RFLOAT_VALUE(obj));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_MINUS, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(RFLOAT_VALUE(recv) - RFLOAT_VALUE(obj));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_mult(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_MULT, INTEGER_REDEFINED_OP_FLAG)) {
return rb_fix_mul_fix(recv, obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_MULT, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(RFLOAT_VALUE(recv) * RFLOAT_VALUE(obj));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_MULT, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(RFLOAT_VALUE(recv) * RFLOAT_VALUE(obj));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_div(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_DIV, INTEGER_REDEFINED_OP_FLAG)) {
return (FIX2LONG(obj) == 0) ? Qundef : rb_fix_div_fix(recv, obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_DIV, FLOAT_REDEFINED_OP_FLAG)) {
return rb_flo_div_flo(recv, obj);
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_DIV, FLOAT_REDEFINED_OP_FLAG)) {
return rb_flo_div_flo(recv, obj);
}
else {
return Qundef;
}
}
static VALUE
vm_opt_mod(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_MOD, INTEGER_REDEFINED_OP_FLAG)) {
return (FIX2LONG(obj) == 0) ? Qundef : rb_fix_mod_fix(recv, obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_MOD, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(ruby_float_mod(RFLOAT_VALUE(recv), RFLOAT_VALUE(obj)));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_MOD, FLOAT_REDEFINED_OP_FLAG)) {
return DBL2NUM(ruby_float_mod(RFLOAT_VALUE(recv), RFLOAT_VALUE(obj)));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_neq(const rb_iseq_t *iseq, CALL_DATA cd, CALL_DATA cd_eq, VALUE recv, VALUE obj)
{
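    /* When `recv != obj` still resolves to the default rb_obj_not_equal,
     * `!=` is just the negation of `==`, so reuse the equality fast path. */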
if (vm_method_cfunc_is(iseq, cd, recv, rb_obj_not_equal)) {
VALUE val = opt_equality(iseq, recv, obj, cd_eq);
if (!UNDEF_P(val)) {
return RBOOL(!RTEST(val));
}
}
return Qundef;
}
static VALUE
vm_opt_lt(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_LT, INTEGER_REDEFINED_OP_FLAG)) {
return RBOOL((SIGNED_VALUE)recv < (SIGNED_VALUE)obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_LT, FLOAT_REDEFINED_OP_FLAG)) {
return RBOOL(RFLOAT_VALUE(recv) < RFLOAT_VALUE(obj));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_LT, FLOAT_REDEFINED_OP_FLAG)) {
CHECK_CMP_NAN(RFLOAT_VALUE(recv), RFLOAT_VALUE(obj));
return RBOOL(RFLOAT_VALUE(recv) < RFLOAT_VALUE(obj));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_le(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_LE, INTEGER_REDEFINED_OP_FLAG)) {
return RBOOL((SIGNED_VALUE)recv <= (SIGNED_VALUE)obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_LE, FLOAT_REDEFINED_OP_FLAG)) {
return RBOOL(RFLOAT_VALUE(recv) <= RFLOAT_VALUE(obj));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_LE, FLOAT_REDEFINED_OP_FLAG)) {
CHECK_CMP_NAN(RFLOAT_VALUE(recv), RFLOAT_VALUE(obj));
return RBOOL(RFLOAT_VALUE(recv) <= RFLOAT_VALUE(obj));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_gt(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_GT, INTEGER_REDEFINED_OP_FLAG)) {
return RBOOL((SIGNED_VALUE)recv > (SIGNED_VALUE)obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_GT, FLOAT_REDEFINED_OP_FLAG)) {
return RBOOL(RFLOAT_VALUE(recv) > RFLOAT_VALUE(obj));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_GT, FLOAT_REDEFINED_OP_FLAG)) {
CHECK_CMP_NAN(RFLOAT_VALUE(recv), RFLOAT_VALUE(obj));
return RBOOL(RFLOAT_VALUE(recv) > RFLOAT_VALUE(obj));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_ge(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_GE, INTEGER_REDEFINED_OP_FLAG)) {
return RBOOL((SIGNED_VALUE)recv >= (SIGNED_VALUE)obj);
}
else if (FLONUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_GE, FLOAT_REDEFINED_OP_FLAG)) {
return RBOOL(RFLOAT_VALUE(recv) >= RFLOAT_VALUE(obj));
}
else if (SPECIAL_CONST_P(recv) || SPECIAL_CONST_P(obj)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cFloat &&
RBASIC_CLASS(obj) == rb_cFloat &&
BASIC_OP_UNREDEFINED_P(BOP_GE, FLOAT_REDEFINED_OP_FLAG)) {
CHECK_CMP_NAN(RFLOAT_VALUE(recv), RFLOAT_VALUE(obj));
return RBOOL(RFLOAT_VALUE(recv) >= RFLOAT_VALUE(obj));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_ltlt(VALUE recv, VALUE obj)
{
if (SPECIAL_CONST_P(recv)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cString &&
BASIC_OP_UNREDEFINED_P(BOP_LTLT, STRING_REDEFINED_OP_FLAG)) {
if (LIKELY(RB_TYPE_P(obj, T_STRING))) {
return rb_str_buf_append(recv, obj);
}
else {
return rb_str_concat(recv, obj);
}
}
else if (RBASIC_CLASS(recv) == rb_cArray &&
BASIC_OP_UNREDEFINED_P(BOP_LTLT, ARRAY_REDEFINED_OP_FLAG)) {
return rb_ary_push(recv, obj);
}
else {
return Qundef;
}
}
static VALUE
vm_opt_and(VALUE recv, VALUE obj)
{
// If recv and obj are both fixnums, then the bottom tag bit
// will be 1 on both. 1 & 1 == 1, so the result value will also
// be a fixnum. If either side is *not* a fixnum, then the tag bit
// will be 0, and we return Qundef.
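    // Illustrative, assuming the usual tagged-Fixnum layout (2n + 1):
    //   5 -> 0b1011, 3 -> 0b0111, and 0b1011 & 0b0111 == 0b0011,
    //   which is the tagged form of 1 == (5 & 3).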
VALUE ret = ((SIGNED_VALUE) recv) & ((SIGNED_VALUE) obj);
if (FIXNUM_P(ret) &&
BASIC_OP_UNREDEFINED_P(BOP_AND, INTEGER_REDEFINED_OP_FLAG)) {
return ret;
}
else {
return Qundef;
}
}
static VALUE
vm_opt_or(VALUE recv, VALUE obj)
{
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_OR, INTEGER_REDEFINED_OP_FLAG)) {
return recv | obj;
}
else {
return Qundef;
}
}
static VALUE
vm_opt_aref(VALUE recv, VALUE obj)
{
if (SPECIAL_CONST_P(recv)) {
if (FIXNUM_2_P(recv, obj) &&
BASIC_OP_UNREDEFINED_P(BOP_AREF, INTEGER_REDEFINED_OP_FLAG)) {
return rb_fix_aref(recv, obj);
}
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cArray &&
BASIC_OP_UNREDEFINED_P(BOP_AREF, ARRAY_REDEFINED_OP_FLAG)) {
if (FIXNUM_P(obj)) {
return rb_ary_entry_internal(recv, FIX2LONG(obj));
}
else {
return rb_ary_aref1(recv, obj);
}
}
else if (RBASIC_CLASS(recv) == rb_cHash &&
BASIC_OP_UNREDEFINED_P(BOP_AREF, HASH_REDEFINED_OP_FLAG)) {
return rb_hash_aref(recv, obj);
}
else {
return Qundef;
}
}
static VALUE
vm_opt_aset(VALUE recv, VALUE obj, VALUE set)
{
if (SPECIAL_CONST_P(recv)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cArray &&
BASIC_OP_UNREDEFINED_P(BOP_ASET, ARRAY_REDEFINED_OP_FLAG) &&
FIXNUM_P(obj)) {
rb_ary_store(recv, FIX2LONG(obj), set);
return set;
}
else if (RBASIC_CLASS(recv) == rb_cHash &&
BASIC_OP_UNREDEFINED_P(BOP_ASET, HASH_REDEFINED_OP_FLAG)) {
rb_hash_aset(recv, obj, set);
return set;
}
else {
return Qundef;
}
}
static VALUE
vm_opt_aref_with(VALUE recv, VALUE key)
{
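    /* Specialized Hash#[] with a string-literal key (opt_aref_with): only
     * taken for a plain Hash whose #[] is unredefined, that does not compare
     * by identity, and that has no default proc; the literal key from the
     * instruction operand is then handed straight to rb_hash_aref. */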
if (!SPECIAL_CONST_P(recv) && RBASIC_CLASS(recv) == rb_cHash &&
BASIC_OP_UNREDEFINED_P(BOP_AREF, HASH_REDEFINED_OP_FLAG) &&
rb_hash_compare_by_id_p(recv) == Qfalse &&
!FL_TEST(recv, RHASH_PROC_DEFAULT)) {
return rb_hash_aref(recv, key);
}
else {
return Qundef;
}
}
VALUE
rb_vm_opt_aref_with(VALUE recv, VALUE key)
{
return vm_opt_aref_with(recv, key);
}
static VALUE
vm_opt_aset_with(VALUE recv, VALUE key, VALUE val)
{
if (!SPECIAL_CONST_P(recv) && RBASIC_CLASS(recv) == rb_cHash &&
BASIC_OP_UNREDEFINED_P(BOP_ASET, HASH_REDEFINED_OP_FLAG) &&
rb_hash_compare_by_id_p(recv) == Qfalse) {
return rb_hash_aset(recv, key, val);
}
else {
return Qundef;
}
}
static VALUE
vm_opt_length(VALUE recv, int bop)
{
if (SPECIAL_CONST_P(recv)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cString &&
BASIC_OP_UNREDEFINED_P(bop, STRING_REDEFINED_OP_FLAG)) {
if (bop == BOP_EMPTY_P) {
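            /* empty? only cares about zero vs. non-zero, so the byte length
             * (RSTRING_LEN) suffices; #length/#size below must count
             * characters via rb_str_length. */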
return LONG2NUM(RSTRING_LEN(recv));
}
else {
return rb_str_length(recv);
}
}
else if (RBASIC_CLASS(recv) == rb_cArray &&
BASIC_OP_UNREDEFINED_P(bop, ARRAY_REDEFINED_OP_FLAG)) {
return LONG2NUM(RARRAY_LEN(recv));
}
else if (RBASIC_CLASS(recv) == rb_cHash &&
BASIC_OP_UNREDEFINED_P(bop, HASH_REDEFINED_OP_FLAG)) {
return INT2FIX(RHASH_SIZE(recv));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_empty_p(VALUE recv)
{
switch (vm_opt_length(recv, BOP_EMPTY_P)) {
case Qundef: return Qundef;
case INT2FIX(0): return Qtrue;
default: return Qfalse;
}
}
VALUE rb_false(VALUE obj);
static VALUE
vm_opt_nil_p(const rb_iseq_t *iseq, CALL_DATA cd, VALUE recv)
{
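    /* A nil receiver with NilClass#nil? unredefined => true; a non-nil
     * receiver whose nil? still resolves to the default rb_false => false;
     * anything else falls back to a normal call. */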
if (NIL_P(recv) &&
BASIC_OP_UNREDEFINED_P(BOP_NIL_P, NIL_REDEFINED_OP_FLAG)) {
return Qtrue;
}
else if (vm_method_cfunc_is(iseq, cd, recv, rb_false)) {
return Qfalse;
}
else {
return Qundef;
}
}
static VALUE
fix_succ(VALUE x)
{
switch (x) {
case ~0UL:
/* 0xFFFF_FFFF == INT2FIX(-1)
* `-1.succ` is of course 0. */
return INT2FIX(0);
case RSHIFT(~0UL, 1):
/* 0x7FFF_FFFF == LONG2FIX(0x3FFF_FFFF)
* 0x3FFF_FFFF + 1 == 0x4000_0000, which is a Bignum. */
return rb_uint2big(1UL << (SIZEOF_LONG * CHAR_BIT - 2));
default:
/* LONG2FIX(FIX2LONG(x)+FIX2LONG(y))
* == ((lx*2+1)/2 + (ly*2+1)/2)*2+1
* == lx*2 + ly*2 + 1
* == (lx*2+1) + (ly*2+1) - 1
* == x + y - 1
*
* Here, if we put y := INT2FIX(1):
*
* == x + INT2FIX(1) - 1
* == x + 2 .
*/
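        /* e.g. INT2FIX(41) == 83, and 83 + 2 == 85 == INT2FIX(42). */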
return x + 2;
}
}
static VALUE
vm_opt_succ(VALUE recv)
{
if (FIXNUM_P(recv) &&
BASIC_OP_UNREDEFINED_P(BOP_SUCC, INTEGER_REDEFINED_OP_FLAG)) {
return fix_succ(recv);
}
else if (SPECIAL_CONST_P(recv)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cString &&
BASIC_OP_UNREDEFINED_P(BOP_SUCC, STRING_REDEFINED_OP_FLAG)) {
return rb_str_succ(recv);
}
else {
return Qundef;
}
}
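/* As with the other vm_opt_* helpers, Qundef means "not handled by the fast
 * path"; the calling instruction then falls back to an ordinary method
 * dispatch of #succ. */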
static VALUE
vm_opt_not(const rb_iseq_t *iseq, CALL_DATA cd, VALUE recv)
{
if (vm_method_cfunc_is(iseq, cd, recv, rb_obj_not)) {
return RBOOL(!RTEST(recv));
}
else {
return Qundef;
}
}
static VALUE
vm_opt_regexpmatch2(VALUE recv, VALUE obj)
{
if (SPECIAL_CONST_P(recv)) {
return Qundef;
}
else if (RBASIC_CLASS(recv) == rb_cString &&
CLASS_OF(obj) == rb_cRegexp &&
BASIC_OP_UNREDEFINED_P(BOP_MATCH, STRING_REDEFINED_OP_FLAG)) {
return rb_reg_match(obj, recv);
}
else if (RBASIC_CLASS(recv) == rb_cRegexp &&
BASIC_OP_UNREDEFINED_P(BOP_MATCH, REGEXP_REDEFINED_OP_FLAG)) {
return rb_reg_match(recv, obj);
}
else {
return Qundef;
}
}
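/* Illustrative mapping of the fast paths above (both end up in rb_reg_match
 * with the regexp as the receiver):
 *   "abc" =~ /b/  ->  rb_reg_match(/b/, "abc")   (String#=~ not redefined)
 *   /b/ =~ "abc"  ->  rb_reg_match(/b/, "abc")   (Regexp#=~ not redefined)
 * Anything else returns Qundef and takes the ordinary slow-path dispatch. */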
rb_event_flag_t rb_iseq_event_flags(const rb_iseq_t *iseq, size_t pos);
NOINLINE(static void vm_trace(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp));
static inline void
vm_trace_hook(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, const VALUE *pc,
rb_event_flag_t pc_events, rb_event_flag_t target_event,
rb_hook_list_t *global_hooks, rb_hook_list_t *const *local_hooks_ptr, VALUE val)
{
rb_event_flag_t event = pc_events & target_event;
VALUE self = GET_SELF();
VM_ASSERT(rb_popcount64((uint64_t)event) == 1);
if (event & global_hooks->events) {
/* increment PC because source line is calculated with PC-1 */
reg_cfp->pc++;
vm_dtrace(event, ec);
        rb_exec_event_hook_orig(ec, global_hooks, event, self, 0, 0, 0, val, 0);
reg_cfp->pc--;
}
// Load here since global hook above can add and free local hooks
rb_hook_list_t *local_hooks = *local_hooks_ptr;
if (local_hooks != NULL) {
if (event & local_hooks->events) {
/* increment PC because source line is calculated with PC-1 */
reg_cfp->pc++;
            rb_exec_event_hook_orig(ec, local_hooks, event, self, 0, 0, 0, val, 0);
reg_cfp->pc--;
}
}
}
#define VM_TRACE_HOOK(target_event, val) do { \
if ((pc_events & (target_event)) & enabled_flags) { \
vm_trace_hook(ec, reg_cfp, pc, pc_events, (target_event), global_hooks, local_hooks_ptr, (val)); \
} \
} while (0)
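/* Rough expansion of the macro above for a single event, e.g.
 * VM_TRACE_HOOK(RUBY_EVENT_LINE, Qundef):
 *
 *   if ((pc_events & RUBY_EVENT_LINE) & enabled_flags) {
 *       vm_trace_hook(ec, reg_cfp, pc, pc_events, RUBY_EVENT_LINE,
 *                     global_hooks, local_hooks_ptr, Qundef);
 *   }
 *
 * so a hook only fires when the event is attached to the current PC and is
 * enabled by a global or iseq-local TracePoint. */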
static VALUE
rescue_errinfo(rb_execution_context_t *ec, rb_control_frame_t *cfp)
{
VM_ASSERT(VM_FRAME_RUBYFRAME_P(cfp));
VM_ASSERT(ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_RESCUE);
return cfp->ep[VM_ENV_INDEX_LAST_LVAR];
}
static void
vm_trace(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp)
{
const VALUE *pc = reg_cfp->pc;
rb_event_flag_t enabled_flags = ruby_vm_event_flags & ISEQ_TRACE_EVENTS;
rb_event_flag_t global_events = enabled_flags;
if (enabled_flags == 0 && ruby_vm_event_local_num == 0) {
return;
}
else {
const rb_iseq_t *iseq = reg_cfp->iseq;
VALUE iseq_val = (VALUE)iseq;
size_t pos = pc - ISEQ_BODY(iseq)->iseq_encoded;
rb_event_flag_t pc_events = rb_iseq_event_flags(iseq, pos);
rb_hook_list_t *local_hooks = iseq->aux.exec.local_hooks;
rb_hook_list_t *const *local_hooks_ptr = &iseq->aux.exec.local_hooks;
rb_event_flag_t iseq_local_events = local_hooks != NULL ? local_hooks->events : 0;
rb_hook_list_t *bmethod_local_hooks = NULL;
rb_hook_list_t **bmethod_local_hooks_ptr = NULL;
rb_event_flag_t bmethod_local_events = 0;
const bool bmethod_frame = VM_FRAME_BMETHOD_P(reg_cfp);
enabled_flags |= iseq_local_events;
VM_ASSERT((iseq_local_events & ~ISEQ_TRACE_EVENTS) == 0);
if (bmethod_frame) {
const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(reg_cfp);
VM_ASSERT(me->def->type == VM_METHOD_TYPE_BMETHOD);
bmethod_local_hooks = me->def->body.bmethod.hooks;
bmethod_local_hooks_ptr = &me->def->body.bmethod.hooks;
if (bmethod_local_hooks) {
bmethod_local_events = bmethod_local_hooks->events;
}
}
if ((pc_events & enabled_flags) == 0 && !bmethod_frame) {
#if 0
/* disable trace */
/* TODO: incomplete */
rb_iseq_trace_set(iseq, vm_event_flags & ISEQ_TRACE_EVENTS);
#else
/* do not disable trace because of performance problem
* (re-enable overhead)
*/
#endif
return;
}
else if (ec->trace_arg != NULL) {
/* already tracing */
return;
}
else {
rb_hook_list_t *global_hooks = rb_ec_ractor_hooks(ec);
/* Note, not considering iseq local events here since the same
* iseq could be used in multiple bmethods. */
rb_event_flag_t bmethod_events = global_events | bmethod_local_events;
if (0) {
ruby_debug_printf("vm_trace>>%4d (%4x) - %s:%d %s\n",
(int)pos,
(int)pc_events,
RSTRING_PTR(rb_iseq_path(iseq)),
(int)rb_iseq_line_no(iseq, pos),
RSTRING_PTR(rb_iseq_label(iseq)));
}
VM_ASSERT(reg_cfp->pc == pc);
VM_ASSERT(pc_events != 0);
/* check traces */
if ((pc_events & RUBY_EVENT_B_CALL) && bmethod_frame && (bmethod_events & RUBY_EVENT_CALL)) {
/* b_call instruction running as a method. Fire call event. */
vm_trace_hook(ec, reg_cfp, pc, RUBY_EVENT_CALL, RUBY_EVENT_CALL, global_hooks, bmethod_local_hooks_ptr, Qundef);
}
VM_TRACE_HOOK(RUBY_EVENT_CLASS | RUBY_EVENT_CALL | RUBY_EVENT_B_CALL, Qundef);
VM_TRACE_HOOK(RUBY_EVENT_RESCUE, rescue_errinfo(ec, reg_cfp));
VM_TRACE_HOOK(RUBY_EVENT_LINE, Qundef);
VM_TRACE_HOOK(RUBY_EVENT_COVERAGE_LINE, Qundef);
VM_TRACE_HOOK(RUBY_EVENT_COVERAGE_BRANCH, Qundef);
VM_TRACE_HOOK(RUBY_EVENT_END | RUBY_EVENT_RETURN | RUBY_EVENT_B_RETURN, TOPN(0));
if ((pc_events & RUBY_EVENT_B_RETURN) && bmethod_frame && (bmethod_events & RUBY_EVENT_RETURN)) {
/* b_return instruction running as a method. Fire return event. */
vm_trace_hook(ec, reg_cfp, pc, RUBY_EVENT_RETURN, RUBY_EVENT_RETURN, global_hooks, bmethod_local_hooks_ptr, TOPN(0));
}
// Pin the iseq since `local_hooks_ptr` points inside the iseq's slot on the GC heap.
// We need the pointer to stay valid in case compaction happens in a trace hook.
//
// Similar treatment is unnecessary for `bmethod_local_hooks_ptr` since
// storage for `rb_method_definition_t` is not on the GC heap.
RB_GC_GUARD(iseq_val);
}
}
}
#undef VM_TRACE_HOOK
#if VM_CHECK_MODE > 0
NORETURN( NOINLINE( COLDFUNC
void rb_vm_canary_is_found_dead(enum ruby_vminsn_type i, VALUE c)));
void
Init_vm_stack_canary(void)
{
/* This has to be called _after_ our PRNG is properly set up. */
int n = ruby_fill_random_bytes(&vm_stack_canary, sizeof vm_stack_canary, false);
vm_stack_canary |= 0x01; // valid VALUE (Fixnum)
vm_stack_canary_was_born = true;
VM_ASSERT(n == 0);
}
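/* Setting the low bit above keeps the canary a Fixnum-tagged VALUE, so even if
 * it is ever observed on the VM stack it is a valid object reference rather
 * than a random pointer (a note assuming the usual Fixnum tagging). */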
void
rb_vm_canary_is_found_dead(enum ruby_vminsn_type i, VALUE c)
{
/* Because a method has already been called, why not call
* another one. */
const char *insn = rb_insns_name(i);
VALUE inspection = rb_inspect(c);
const char *str = StringValueCStr(inspection);
rb_bug("dead canary found at %s: %s", insn, str);
}
#else
void Init_vm_stack_canary(void) { /* nothing to do */ }
#endif
/* a part of the following code is generated by this ruby script:
16.times{|i|
typedef_args = (0...i).map{|j| "VALUE v#{j+1}"}.join(", ")
typedef_args.prepend(", ") if i != 0
call_args = (0...i).map{|j| "argv[#{j}]"}.join(", ")
call_args.prepend(", ") if i != 0
puts %Q{
static VALUE
builtin_invoker#{i}(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr#{i}_t)(rb_execution_context_t *ec, VALUE self#{typedef_args});
return (*(rb_invoke_funcptr#{i}_t)funcptr)(ec, self#{call_args});
}}
}
puts
puts "static VALUE (* const cfunc_invokers[])(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr) = {"
16.times{|i|
puts " builtin_invoker#{i},"
}
puts "};"
*/
static VALUE
builtin_invoker0(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr0_t)(rb_execution_context_t *ec, VALUE self);
return (*(rb_invoke_funcptr0_t)funcptr)(ec, self);
}
static VALUE
builtin_invoker1(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr1_t)(rb_execution_context_t *ec, VALUE self, VALUE v1);
return (*(rb_invoke_funcptr1_t)funcptr)(ec, self, argv[0]);
}
static VALUE
builtin_invoker2(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr2_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2);
return (*(rb_invoke_funcptr2_t)funcptr)(ec, self, argv[0], argv[1]);
}
static VALUE
builtin_invoker3(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr3_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3);
return (*(rb_invoke_funcptr3_t)funcptr)(ec, self, argv[0], argv[1], argv[2]);
}
static VALUE
builtin_invoker4(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr4_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4);
return (*(rb_invoke_funcptr4_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3]);
}
static VALUE
builtin_invoker5(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr5_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5);
return (*(rb_invoke_funcptr5_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4]);
}
static VALUE
builtin_invoker6(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr6_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6);
return (*(rb_invoke_funcptr6_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5]);
}
static VALUE
builtin_invoker7(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr7_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7);
return (*(rb_invoke_funcptr7_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6]);
}
static VALUE
builtin_invoker8(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr8_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8);
return (*(rb_invoke_funcptr8_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7]);
}
static VALUE
builtin_invoker9(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr9_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8, VALUE v9);
return (*(rb_invoke_funcptr9_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8]);
}
static VALUE
builtin_invoker10(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr10_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8, VALUE v9, VALUE v10);
return (*(rb_invoke_funcptr10_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9]);
}
static VALUE
builtin_invoker11(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr11_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8, VALUE v9, VALUE v10, VALUE v11);
return (*(rb_invoke_funcptr11_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10]);
}
static VALUE
builtin_invoker12(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr12_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8, VALUE v9, VALUE v10, VALUE v11, VALUE v12);
return (*(rb_invoke_funcptr12_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11]);
}
static VALUE
builtin_invoker13(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr13_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8, VALUE v9, VALUE v10, VALUE v11, VALUE v12, VALUE v13);
return (*(rb_invoke_funcptr13_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12]);
}
static VALUE
builtin_invoker14(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr14_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8, VALUE v9, VALUE v10, VALUE v11, VALUE v12, VALUE v13, VALUE v14);
return (*(rb_invoke_funcptr14_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13]);
}
static VALUE
builtin_invoker15(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr)
{
typedef VALUE (*rb_invoke_funcptr15_t)(rb_execution_context_t *ec, VALUE self, VALUE v1, VALUE v2, VALUE v3, VALUE v4, VALUE v5, VALUE v6, VALUE v7, VALUE v8, VALUE v9, VALUE v10, VALUE v11, VALUE v12, VALUE v13, VALUE v14, VALUE v15);
return (*(rb_invoke_funcptr15_t)funcptr)(ec, self, argv[0], argv[1], argv[2], argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], argv[9], argv[10], argv[11], argv[12], argv[13], argv[14]);
}
typedef VALUE (*builtin_invoker)(rb_execution_context_t *ec, VALUE self, const VALUE *argv, rb_insn_func_t funcptr);
static builtin_invoker
lookup_builtin_invoker(int argc)
{
static const builtin_invoker invokers[] = {
builtin_invoker0,
builtin_invoker1,
builtin_invoker2,
builtin_invoker3,
builtin_invoker4,
builtin_invoker5,
builtin_invoker6,
builtin_invoker7,
builtin_invoker8,
builtin_invoker9,
builtin_invoker10,
builtin_invoker11,
builtin_invoker12,
builtin_invoker13,
builtin_invoker14,
builtin_invoker15,
};
return invokers[argc];
}
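/* Example of the table dispatch above (illustrative): a builtin registered
 * with argc == 2 goes through builtin_invoker2, i.e.
 *
 *   lookup_builtin_invoker(2)(ec, self, argv, funcptr)
 *     == ((rb_invoke_funcptr2_t)funcptr)(ec, self, argv[0], argv[1])
 *
 * The table has 16 entries, so builtins take at most 15 arguments besides
 * ec and self. */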
static inline VALUE
invoke_bf(rb_execution_context_t *ec, rb_control_frame_t *reg_cfp, const struct rb_builtin_function* bf, const VALUE *argv)
{
const bool canary_p = ISEQ_BODY(reg_cfp->iseq)->builtin_attrs & BUILTIN_ATTR_LEAF; // Verify an assumption of `Primitive.attr! :leaf`
SETUP_CANARY(canary_p);
rb_insn_func_t func_ptr = (rb_insn_func_t)(uintptr_t)bf->func_ptr;
VALUE ret = (*lookup_builtin_invoker(bf->argc))(ec, reg_cfp->self, argv, func_ptr);
CHECK_CANARY(canary_p, BIN(invokebuiltin));
return ret;
}
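/* The canary is armed only for builtins declared leaf (`Primitive.attr! :leaf`);
 * if such a builtin nevertheless grows the VM stack and clobbers the canary,
 * the CHECK_CANARY path reports it through rb_vm_canary_is_found_dead above,
 * turning a wrong :leaf annotation into an immediate rb_bug rather than
 * silent stack corruption. */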
static VALUE
vm_invoke_builtin(rb_execution_context_t *ec, rb_control_frame_t *cfp, const struct rb_builtin_function* bf, const VALUE *argv)
{
return invoke_bf(ec, cfp, bf, argv);
}
static VALUE
vm_invoke_builtin_delegate(rb_execution_context_t *ec, rb_control_frame_t *cfp, const struct rb_builtin_function *bf, unsigned int start_index)
{
if (0) { // debug print
fputs("vm_invoke_builtin_delegate: passing -> ", stderr);
for (int i=0; i<bf->argc; i++) {
ruby_debug_printf(":%s ", rb_id2name(ISEQ_BODY(cfp->iseq)->local_table[i+start_index]));
}
ruby_debug_printf("\n" "%s %s(%d):%p\n", RUBY_FUNCTION_NAME_STRING, bf->name, bf->argc,
(void *)(uintptr_t)bf->func_ptr);
}
if (bf->argc == 0) {
return invoke_bf(ec, cfp, bf, NULL);
}
else {
const VALUE *argv = cfp->ep - ISEQ_BODY(cfp->iseq)->local_table_size - VM_ENV_DATA_SIZE + 1 + start_index;
return invoke_bf(ec, cfp, bf, argv);
}
}
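/* Frame layout note for the delegate case above (assuming the usual iseq frame
 * layout): locals sit immediately below the VM_ENV_DATA_SIZE environment slots
 * that end at cfp->ep, so
 *
 *   cfp->ep - local_table_size - VM_ENV_DATA_SIZE + 1
 *
 * is the address of local 0, and adding start_index skips the leading locals
 * that are not forwarded to the builtin. */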
// for __builtin_inline!()
VALUE
rb_vm_lvar_exposed(rb_execution_context_t *ec, int index)
{
const rb_control_frame_t *cfp = ec->cfp;
return cfp->ep[index];
}