From f302e725e10ae05e613e2c24cae0741f65f2db91 Mon Sep 17 00:00:00 2001 From: Alan Wu Date: Mon, 17 Jul 2023 13:57:58 -0400 Subject: [PATCH] Remove __bp__ and speed-up bmethod calls (#8060) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove rb_control_frame_t::__bp__ and optimize bmethod calls This commit removes the __bp__ field from rb_control_frame_t. It was introduced to help MJIT, but since MJIT was replaced by RJIT, we can use vm_base_ptr() to compute it from the SP of the previous control frame instead. Removing the field avoids needing to set it up when pushing new frames. Simply removing __bp__ would cause crashes since RJIT and YJIT used a slightly different stack layout for bmethod calls than the interpreter. At the moment of the call, the two layouts looked as follows: ┌────────────┐ ┌────────────┐ │ frame_base │ │ frame_base │ ├────────────┤ ├────────────┤ │ ... │ │ ... │ ├────────────┤ ├────────────┤ │ args │ │ args │ ├────────────┤ └────────────┘<─prev_frame_sp │ receiver │ prev_frame_sp─>└────────────┘ RJIT & YJIT interpreter Essentially, vm_base_ptr() needs to compute the address to frame_base given prev_frame_sp in the diagrams. The presence of the receiver created an off-by-one situation. Make the interpreter use the layout the JITs use for iseq-to-iseq bmethod calls. Doing so removes unnecessary argument shifting and vm_exec_core() re-entry from the interpreter, yielding a speed improvement visible through `benchmark/vm_defined_method.yml`: patched: 7578743.1 i/s master: 4796596.3 i/s - 1.58x slower C-to-iseq bmethod calls now store one more VALUE than before, but that should have negligible impact on overall performance. Note that re-entering vm_exec_core() used to be necessary for firing TracePoint events, but that's no longer the case since 9121e57a5f50bc91bae48b3b91edb283bf96cb6b. Closes ruby/ruby#6428 --- lib/ruby_vm/rjit/insn_compiler.rb | 1 - rjit_c.rb | 1 - vm.c | 24 +++++++++++--------- vm_core.h | 1 - vm_dump.c | 4 ++-- vm_insnhelper.c | 37 +++++++++++++++++-------------- yjit/src/codegen.rs | 1 - yjit/src/cruby.rs | 5 ++--- 8 files changed, 38 insertions(+), 36 deletions(-) diff --git a/lib/ruby_vm/rjit/insn_compiler.rb b/lib/ruby_vm/rjit/insn_compiler.rb index c03f70df03..5b2beb68ed 100644 --- a/lib/ruby_vm/rjit/insn_compiler.rb +++ b/lib/ruby_vm/rjit/insn_compiler.rb @@ -5618,7 +5618,6 @@ module RubyVM::RJIT sp_reg = iseq ? SP : :rax asm.lea(sp_reg, [SP, C.VALUE.size * sp_offset]) asm.mov([CFP, cfp_offset + C.rb_control_frame_t.offsetof(:sp)], sp_reg) - asm.mov([CFP, cfp_offset + C.rb_control_frame_t.offsetof(:__bp__)], sp_reg) # TODO: get rid of this!! # cfp->jit_return is used only for ISEQs if iseq diff --git a/rjit_c.rb b/rjit_c.rb index 735a097d69..9e0ea84f0a 100644 --- a/rjit_c.rb +++ b/rjit_c.rb @@ -1064,7 +1064,6 @@ module RubyVM::RJIT # :nodoc: all self: [self.VALUE, Primitive.cexpr!("OFFSETOF((*((struct rb_control_frame_struct *)NULL)), self)")], ep: [CType::Pointer.new { self.VALUE }, Primitive.cexpr!("OFFSETOF((*((struct rb_control_frame_struct *)NULL)), ep)")], block_code: [CType::Immediate.parse("void *"), Primitive.cexpr!("OFFSETOF((*((struct rb_control_frame_struct *)NULL)), block_code)")], - __bp__: [CType::Pointer.new { self.VALUE }, Primitive.cexpr!("OFFSETOF((*((struct rb_control_frame_struct *)NULL)), __bp__)")], jit_return: [CType::Pointer.new { CType::Immediate.parse("void") }, Primitive.cexpr!("OFFSETOF((*((struct rb_control_frame_struct *)NULL)), jit_return)")], ) end diff --git a/vm.c b/vm.c index f8cdc5a5ea..c132f9a377 100644 --- a/vm.c +++ b/vm.c @@ -198,7 +198,7 @@ VM_CAPTURED_BLOCK_TO_CFP(const struct rb_captured_block *captured) { rb_control_frame_t *cfp = ((rb_control_frame_t *)((VALUE *)(captured) - 3)); VM_ASSERT(!VM_CFP_IN_HEAP_P(GET_EC(), cfp)); - VM_ASSERT(sizeof(rb_control_frame_t)/sizeof(VALUE) == 8 + VM_DEBUG_BP_CHECK ? 1 : 0); + VM_ASSERT(sizeof(rb_control_frame_t)/sizeof(VALUE) == 7 + VM_DEBUG_BP_CHECK ? 1 : 0); return cfp; } @@ -1398,7 +1398,7 @@ invoke_block(rb_execution_context_t *ec, const rb_iseq_t *iseq, VALUE self, cons static VALUE invoke_bmethod(rb_execution_context_t *ec, const rb_iseq_t *iseq, VALUE self, const struct rb_captured_block *captured, const rb_callable_method_entry_t *me, VALUE type, int opt_pc) { - /* bmethod */ + /* bmethod call from outside the VM */ int arg_size = ISEQ_BODY(iseq)->param.size; VALUE ret; @@ -1408,7 +1408,7 @@ invoke_bmethod(rb_execution_context_t *ec, const rb_iseq_t *iseq, VALUE self, co VM_GUARDED_PREV_EP(captured->ep), (VALUE)me, ISEQ_BODY(iseq)->iseq_encoded + opt_pc, - ec->cfp->sp + arg_size, + ec->cfp->sp + 1 /* self */ + arg_size, ISEQ_BODY(iseq)->local_table_size - arg_size, ISEQ_BODY(iseq)->stack_max); @@ -1429,7 +1429,7 @@ invoke_iseq_block_from_c(rb_execution_context_t *ec, const struct rb_captured_bl const rb_cref_t *cref, int is_lambda, const rb_callable_method_entry_t *me) { const rb_iseq_t *iseq = rb_iseq_check(captured->code.iseq); - int i, opt_pc; + int opt_pc; VALUE type = VM_FRAME_MAGIC_BLOCK | (is_lambda ? VM_FRAME_FLAG_LAMBDA : 0); rb_control_frame_t *cfp = ec->cfp; VALUE *sp = cfp->sp; @@ -1448,14 +1448,18 @@ invoke_iseq_block_from_c(rb_execution_context_t *ec, const struct rb_captured_bl use_argv = vm_argv_ruby_array(av, argv, &flags, &argc, kw_splat); } - CHECK_VM_STACK_OVERFLOW(cfp, argc); + CHECK_VM_STACK_OVERFLOW(cfp, argc + 1); vm_check_canary(ec, sp); - cfp->sp = sp + argc; - for (i=0; isp = stack_argv + argc; + MEMCPY(stack_argv, use_argv, VALUE, argc); // restrict: new stack space + + opt_pc = vm_yield_setup_args(ec, iseq, argc, stack_argv, flags, passed_block_handler, (is_lambda ? arg_setup_method : arg_setup_block)); cfp->sp = sp; diff --git a/vm_core.h b/vm_core.h index 4338d65d5e..29ce46e1c3 100644 --- a/vm_core.h +++ b/vm_core.h @@ -815,7 +815,6 @@ typedef struct rb_control_frame_struct { VALUE self; /* cfp[3] / block[0] */ const VALUE *ep; /* cfp[4] / block[1] */ const void *block_code; /* cfp[5] / block[2] */ /* iseq or ifunc or forwarded block handler */ - VALUE *__bp__; /* cfp[6] */ /* outside vm_push_frame, use vm_base_ptr instead. */ #if VM_DEBUG_BP_CHECK VALUE *bp_check; /* cfp[7] */ diff --git a/vm_dump.c b/vm_dump.c index fae34d72aa..ba5434b393 100644 --- a/vm_dump.c +++ b/vm_dump.c @@ -281,14 +281,14 @@ rb_vmdebug_stack_dump_th(VALUE thval) #if VMDEBUG > 2 -/* copy from vm.c */ +/* copy from vm_insnhelper.c */ static const VALUE * vm_base_ptr(const rb_control_frame_t *cfp) { const rb_control_frame_t *prev_cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp); const VALUE *bp = prev_cfp->sp + ISEQ_BODY(cfp->iseq)->local_table_size + VM_ENV_DATA_SIZE; - if (ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_METHOD) { + if (ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_METHOD || VM_FRAME_BMETHOD_P(cfp)) { bp += 1; } return bp; diff --git a/vm_insnhelper.c b/vm_insnhelper.c index 9a96fc3b93..d33fdb8fa7 100644 --- a/vm_insnhelper.c +++ b/vm_insnhelper.c @@ -381,7 +381,6 @@ vm_push_frame(rb_execution_context_t *ec, .self = self, .ep = sp - 1, .block_code = NULL, - .__bp__ = sp, /* Store initial value of ep as bp to skip calculation cost of bp on JIT cancellation. */ #if VM_DEBUG_BP_CHECK .bp_check = sp, #endif @@ -2455,15 +2454,15 @@ double_cmp_ge(double a, double b) return RBOOL(a >= b); } +// Copied by vm_dump.c static inline VALUE * vm_base_ptr(const rb_control_frame_t *cfp) { -#if 0 // we may optimize and use this once we confirm it does not spoil performance on JIT. const rb_control_frame_t *prev_cfp = RUBY_VM_PREVIOUS_CONTROL_FRAME(cfp); if (cfp->iseq && VM_FRAME_RUBYFRAME_P(cfp)) { VALUE *bp = prev_cfp->sp + ISEQ_BODY(cfp->iseq)->local_table_size + VM_ENV_DATA_SIZE; - if (ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_METHOD) { + if (ISEQ_BODY(cfp->iseq)->type == ISEQ_TYPE_METHOD || VM_FRAME_BMETHOD_P(cfp)) { /* adjust `self' */ bp += 1; } @@ -2480,9 +2479,6 @@ vm_base_ptr(const rb_control_frame_t *cfp) else { return NULL; } -#else - return cfp->__bp__; -#endif } /* method call processes with call_info */ @@ -3693,23 +3689,30 @@ vm_call_iseq_bmethod(rb_execution_context_t *ec, rb_control_frame_t *cfp, struct const struct rb_captured_block *captured = &block->as.captured; const rb_iseq_t *iseq = rb_iseq_check(captured->code.iseq); - int i, opt_pc; - - VALUE *sp = cfp->sp - calling->argc - 1; - for (i = 0; i < calling->argc; i++) { - sp[i] = sp[i+1]; - } + VALUE * const argv = cfp->sp - calling->argc; + const int arg_size = ISEQ_BODY(iseq)->param.size; + int opt_pc; if (vm_ci_flag(calling->ci) & VM_CALL_ARGS_SIMPLE) { - opt_pc = vm_callee_setup_block_arg(ec, calling, calling->ci, iseq, sp, arg_setup_method); + opt_pc = vm_callee_setup_block_arg(ec, calling, calling->ci, iseq, argv, arg_setup_method); } else { - opt_pc = setup_parameters_complex(ec, iseq, calling, calling->ci, sp, arg_setup_method); + opt_pc = setup_parameters_complex(ec, iseq, calling, calling->ci, argv, arg_setup_method); } - cfp->sp = sp; - return invoke_bmethod(ec, iseq, calling->recv, captured, cme, - VM_FRAME_MAGIC_BLOCK | VM_FRAME_FLAG_LAMBDA, opt_pc); + cfp->sp = argv - 1; // -1 for the receiver + + vm_push_frame(ec, iseq, + VM_FRAME_MAGIC_BLOCK | VM_FRAME_FLAG_BMETHOD | VM_FRAME_FLAG_LAMBDA, + calling->recv, + VM_GUARDED_PREV_EP(captured->ep), + (VALUE)cme, + ISEQ_BODY(iseq)->iseq_encoded + opt_pc, + argv + arg_size, + ISEQ_BODY(iseq)->local_table_size - arg_size, + ISEQ_BODY(iseq)->stack_max); + + return Qundef; } static VALUE diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index d668b82301..ec84edc4c2 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -4850,7 +4850,6 @@ fn gen_push_frame( if let Some(pc) = frame.pc { asm.mov(cfp_opnd(RUBY_OFFSET_CFP_PC), pc.into()); }; - asm.mov(cfp_opnd(RUBY_OFFSET_CFP_BP), sp); asm.mov(cfp_opnd(RUBY_OFFSET_CFP_SP), sp); let iseq: Opnd = if let Some(iseq) = frame.iseq { VALUE::from(iseq).into() diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index 265748cd6d..a3ce30c589 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -718,9 +718,8 @@ mod manual_defs { pub const RUBY_OFFSET_CFP_SELF: i32 = 24; pub const RUBY_OFFSET_CFP_EP: i32 = 32; pub const RUBY_OFFSET_CFP_BLOCK_CODE: i32 = 40; - pub const RUBY_OFFSET_CFP_BP: i32 = 48; // field __bp__ - pub const RUBY_OFFSET_CFP_JIT_RETURN: i32 = 56; - pub const RUBY_SIZEOF_CONTROL_FRAME: usize = 64; + pub const RUBY_OFFSET_CFP_JIT_RETURN: i32 = 48; + pub const RUBY_SIZEOF_CONTROL_FRAME: usize = 56; // Constants from rb_execution_context_t vm_core.h pub const RUBY_OFFSET_EC_CFP: i32 = 16;