TracePoint support

This change fixes some cases where YJIT fails to fire tracing events. Most of the situations YJIT did not handle correctly involve enabling tracing while running inside generated code. A new operation to invalidate all generated code is added, which uses patching to make generated code exit at the next VM instruction boundary. A new routine called `jit_prepare_routine_call()` is introduced to facilitate this and should be used when generating code that could allocate, or could otherwise use `RB_VM_LOCK_ENTER()`. The `c_return` event is fired in the middle of an instruction as opposed to at an instruction boundary, so it requires special handling. C method call return points are patched to go to a function which does everything the interpreter does, including firing the `c_return` event. The generated code for C method calls normally does not fire the event. Invalidated code should not change after patching so the exits are not clobbered. A new variable is introduced to track the region of code that should not change.
2021-08-25 17:00:45 -04:00 · 2021-08-25 17:00:45 -04:00 · bd876c243a
--- a/README.md
+++ b/README.md
@ -32,7 +32,6 @@ To cite this repository in your publications, please use this bibtex snippet:

 YJIT is a work in progress and as such may not yet be mature enough for mission-critical software. Below is a list of known limitations, all of which we plan to eventually address:

- No support for the `TracePoint` API (see [#54](https://github.com/Shopify/yjit/issues/54)).
 - No garbage collection for generated code.

 Because there is no GC for generated code yet, your software could run out of executable memory if it is large enough. You can change how much executable memory is allocated using [YJIT's command-line options](https://github.com/Shopify/yjit#command-line-options).
--- a/bootstraptest/test_yjit.rb
+++ b/bootstraptest/test_yjit.rb
@ -1612,3 +1612,217 @@ end
 bar(123, 1.1)
 bar(123, 1.1)
 }
+
+# test enabling a line TracePoint in a C method call
+assert_equal '[[:line, true]]', %q{
+  events = []
+  events.instance_variable_set(
+    :@tp,
+    TracePoint.new(:line) { |tp| events << [tp.event, tp.lineno] if tp.path == __FILE__ }
+  )
+  def events.to_str
+    @tp.enable; ''
+  end
+
+  # Stay in generated code while enabling tracing
+  def events.compiled(obj)
+    String(obj)
+    @tp.disable; __LINE__
+  end
+
+  line = events.compiled(events)
+  events[0][-1] = (events[0][-1] == line)
+
+  events
+}
+
+# test enabling a c_return TracePoint in a C method call
+assert_equal '[[:c_return, :String, :string_alias, "events_to_str"]]', %q{
+  events = []
+  events.instance_variable_set(:@tp, TracePoint.new(:c_return) { |tp| events << [tp.event, tp.method_id, tp.callee_id, tp.return_value] })
+  def events.to_str
+    @tp.enable; 'events_to_str'
+  end
+
+  # Stay in generated code while enabling tracing
+  alias string_alias String
+  def events.compiled(obj)
+    string_alias(obj)
+    @tp.disable
+  end
+
+  events.compiled(events)
+
+  events
+}
+
+# test enabling a TracePoint that targets a particular line in a C method call
+assert_equal '[true]', %q{
+  events = []
+  events.instance_variable_set(:@tp, TracePoint.new(:line) { |tp| events << tp.lineno })
+  def events.to_str
+    @tp.enable(target: method(:compiled))
+    ''
+  end
+
+  # Stay in generated code while enabling tracing
+  def events.compiled(obj)
+    String(obj)
+    __LINE__
+  end
+
+  line = events.compiled(events)
+  events[0] = (events[0] == line)
+
+  events
+}
+
+# test enabling tracing in the middle of splatarray
+assert_equal '[true]', %q{
+  events = []
+  obj = Object.new
+  obj.instance_variable_set(:@tp, TracePoint.new(:line) { |tp| events << tp.lineno })
+  def obj.to_a
+    @tp.enable(target: method(:compiled))
+    []
+  end
+
+  # Enable tracing in the middle of the splatarray instruction
+  def obj.compiled(obj)
+    * = *obj
+    __LINE__
+  end
+
+  obj.compiled([])
+  line = obj.compiled(obj)
+  events[0] = (events[0] == line)
+
+  events
+}
+
+# test enabling tracing in the middle of opt_aref. Different since the codegen
+# for it ends in a jump.
+assert_equal '[true]', %q{
+  def lookup(hash, tp)
+    hash[42]
+    tp.disable; __LINE__
+  end
+
+  lines = []
+  tp = TracePoint.new(:line) { lines << _1.lineno if _1.path == __FILE__ }
+
+  lookup(:foo, tp)
+  lookup({}, tp)
+
+  enable_tracing_on_missing = Hash.new { tp.enable }
+
+  expected_line = lookup(enable_tracing_on_missing, tp)
+
+  lines[0] = true if lines[0] == expected_line
+
+  lines
+}
+
+# test enabling c_call tracing before compiling
+assert_equal '[[:c_call, :itself]]', %q{
+  def shouldnt_compile
+    itself
+  end
+
+  events = []
+  tp = TracePoint.new(:c_call) { |tp| events << [tp.event, tp.method_id] }
+
+  # assume first call compiles
+  tp.enable { shouldnt_compile }
+
+  events
+}
+
+# test enabling c_return tracing before compiling
+assert_equal '[[:c_return, :itself, main]]', %q{
+  def shouldnt_compile
+    itself
+  end
+
+  events = []
+  tp = TracePoint.new(:c_return) { |tp| events << [tp.event, tp.method_id, tp.return_value] }
+
+  # assume first call compiles
+  tp.enable { shouldnt_compile }
+
+  events
+}
+
+# test enabling tracing for a suspended fiber
+assert_equal '[[:return, 42]]', %q{
+  def traced_method
+    Fiber.yield
+    42
+  end
+
+  events = []
+  tp = TracePoint.new(:return) { events << [_1.event, _1.return_value] }
+  # assume first call compiles
+  fiber = Fiber.new { traced_method }
+  fiber.resume
+  tp.enable(target: method(:traced_method))
+  fiber.resume
+
+  events
+}
+
+# test compiling on non-tracing ractor then running on a tracing one
+assert_equal '[:itself]', %q{
+  def traced_method
+    itself
+  end
+
+
+  tracing_ractor = Ractor.new do
+    # 1: start tracing
+    events = []
+    tp = TracePoint.new(:c_call) { events << _1.method_id }
+    tp.enable
+    Ractor.yield(nil)
+
+    # 3: run compiled method on tracing ractor
+    Ractor.yield(nil)
+    traced_method
+
+    events
+  ensure
+    tp&.disable
+  end
+
+  tracing_ractor.take
+
+  # 2: compile on non tracing ractor
+  traced_method
+
+  tracing_ractor.take
+  tracing_ractor.take
+}
+
+# Try to hit a lazy branch stub while another ractor enables tracing
+assert_equal '42', %q{
+  def compiled(arg)
+    if arg
+      arg + 1
+    else
+      itself
+      itself
+    end
+  end
+
+  ractor = Ractor.new do
+    compiled(false)
+    Ractor.yield(nil)
+    compiled(41)
+  end
+
+  tp = TracePoint.new(:line) { itself }
+  ractor.take
+  tp.enable
+
+  ractor.take
+}
--- a/common.mk
+++ b/common.mk
@ -7024,7 +7024,6 @@ iseq.$(OBJEXT): {$(VPATH)}vm_callinfo.h
 iseq.$(OBJEXT): {$(VPATH)}vm_core.h
 iseq.$(OBJEXT): {$(VPATH)}vm_opts.h
 iseq.$(OBJEXT): {$(VPATH)}yjit.h
-iseq.$(OBJEXT): {$(VPATH)}yjit_asm.h
 load.$(OBJEXT): $(CCAN_DIR)/check_type/check_type.h
 load.$(OBJEXT): $(CCAN_DIR)/container_of/container_of.h
 load.$(OBJEXT): $(CCAN_DIR)/list/list.h
@ -16722,6 +16721,7 @@ yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/gc.h
 yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/imemo.h
 yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/object.h
 yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/re.h
+yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/sanitizers.h
 yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/serial.h
 yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/static_assert.h
 yjit_codegen.$(OBJEXT): $(top_srcdir)/internal/string.h
@ -16746,6 +16746,7 @@ yjit_codegen.$(OBJEXT): {$(VPATH)}darray.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}debug_counter.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}defines.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}encoding.h
+yjit_codegen.$(OBJEXT): {$(VPATH)}gc.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}id.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}id_table.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}insns.def
@ -16898,6 +16899,9 @@ yjit_codegen.$(OBJEXT): {$(VPATH)}missing.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}node.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}onigmo.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}oniguruma.h
+yjit_codegen.$(OBJEXT): {$(VPATH)}probes.dmyh
+yjit_codegen.$(OBJEXT): {$(VPATH)}probes.h
+yjit_codegen.$(OBJEXT): {$(VPATH)}probes_helper.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}ruby_assert.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}ruby_atomic.h
 yjit_codegen.$(OBJEXT): {$(VPATH)}st.h
--- a/iseq.c
+++ b/iseq.c
@ -3181,14 +3181,6 @@ typedef struct insn_data_struct {
 } insn_data_t;
 static insn_data_t insn_data[VM_INSTRUCTION_SIZE/2];

-
-
-
-#include "yjit_asm.h"
-
-
-
-
 void
 rb_vm_encoded_insn_data_table_init(void)
 {
@ -3305,10 +3297,6 @@ iseq_add_local_tracepoint(const rb_iseq_t *iseq, rb_event_flag_t turnon_events,

    VM_ASSERT(ISEQ_EXECUTABLE_P(iseq));

-#if USE_MJIT
-    // Force write the jit function to NULL
-    *((jit_func_t *)(&body->jit_func)) = 0;
-#endif

    for (pc=0; pc<body->iseq_size;) {
        const struct iseq_insn_info_entry *entry = get_insn_info(iseq, pc);
@ -3445,10 +3433,6 @@ rb_iseq_trace_set(const rb_iseq_t *iseq, rb_event_flag_t turnon_events)
            rb_event_flag_t pc_events = rb_iseq_event_flags(iseq, pc);
            pc += encoded_iseq_trace_instrument(&iseq_encoded[pc], pc_events & enabled_events, true);
 	}
-#if USE_MJIT
-        // Force write the jit function to NULL
-        *((jit_func_t *)(&body->jit_func)) = 0;
-#endif
    }
 }

--- a/vm_trace.c
+++ b/vm_trace.c
@ -30,6 +30,7 @@
 #include "ruby/debug.h"
 #include "vm_core.h"
 #include "ruby/ractor.h"
+#include "yjit.h"

 #include "builtin.h"

@ -97,6 +98,8 @@ update_global_event_hook(rb_event_flag_t vm_events)
        rb_clear_attr_ccs();
    }

+    yjit_tracing_invalidate_all();
+
    ruby_vm_event_flags = vm_events;
    ruby_vm_event_enabled_global_flags |= vm_events;
    rb_objspace_set_event_hook(vm_events);
@ -1212,6 +1215,8 @@ rb_tracepoint_enable_for_target(VALUE tpval, VALUE target, VALUE target_line)
        rb_raise(rb_eArgError, "can not enable any hooks");
    }

+    yjit_tracing_invalidate_all();
+
    ruby_vm_event_local_num++;

    tp->tracing = 1;
--- a/yjit.h
+++ b/yjit.h
@ -73,5 +73,6 @@ void rb_yjit_iseq_update_references(const struct rb_iseq_constant_body *body);
 void rb_yjit_iseq_free(const struct rb_iseq_constant_body *body);
 void rb_yjit_before_ractor_spawn(void);
 void yjit_constant_ic_update(const rb_iseq_t *iseq, IC ic);
+void yjit_tracing_invalidate_all(void);

 #endif // #ifndef YJIT_H
--- a/yjit_codegen.c
+++ b/yjit_codegen.c
@ -1,17 +1,20 @@
-#include <assert.h>
-#include "insns.inc"
 #include "internal.h"
+#include "insns.inc"
 #include "vm_core.h"
 #include "vm_sync.h"
 #include "vm_callinfo.h"
 #include "builtin.h"
+#include "gc.h"
 #include "internal/compile.h"
 #include "internal/class.h"
 #include "internal/object.h"
+#include "internal/sanitizers.h"
 #include "internal/string.h"
 #include "internal/variable.h"
 #include "internal/re.h"
 #include "insns_info.inc"
+#include "probes.h"
+#include "probes_helper.h"
 #include "yjit.h"
 #include "yjit_iface.h"
 #include "yjit_core.h"
@ -36,6 +39,25 @@ codeblock_t* ocb = NULL;
 // Code for exiting back to the interpreter from the leave insn
 static void *leave_exit_code;

+// Code for full logic of returning from C method and exiting to the interpreter
+static uint32_t outline_full_cfunc_return_pos;
+
+// For implementing global code invalidation
+struct codepage_patch {
+    uint32_t mainline_patch_pos; // write position in cb that gets overwritten with a jump
+    uint32_t outline_target_pos; // position in ocb that the patched-in jump goes to
+};
+
+typedef rb_darray(struct codepage_patch) patch_array_t;
+
+static patch_array_t global_inval_patches = NULL;
+
+// This number keeps track of the number of bytes counting from the beginning
+// of the page that should not be changed. After patching for global
+// invalidation, no one should make changes to the invalidated code region
+// anymore.
+uint32_t yjit_codepage_frozen_bytes = 0;
+
 // Print the current source location for debugging purposes
 RBIMPL_ATTR_MAYBE_UNUSED()
 static void
@ -156,6 +178,28 @@ jit_save_sp(jitstate_t* jit, ctx_t* ctx)
    }
 }

+// jit_save_pc() + jit_save_sp(). Should be used before calling a routine that
+// could:
+//  - Perform GC allocation
+//  - Take the VM lock through RB_VM_LOCK_ENTER()
+//  - Perform a Ruby method call
+static void
+jit_prepare_routine_call(jitstate_t *jit, ctx_t *ctx, x86opnd_t scratch_reg)
+{
+    jit->record_boundary_patch_point = true; // ask for a patch point to be recorded at the end of this instruction
+    jit_save_pc(jit, scratch_reg);
+    jit_save_sp(jit, ctx);
+}
+
+// Record the current codeblock write position for rewriting into a jump into
+// the outline block later. Used to implement global code invalidation.
+static void
+record_global_inval_patch(const codeblock_t *cb, uint32_t outline_block_target_pos)
+{
+    struct codepage_patch patch_point = { cb->write_pos, outline_block_target_pos }; // { mainline_patch_pos, outline_target_pos }
+    if (!rb_darray_append(&global_inval_patches, patch_point)) rb_bug("allocation failed"); // a failed append is reported through rb_bug()
+}
+
 static bool jit_guard_known_klass(jitstate_t *jit, ctx_t* ctx, VALUE known_klass, insn_opnd_t insn_opnd, VALUE sample_instance, const int max_chain_depth, uint8_t *side_exit);

 #if RUBY_DEBUG
@ -290,15 +334,13 @@ _counted_side_exit(uint8_t *existing_side_exit, int64_t *counter)


 // Generate an exit to return to the interpreter
-static uint8_t *
-yjit_gen_exit(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
+static uint32_t
+yjit_gen_exit(VALUE *exit_pc, ctx_t *ctx, codeblock_t *cb)
 {
-    uint8_t *code_ptr = cb_get_ptr(cb, cb->write_pos);
+    const uint32_t code_pos = cb->write_pos;

    ADD_COMMENT(cb, "exit to interpreter");

-    VALUE *exit_pc = jit->pc;
-
    // Generate the code to exit to the interpreters
    // Write the adjusted SP back into the CFP
    if (ctx->sp_offset != 0) {
@ -329,7 +371,7 @@ yjit_gen_exit(jitstate_t *jit, ctx_t *ctx, codeblock_t *cb)
    mov(cb, RAX, imm_opnd(Qundef));
    ret(cb);

-    return code_ptr;
+    return code_pos;
 }

 // Generate a continuation for gen_leave() that exits to the interpreter at REG_CFP->pc.
@ -363,7 +405,8 @@ yjit_gen_leave_exit(codeblock_t *cb)
 static uint8_t *
 yjit_side_exit(jitstate_t *jit, ctx_t *ctx)
 {
-    return yjit_gen_exit(jit, ctx, ocb);
+    uint32_t pos = yjit_gen_exit(jit->pc, ctx, ocb);
+    return cb_get_ptr(ocb, pos);
 }

 // Generate a runtime guard that ensures the PC is at the start of the iseq,
@ -399,6 +442,64 @@ yjit_pc_guard(const rb_iseq_t *iseq)
    cb_link_labels(cb);
 }

+// The code we generate in gen_send_cfunc() doesn't fire the c_return TracePoint event
+// the way the interpreter does. When tracing for c_return is enabled, we patch the code
+// after the C method return to call into this to fire the event.
+static void
+full_cfunc_return(rb_execution_context_t *ec, VALUE return_value)
+{
+    rb_control_frame_t *cfp = ec->cfp; // captured before the pop below; still read for cfp->self afterwards
+    RUBY_ASSERT_ALWAYS(cfp == GET_EC()->cfp);
+    const rb_callable_method_entry_t *me = rb_vm_frame_method_entry(cfp);
+
+    RUBY_ASSERT_ALWAYS(RUBYVM_CFUNC_FRAME_P(cfp));
+    RUBY_ASSERT_ALWAYS(me->def->type == VM_METHOD_TYPE_CFUNC);
+
+    // CHECK_CFP_CONSISTENCY("full_cfunc_return"); TODO revive this
+
+
+    // Pop the C func's frame and fire the c_return TracePoint event.
+    // Note that this is the same order as vm_call_cfunc_with_frame().
+    rb_vm_pop_frame(ec);
+    EXEC_EVENT_HOOK(ec, RUBY_EVENT_C_RETURN, cfp->self, me->def->original_id, me->called_id, me->owner, return_value);
+    // Note, this deviates from the interpreter in that users need to enable
+    // a c_return TracePoint for this DTrace hook to work. A reasonable change
+    // since the Ruby return event works this way as well.
+    RUBY_DTRACE_CMETHOD_RETURN_HOOK(ec, me->owner, me->def->original_id);
+
+    // Push return value into the caller's stack. We know that it's a frame that
+    // uses cfp->sp because we are patching a call done with gen_send_cfunc().
+    ec->cfp->sp[0] = return_value;
+    ec->cfp->sp++;
+}
+
+// Landing code for when c_return tracing is enabled. See full_cfunc_return().
+static void
+gen_full_cfunc_return(void)
+{
+    codeblock_t *cb = ocb;
+    outline_full_cfunc_return_pos = ocb->write_pos; // start of this landing code; used as the target when recording cfunc return patches
+
+    // This chunk of code expects REG_EC to be filled properly and
+    // RAX to contain the return value of the C method.
+
+    // Call full_cfunc_return()
+    mov(cb, C_ARG_REGS[0], REG_EC);
+    mov(cb, C_ARG_REGS[1], RAX);
+    call_ptr(cb, REG0, (void *)full_cfunc_return);
+
+    // Count the exit
+    GEN_COUNTER_INC(cb, traced_cfunc_return);
+
+    // Return to the interpreter
+    pop(cb, REG_SP);
+    pop(cb, REG_EC);
+    pop(cb, REG_CFP);
+
+    mov(cb, RAX, imm_opnd(Qundef));
+    ret(cb);
+}
+
 /*
 Compile an interpreter entry block to be inserted into an iseq
 Returns `NULL` if compilation fails.
@ -473,6 +574,13 @@ jit_jump_to_next_insn(jitstate_t *jit, const ctx_t *current_context)

    blockid_t jump_block = { jit->iseq, jit_next_insn_idx(jit) };

+    // We are at the end of the current instruction. Record the boundary.
+    if (jit->record_boundary_patch_point) {
+        uint32_t exit_pos = yjit_gen_exit(jit->pc + insn_len(jit->opcode), &reset_depth, ocb);
+        record_global_inval_patch(cb, exit_pos);
+        jit->record_boundary_patch_point = false;
+    }
+
    // Generate the jump instruction
    gen_direct_jump(
        jit->block,
@ -536,6 +644,14 @@ yjit_gen_block(block_t *block, rb_execution_context_t *ec)
        jit.pc = pc;
        jit.opcode = opcode;

+        // If previous instruction requested to record the boundary
+        if (jit.record_boundary_patch_point) {
+            // Generate an exit to this instruction and record it
+            uint32_t exit_pos = yjit_gen_exit(jit.pc, ctx, ocb);
+            record_global_inval_patch(cb, exit_pos);
+            jit.record_boundary_patch_point = false;
+        }
+
        // Verify our existing assumption (DEBUG)
        if (jit_at_current_insn(&jit)) {
            verify_ctx(&jit, ctx);
@ -546,7 +662,7 @@ yjit_gen_block(block_t *block, rb_execution_context_t *ec)
        if (!gen_fn) {
            // If we reach an unknown instruction,
            // exit to the interpreter and stop compiling
-            yjit_gen_exit(&jit, ctx, cb);
+            yjit_gen_exit(jit.pc, ctx, cb);
            break;
        }

@ -576,7 +692,7 @@ yjit_gen_block(block_t *block, rb_execution_context_t *ec)
            // TODO: if the codegen funcion makes changes to ctx and then return YJIT_CANT_COMPILE,
            // the exit this generates would be wrong. We could save a copy of the entry context
            // and assert that ctx is the same here.
-            yjit_gen_exit(&jit, ctx, cb);
+            yjit_gen_exit(jit.pc, ctx, cb);
            break;
        }

@ -596,6 +712,10 @@ yjit_gen_block(block_t *block, rb_execution_context_t *ec)
    // Store the index of the last instruction in the block
    block->end_idx = insn_idx;

+    // We currently can't handle cases where the request is for a block that
+    // doesn't go to the next instruction.
+    RUBY_ASSERT(!jit.record_boundary_patch_point);
+
    if (YJIT_DUMP_MODE >= 2) {
        // Dump list of compiled instrutions
        fprintf(stderr, "Compiled the following for iseq=%p:\n", (void *)iseq);
@ -735,8 +855,7 @@ gen_newarray(jitstate_t* jit, ctx_t* ctx)
    rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);

    // Save the PC and SP because we are allocating
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(sizeof(VALUE) * (uint32_t)n));

@ -760,8 +879,7 @@ gen_duparray(jitstate_t* jit, ctx_t* ctx)
    VALUE ary = jit_get_arg(jit, 0);

    // Save the PC and SP because we are allocating
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    // call rb_ary_resurrect(VALUE ary);
    jit_mov_gc_ptr(jit, cb, C_ARG_REGS[0], ary);
@ -783,8 +901,7 @@ gen_splatarray(jitstate_t* jit, ctx_t* ctx)

    // Save the PC and SP because the callee may allocate
    // Note that this modifies REG_SP, which is why we do it first
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    // Get the operands from the stack
    x86opnd_t ary_opnd = ctx_stack_pop(ctx, 1);
@ -908,8 +1025,7 @@ gen_newhash(jitstate_t* jit, ctx_t* ctx)

    if (n == 0) {
        // Save the PC and SP because we are allocating
-        jit_save_pc(jit, REG0);
-        jit_save_sp(jit, ctx);
+        jit_prepare_routine_call(jit, ctx, REG0);

        // val = rb_hash_new();
        call_ptr(cb, REG0, (void *)rb_hash_new);
@ -1559,8 +1675,7 @@ gen_setinstancevariable(jitstate_t* jit, ctx_t* ctx)

    // Save the PC and SP because the callee may allocate
    // Note that this modifies REG_SP, which is why we do it first
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    // Get the operands from the stack
    x86opnd_t val_opnd = ctx_stack_pop(ctx, 1);
@ -1611,8 +1726,7 @@ gen_defined(jitstate_t* jit, ctx_t* ctx)

    // Save the PC and SP because the callee may allocate
    // Note that this modifies REG_SP, which is why we do it first
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    // Get the operands from the stack
    x86opnd_t v_opnd = ctx_stack_pop(ctx, 1);
@ -1706,8 +1820,7 @@ gen_concatstrings(jitstate_t* jit, ctx_t* ctx)
    rb_num_t n = (rb_num_t)jit_get_arg(jit, 0);

    // Save the PC and SP because we are allocating
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(sizeof(VALUE) * (uint32_t)n));

@ -1975,15 +2088,13 @@ gen_opt_aref(jitstate_t *jit, ctx_t *ctx)

        // Call VALUE rb_hash_aref(VALUE hash, VALUE key).
        {
-            // Write incremented pc to cfp->pc as the routine can raise and allocate
-            jit_save_pc(jit, REG0);
-
            // About to change REG_SP which these operands depend on. Yikes.
            mov(cb, C_ARG_REGS[0], recv_opnd);
            mov(cb, C_ARG_REGS[1], idx_opnd);

+            // Write incremented pc to cfp->pc as the routine can raise and allocate
            // Write sp to cfp->sp since rb_hash_aref might need to call #hash on the key
-            jit_save_sp(jit, ctx);
+            jit_prepare_routine_call(jit, ctx, REG0);

            call_ptr(cb, REG0, (void *)rb_hash_aref);

@ -2009,8 +2120,7 @@ gen_opt_aset(jitstate_t *jit, ctx_t *ctx)
 {
    // Save the PC and SP because the callee may allocate
    // Note that this modifies REG_SP, which is why we do it first
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    uint8_t* side_exit = yjit_side_exit(jit, ctx);

@ -2177,8 +2287,7 @@ gen_opt_mod(jitstate_t* jit, ctx_t* ctx)
 {
    // Save the PC and SP because the callee may allocate bignums
    // Note that this modifies REG_SP, which is why we do it first
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    uint8_t* side_exit = yjit_side_exit(jit, ctx);

@ -2691,6 +2800,25 @@ gen_send_cfunc(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const
        return YJIT_CANT_COMPILE;
    }

+    // Don't JIT if tracing c_call or c_return
+    {
+        rb_event_flag_t tracing_events;
+        if (rb_multi_ractor_p()) {
+            tracing_events = ruby_vm_event_enabled_global_flags;
+        }
+        else {
+            // We could always use ruby_vm_event_enabled_global_flags,
+            // but since events are never removed from it, doing so would mean
+            // we don't compile even after tracing is disabled.
+            tracing_events = rb_ec_ractor_hooks(jit->ec)->events;
+        }
+
+        if (tracing_events & (RUBY_EVENT_C_CALL | RUBY_EVENT_C_RETURN)) {
+            GEN_COUNTER_INC(cb, send_cfunc_tracing);
+            return YJIT_CANT_COMPILE;
+        }
+    }
+
    // Delegate to codegen for C methods if we have it.
    {
        method_codegen_t known_cfunc_codegen;
@ -2842,6 +2970,9 @@ gen_send_cfunc(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const
    // Invalidation logic is in rb_yjit_method_lookup_change()
    call_ptr(cb, REG0, (void*)cfunc->func);

+    // Record code position for TracePoint patching. See full_cfunc_return().
+    record_global_inval_patch(cb, outline_full_cfunc_return_pos);
+
    // Push the return value on the Ruby stack
    x86opnd_t stack_ret = ctx_stack_push(ctx, TYPE_UNKNOWN);
    mov(cb, stack_ret, RAX);
@ -2856,7 +2987,7 @@ gen_send_cfunc(jitstate_t *jit, ctx_t *ctx, const struct rb_callinfo *ci, const
    // cfunc calls may corrupt types
    ctx_clear_local_types(ctx);

-    // Note: gen_oswb_iseq() jumps to the next instruction with ctx->sp_offset == 0
+    // Note: gen_send_iseq() jumps to the next instruction with ctx->sp_offset == 0
    // after the call, while this does not. This difference prevents
    // the two call types from sharing the same successor.

@ -3480,8 +3611,7 @@ gen_getglobal(jitstate_t* jit, ctx_t* ctx)
    ID gid = jit_get_arg(jit, 0);

    // Save the PC and SP because we might make a Ruby call for warning
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    mov(cb, C_ARG_REGS[0], imm_opnd(gid));

@ -3500,8 +3630,7 @@ gen_setglobal(jitstate_t* jit, ctx_t* ctx)

    // Save the PC and SP because we might make a Ruby call for
    // Kernel#set_trace_var
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    mov(cb, C_ARG_REGS[0], imm_opnd(gid));

@ -3519,8 +3648,7 @@ gen_tostring(jitstate_t* jit, ctx_t* ctx)
 {
    // Save the PC and SP because we might make a Ruby call for
    // Kernel#set_trace_var
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    x86opnd_t str = ctx_stack_pop(ctx, 1);
    x86opnd_t val = ctx_stack_pop(ctx, 1);
@ -3545,8 +3673,7 @@ gen_toregexp(jitstate_t* jit, ctx_t* ctx)

    // Save the PC and SP because this allocates an object and could
    // raise an exception.
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    x86opnd_t values_ptr = ctx_sp_opnd(ctx, -(sizeof(VALUE) * (uint32_t)cnt));
    ctx_stack_pop(ctx, cnt);
@ -3678,8 +3805,7 @@ gen_opt_invokebuiltin_delegate(jitstate_t *jit, ctx_t *ctx)
    }

    // If the calls don't allocate, do they need up to date PC, SP?
-    jit_save_pc(jit, REG0);
-    jit_save_sp(jit, ctx);
+    jit_prepare_routine_call(jit, ctx, REG0);

    if (bf->argc > 0) {
        // Load environment pointer EP from CFP
@ -3706,6 +3832,107 @@ gen_opt_invokebuiltin_delegate(jitstate_t *jit, ctx_t *ctx)
    return YJIT_KEEP_COMPILING;
 }

+static int tracing_invalidate_all_i(void *vstart, void *vend, size_t stride, void *data);
+static void invalidate_all_blocks_for_tracing(const rb_iseq_t *iseq);
+
+// Invalidate all generated code and patch C method return code to contain
+// logic for firing the c_return TracePoint event. Once rb_vm_barrier()
+// returns, all other ractors are pausing inside RB_VM_LOCK_ENTER(), which
+// means they are inside a C routine. If there is any generated code on-stack,
+// it is waiting for a return from a C routine. For every routine call, we
+// patch in an exit after the body of the containing VM instruction. This makes
+// it so all the invalidated code exits as soon as execution logically reaches
+// the next VM instruction.
+// The c_return event needs special handling as our codegen never outputs code
+// that contains tracing logic. If we let the normal output code run until the
+// start of the next VM instruction by relying on the patching scheme above, we
+// would fail to fire the c_return event. To handle it, we patch in the full
+// logic at the return address. See full_cfunc_return().
+// In addition to patching, we prevent future entries into invalidated code by
+// removing all live blocks from their iseq.
+void
+yjit_tracing_invalidate_all(void)
+{
+    if (!rb_yjit_enabled_p()) return;
+
+    // Stop other ractors since we are going to patch machine code.
+    RB_VM_LOCK_ENTER();
+    rb_vm_barrier();
+
+    // Make it so all live block versions are no longer valid branch targets
+    rb_objspace_each_objects(tracing_invalidate_all_i, NULL);
+
+    // Apply patches
+    const uint32_t old_pos = cb->write_pos; // saved so the write position can be restored after patching
+    rb_darray_for(global_inval_patches, patch_idx) {
+        struct codepage_patch patch = rb_darray_get(global_inval_patches, patch_idx);
+        cb_set_pos(cb, patch.mainline_patch_pos);
+        uint8_t *jump_target = cb_get_ptr(ocb, patch.outline_target_pos);
+        jmp_ptr(cb, jump_target);
+    }
+    cb_set_pos(cb, old_pos);
+
+    // Freeze invalidated part of the codepage. We only want to wait for
+    // running instances of the code to exit from now on, so we shouldn't
+    // change the code. There could be other ractors sleeping in
+    // branch_stub_hit(), for example. We could harden this by changing memory
+    // protection on the frozen range.
+    RUBY_ASSERT_ALWAYS(yjit_codepage_frozen_bytes <= old_pos && "frozen bytes should increase monotonically");
+    yjit_codepage_frozen_bytes = old_pos;
+
+    RB_VM_LOCK_LEAVE();
+}
+
+static int
+tracing_invalidate_all_i(void *vstart, void *vend, size_t stride, void *data) // callback given to rb_objspace_each_objects(); data is not used here
+{
+    VALUE v = (VALUE)vstart;
+    for (; v != (VALUE)vend; v += stride) { // step through [vstart, vend) in increments of stride
+        void *ptr = asan_poisoned_object_p(v); // NOTE(review): presumably records whether v was ASAN-poisoned -- confirm
+        asan_unpoison_object(v, false);
+
+	if (rb_obj_is_iseq(v)) { // only iseqs are processed; every other object is skipped
+            rb_iseq_t *iseq = (rb_iseq_t *)v;
+            invalidate_all_blocks_for_tracing(iseq);
+	}
+
+        asan_poison_object_if(ptr, v); // NOTE(review): presumably restores the earlier poison state -- confirm
+    }
+    return 0;
+}
+
+static void
+invalidate_all_blocks_for_tracing(const rb_iseq_t *iseq)
+{
+    struct rb_iseq_constant_body *body = iseq->body;
+    if (!body) return; // iseq yet to be initialized
+
+    ASSERT_vm_locking();
+
+    // Empty all blocks on the iseq so we don't compile new blocks that jump to the
+    // invalidated region.
+    // TODO Leaking the blocks for now since we might have situations where
+    // a different ractor is waiting in branch_stub_hit(). If we free the block
+    // that ractor can wake up with a dangling block.
+    rb_darray_for(body->yjit_blocks, version_array_idx) {
+        rb_yjit_block_array_t version_array = rb_darray_get(body->yjit_blocks, version_array_idx);
+        rb_darray_for(version_array, version_idx) {
+            // Stop listening for invalidation events like basic operation redefinition.
+            block_t *block = rb_darray_get(version_array, version_idx);
+            yjit_unlink_method_lookup_dependency(block);
+            yjit_block_assumptions_free(block);
+        }
+        rb_darray_free(version_array); // frees only the array of versions; the blocks themselves are not freed (see TODO above)
+    }
+    rb_darray_free(body->yjit_blocks);
+    body->yjit_blocks = NULL;
+
+#if USE_MJIT
+    // Reset output code entry point
+    body->jit_func = NULL;
+#endif
+}
+
 static void
 yjit_reg_method(VALUE klass, const char *mid_str, method_codegen_t gen_fn)
 {
@ -3749,6 +3976,9 @@ yjit_init_codegen(void)
    // Generate the interpreter exit code for leave
    leave_exit_code = yjit_gen_leave_exit(cb);

+    // Generate full exit code for C func
+    gen_full_cfunc_return();
+
    // Map YARV opcodes to the corresponding codegen functions
    yjit_reg_op(BIN(nop), gen_nop);
    yjit_reg_op(BIN(dup), gen_dup);
--- a/yjit_codegen.h
+++ b/yjit_codegen.h
@ -7,6 +7,7 @@
 // Code blocks we generate code into
 extern codeblock_t *cb;
 extern codeblock_t *ocb;
+extern uint32_t yjit_codepage_frozen_bytes;

 // Code generation state
 typedef struct JITState
@ -30,6 +31,10 @@ typedef struct JITState
    // This allows us to peek at run-time values
    rb_execution_context_t* ec;

+    // Whether we need to record the code address at
+    // the end of this bytecode instruction for tracing support
+    bool record_boundary_patch_point;
+
 } jitstate_t;

 typedef enum codegen_status {
--- a/yjit_core.c
+++ b/yjit_core.c
@ -506,11 +506,12 @@ static size_t get_num_versions(blockid_t blockid)
 static void
 add_block_version(blockid_t blockid, block_t* block)
 {
-    // Function entry blocks must have stack size 0
-    RUBY_ASSERT(!(block->blockid.idx == 0 && block->ctx.stack_size > 0));
    const rb_iseq_t *iseq = block->blockid.iseq;
    struct rb_iseq_constant_body *body = iseq->body;

+    // Function entry blocks must have stack size 0
+    RUBY_ASSERT(!(block->blockid.idx == 0 && block->ctx.stack_size > 0));
+
    // Ensure yjit_blocks is initialized for this iseq
    if (rb_darray_size(body->yjit_blocks) == 0) {
        // Initialize yjit_blocks to be as wide as body->iseq_encoded
@ -772,7 +773,7 @@ branch_stub_hit(branch_t* branch, const uint32_t target_idx, rb_execution_contex
        // If this block hasn't yet been compiled
        if (!p_block) {
            // If the new block can be generated right after the branch (at cb->write_pos)
-            if (cb->write_pos == branch->end_pos) {
+            if (cb->write_pos == branch->end_pos && branch->start_pos >= yjit_codepage_frozen_bytes) {
                // This branch should be terminating its block
                RUBY_ASSERT(branch->end_pos == branch->block->end_pos);

@ -801,12 +802,14 @@ branch_stub_hit(branch_t* branch, const uint32_t target_idx, rb_execution_contex
        branch->dst_addrs[target_idx] = dst_addr;

        // Rewrite the branch with the new jump target address
-        RUBY_ASSERT(branch->dst_addrs[0] != NULL);
-        uint32_t cur_pos = cb->write_pos;
-        cb_set_pos(cb, branch->start_pos);
-        branch->gen_fn(cb, branch->dst_addrs[0], branch->dst_addrs[1], branch->shape);
-        RUBY_ASSERT(cb->write_pos == branch->end_pos && "branch can't change size");
-        cb_set_pos(cb, cur_pos);
+        if (branch->start_pos >= yjit_codepage_frozen_bytes) {
+            RUBY_ASSERT(branch->dst_addrs[0] != NULL);
+            uint32_t cur_pos = cb->write_pos;
+            cb_set_pos(cb, branch->start_pos);
+            branch->gen_fn(cb, branch->dst_addrs[0], branch->dst_addrs[1], branch->shape);
+            RUBY_ASSERT(cb->write_pos == branch->end_pos && "branch can't change size");
+            cb_set_pos(cb, cur_pos);
+        }

        // Mark this branch target as patched (no longer a stub)
        branch->blocks[target_idx] = p_block;
@ -921,8 +924,7 @@ void gen_direct_jump(
    block_t* p_block = find_block_version(target0, ctx);

    // If the version already exists
-    if (p_block)
-    {
+    if (p_block) {
        rb_darray_append(&p_block->incoming, branch);

        branch->dst_addrs[0] = cb_get_ptr(cb, p_block->start_pos);
@ -934,10 +936,9 @@ void gen_direct_jump(
        gen_jump_branch(cb, branch->dst_addrs[0], NULL, SHAPE_DEFAULT);
        branch->end_pos = cb->write_pos;
    }
-    else
-    {
-        // The target block will be compiled right after this one (fallthrough)
-        // See the loop in gen_block_version()
+    else {
+        // This NULL target address signals gen_block_version() to compile the
+        // target block right after this one (fallthrough).
        branch->dst_addrs[0] = NULL;
        branch->shape = SHAPE_NEXT0;
        branch->start_pos = cb->write_pos;
@ -1048,7 +1049,7 @@ block_array_remove(rb_yjit_block_array_t block_array, block_t *block)

 // Invalidate one specific block version
 void
-invalidate_block_version(block_t* block)
+invalidate_block_version(block_t *block)
 {
    ASSERT_vm_locking();
    // TODO: want to assert that all other ractors are stopped here. Can't patch
@ -1067,8 +1068,7 @@ invalidate_block_version(block_t* block)
    uint8_t* code_ptr = cb_get_ptr(cb, block->start_pos);

    // For each incoming branch
-    rb_darray_for(block->incoming, incoming_idx)
-    {
+    rb_darray_for(block->incoming, incoming_idx) {
        branch_t* branch = rb_darray_get(block->incoming, incoming_idx);
        uint32_t target_idx = (branch->dst_addrs[0] == code_ptr)? 0:1;
        RUBY_ASSERT(branch->dst_addrs[target_idx] == code_ptr);
@ -1077,6 +1077,11 @@ invalidate_block_version(block_t* block)
        // Mark this target as being a stub
        branch->blocks[target_idx] = NULL;

+        // Don't patch frozen code region
+        if (branch->start_pos < yjit_codepage_frozen_bytes) {
+            continue;
+        }
+
        // Create a stub for this branch target
        branch->dst_addrs[target_idx] = get_branch_target(
            block->blockid,
@ -1088,8 +1093,7 @@ invalidate_block_version(block_t* block)
        // Check if the invalidated block immediately follows
        bool target_next = block->start_pos == branch->end_pos;

-        if (target_next)
-        {
+        if (target_next) {
            // The new block will no longer be adjacent
            branch->shape = SHAPE_DEFAULT;
        }
@ -1103,8 +1107,13 @@ invalidate_block_version(block_t* block)
        branch->block->end_pos = cb->write_pos;
        cb_set_pos(cb, cur_pos);

-        if (target_next && branch->end_pos > block->end_pos)
-        {
+        if (target_next && branch->end_pos > block->end_pos) {
+            fprintf(stderr, "branch_block_idx=%u block_idx=%u over=%d block_size=%d\n",
+                branch->block->blockid.idx,
+                block->blockid.idx,
+                branch->end_pos - block->end_pos,
+                block->end_pos - block->start_pos);
+            yjit_print_iseq(branch->block->blockid.iseq);
            rb_bug("yjit invalidate rewrote branch past end of invalidated block");
        }
    }
--- a/yjit_iface.c
+++ b/yjit_iface.c
@ -81,6 +81,17 @@ map_addr2insn(void *code_ptr, int insn)
    }
 }

+// For debugging. Print the disassembly of an iseq (from rb_iseq_disasm()) to stderr.
+void
+yjit_print_iseq(const rb_iseq_t *iseq)
+{
+    char *ptr;
+    long len;
+    VALUE disassembly = rb_iseq_disasm(iseq);
+    RSTRING_GETMEM(disassembly, ptr, len);
+    fprintf(stderr, "%.*s\n", (int)len, ptr);
+}
+
 int
 yjit_opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc)
 {
--- a/yjit_iface.h
+++ b/yjit_iface.h
@ -54,6 +54,7 @@ YJIT_DECLARE_COUNTERS(
    send_cfunc_ruby_array_varg,
    send_cfunc_argc_mismatch,
    send_cfunc_toomany_args,
+    send_cfunc_tracing,
    send_iseq_tailcall,
    send_iseq_arity_error,
    send_iseq_only_keywords,
@ -63,6 +64,8 @@ YJIT_DECLARE_COUNTERS(
    send_se_cf_overflow,
    send_se_protected_check_failed,

+    traced_cfunc_return,
+
    leave_se_interrupt,
    leave_interp_return,
    leave_start_pc_non_zero,
@ -105,6 +108,7 @@ RUBY_EXTERN struct rb_yjit_runtime_counters yjit_runtime_counters;
 void yjit_map_addr2insn(void *code_ptr, int insn);
 VALUE *yjit_iseq_pc_at_idx(const rb_iseq_t *iseq, uint32_t insn_idx);
 int yjit_opcode_at_pc(const rb_iseq_t *iseq, const VALUE *pc);
+void yjit_print_iseq(const rb_iseq_t *iseq);

 void check_cfunc_dispatch(VALUE receiver, struct rb_callinfo *ci, void *callee, rb_callable_method_entry_t *compile_time_cme);