diff --git a/doc/yjit/yjit.md b/doc/yjit/yjit.md index b1a4d5f022..4c1984ca6b 100644 --- a/doc/yjit/yjit.md +++ b/doc/yjit/yjit.md @@ -173,6 +173,7 @@ compiled, lower values mean less code is compiled (default 200000) - `--yjit-trace-exits`: produce a Marshal dump of backtraces from specific exits. Automatically enables `--yjit-stats` - `--yjit-max-versions=N`: maximum number of versions to generate per basic block (default 4) - `--yjit-greedy-versioning`: greedy versioning mode (disabled by default, may increase code size) +- `--yjit-perf`: Enable frame pointers and perf profiling Note that there is also an environment variable `RUBY_YJIT_ENABLE` which can be used to enable YJIT. This can be useful for some deployment scripts where specifying an extra command-line option to Ruby is not practical. @@ -428,3 +429,30 @@ While in your i386 shell, install Cargo and Homebrew, then hack away! 2. Cargo will install in $HOME/.cargo by default, and I don't know a good way to change architectures after install If you use Fish shell you can [read this link](https://tenderlovemaking.com/2022/01/07/homebrew-rosetta-and-ruby.html) for information on making the dev environment easier. + +## Profiling with Linux perf + +`--yjit-perf` allows you to profile JIT-ed methods along with other native functions using Linux perf. +When you run Ruby with `perf record`, perf looks up `/tmp/perf-{pid}.map` to resolve symbols in JIT code, +and this option lets YJIT write method symbols into that file as well as enabling frame pointers. 
+ +Here's an example way to use this option with [Firefox Profiler](https://profiler.firefox.com) +(See also: [Profiling with Linux perf](https://profiler.firefox.com/docs/#/./guide-perf-profiling)): + +```bash +# Compile the interpreter with frame pointers enabled +./configure --enable-yjit --prefix=$HOME/.rubies/ruby-yjit --disable-install-doc cflags=-fno-omit-frame-pointer +make -j && make install + +# [Optional] Allow running perf without sudo +echo 0 | sudo tee /proc/sys/kernel/kptr_restrict +echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid + +# Profile Ruby with --yjit-perf +cd ../yjit-bench +perf record --call-graph fp -- ruby --yjit-perf -Iharness-perf benchmarks/liquid-render/benchmark.rb + +# View results on Firefox Profiler https://profiler.firefox.com. +# Create /tmp/test.perf as below and upload it using "Load a profile from file". +perf script --fields +pid > /tmp/test.perf +``` diff --git a/yjit/src/asm/mod.rs b/yjit/src/asm/mod.rs index 75478814c2..ea1095734a 100644 --- a/yjit/src/asm/mod.rs +++ b/yjit/src/asm/mod.rs @@ -323,7 +323,6 @@ impl CodeBlock { } /// Return the address ranges of a given address range that this CodeBlock can write. - #[cfg(any(feature = "disasm", target_arch = "aarch64"))] #[allow(dead_code)] pub fn writable_addrs(&self, start_ptr: CodePtr, end_ptr: CodePtr) -> Vec<(usize, usize)> { let region_start = self.get_ptr(0).into_usize(); diff --git a/yjit/src/backend/x86_64/mod.rs b/yjit/src/backend/x86_64/mod.rs index 7a67429488..fe5f821372 100644 --- a/yjit/src/backend/x86_64/mod.rs +++ b/yjit/src/backend/x86_64/mod.rs @@ -498,8 +498,21 @@ impl Assembler cb.write_byte(0); }, - Insn::FrameSetup => {}, - Insn::FrameTeardown => {}, + // Set up RBP to work with frame pointer unwinding + // (e.g. 
with Linux `perf record --call-graph fp`) + Insn::FrameSetup => { + if get_option!(frame_pointer) { + push(cb, RBP); + mov(cb, RBP, RSP); + push(cb, RBP); + } + }, + Insn::FrameTeardown => { + if get_option!(frame_pointer) { + pop(cb, RBP); + pop(cb, RBP); + } + }, Insn::Add { left, right, .. } => { let opnd1 = emit_64bit_immediate(cb, right); diff --git a/yjit/src/codegen.rs b/yjit/src/codegen.rs index d396243f1c..1636a4d74b 100644 --- a/yjit/src/codegen.rs +++ b/yjit/src/codegen.rs @@ -21,6 +21,7 @@ use std::mem; use std::os::raw::c_int; use std::ptr; use std::rc::Rc; +use std::cell::RefCell; use std::slice; pub use crate::virtualmem::CodePtr; @@ -97,6 +98,9 @@ pub struct JITState { /// When true, the block is valid only when there is a total of one ractor running pub block_assumes_single_ractor: bool, + + /// Address range for Linux perf's [JIT interface](https://github.com/torvalds/linux/blob/master/tools/perf/Documentation/jit-interface.txt) + perf_map: Rc::<RefCell<Vec<(CodePtr, Option<CodePtr>, String)>>>, } impl JITState { @@ -118,6 +122,7 @@ impl JITState { bop_assumptions: vec![], stable_constant_names_assumption: None, block_assumes_single_ractor: false, + perf_map: Rc::default(), } } @@ -231,6 +236,40 @@ impl JITState { pub fn queue_outgoing_branch(&mut self, branch: PendingBranchRef) { self.pending_outgoing.push(branch) } + + /// Mark the start address of a symbol to be reported to perf + fn perf_symbol_range_start(&self, asm: &mut Assembler, symbol_name: &str) { + let symbol_name = symbol_name.to_string(); + let syms = self.perf_map.clone(); + asm.pos_marker(move |start| syms.borrow_mut().push((start, None, symbol_name.clone()))); + } + + /// Mark the end address of a symbol to be reported to perf + fn perf_symbol_range_end(&self, asm: &mut Assembler) { + let syms = self.perf_map.clone(); + asm.pos_marker(move |end| { + if let Some((_, ref mut end_store, _)) = syms.borrow_mut().last_mut() { + assert_eq!(None, *end_store); + *end_store = Some(end); + } + }); + } + + /// Flush addresses and
symbols to /tmp/perf-{pid}.map + fn flush_perf_symbols(&self, cb: &CodeBlock) { + let path = format!("/tmp/perf-{}.map", std::process::id()); + let mut f = std::fs::File::options().create(true).append(true).open(path).unwrap(); + for sym in self.perf_map.borrow().iter() { + if let (start, Some(end), name) = sym { + // In case the code straddles two pages, part of it belongs to the symbol. + for (inline_start, inline_end) in cb.writable_addrs(*start, *end) { + use std::io::Write; + let code_size = inline_end - inline_start; + writeln!(f, "{inline_start:x} {code_size:x} {name}").unwrap(); + } + } + } + } } use crate::codegen::JCCKinds::*; @@ -883,6 +922,19 @@ pub fn gen_single_block( asm_comment!(asm, "reg_temps: {:08b}", asm.ctx.get_reg_temps().as_u8()); } + // Mark the start of a method name symbol for --yjit-perf + if get_option!(perf_map) { + let comptime_recv_class = jit.peek_at_self().class_of(); + let class_name = unsafe { cstr_to_rust_string(rb_class2name(comptime_recv_class)) }; + match (class_name, unsafe { rb_iseq_label(iseq) }) { + (Some(class_name), iseq_label) if iseq_label != Qnil => { + let iseq_label = ruby_str_to_rust(iseq_label); + jit.perf_symbol_range_start(&mut asm, &format!("[JIT] {}#{}", class_name, iseq_label)); + } + _ => {}, + } + } + if asm.ctx.is_return_landing() { // Continuation of the end of gen_leave(). 
// Reload REG_SP for the current frame and transfer the return value @@ -1004,10 +1056,20 @@ pub fn gen_single_block( asm.pad_inval_patch(); } + // Mark the end of a method name symbol for --yjit-perf + if get_option!(perf_map) { + jit.perf_symbol_range_end(&mut asm); + } + // Compile code into the code block let gc_offsets = asm.compile(cb, Some(ocb)); let end_addr = cb.get_write_ptr(); + // Flush perf symbols after asm.compile() writes addresses + if get_option!(perf_map) { + jit.flush_perf_symbols(cb); + } + // If code for the block doesn't fit, fail if cb.has_dropped_bytes() || ocb.unwrap().has_dropped_bytes() { return Err(()); @@ -8681,8 +8743,6 @@ impl CodegenGlobals { #[cfg(not(test))] let (mut cb, mut ocb) = { - use std::cell::RefCell; - let virt_block: *mut u8 = unsafe { rb_yjit_reserve_addr_space(mem_size as u32) }; // Memory protection syscalls need page-aligned addresses, so check it here. Assuming diff --git a/yjit/src/cruby.rs b/yjit/src/cruby.rs index 2f9f2bedf2..6896ae9fce 100644 --- a/yjit/src/cruby.rs +++ b/yjit/src/cruby.rs @@ -577,7 +577,6 @@ pub fn rust_str_to_sym(str: &str) -> VALUE { } /// Produce an owned Rust String from a C char pointer -#[cfg(feature = "disasm")] pub fn cstr_to_rust_string(c_char_ptr: *const c_char) -> Option<String> { assert!(c_char_ptr != std::ptr::null()); diff --git a/yjit/src/options.rs b/yjit/src/options.rs index 1d8e711a81..e5e0552d7e 100644 --- a/yjit/src/options.rs +++ b/yjit/src/options.rs @@ -62,6 +62,12 @@ pub struct Options { /// Verify context objects (debug mode only) pub verify_ctx: bool, + + /// Enable generating frame pointers (for x86.
arm64 always does this) + pub frame_pointer: bool, + + /// Enable writing /tmp/perf-{pid}.map for Linux perf + pub perf_map: bool, } // Initialize the options to default values @@ -80,10 +86,12 @@ pub static mut OPTIONS: Options = Options { dump_disasm: None, verify_ctx: false, dump_iseq_disasm: None, + frame_pointer: false, + perf_map: false, }; /// YJIT option descriptions for `ruby --help`. -static YJIT_OPTIONS: [(&str, &str); 8] = [ +static YJIT_OPTIONS: [(&str, &str); 9] = [ ("--yjit-stats", "Enable collecting YJIT statistics"), ("--yjit-trace-exits", "Record Ruby source location when exiting from generated code"), ("--yjit-trace-exits-sample-rate", "Trace exit locations only every Nth occurrence"), @@ -92,6 +100,7 @@ static YJIT_OPTIONS: [(&str, &str); 8] = [ ("--yjit-cold-threshold=num", "Global call after which ISEQs not compiled (default: 200K)"), ("--yjit-max-versions=num", "Maximum number of versions per basic block (default: 4)"), ("--yjit-greedy-versioning", "Greedy versioning mode (default: disabled)"), + ("--yjit-perf", "Enable frame pointers and perf profiling"), ]; #[derive(Clone, PartialEq, Eq, Debug)] @@ -191,6 +200,16 @@ pub fn parse_option(str_ptr: *const std::os::raw::c_char) -> Option<()> { } }, + ("perf", _) => match opt_val { + "" => unsafe { + OPTIONS.frame_pointer = true; + OPTIONS.perf_map = true; + }, + "fp" => unsafe { OPTIONS.frame_pointer = true }, + "map" => unsafe { OPTIONS.perf_map = true }, + _ => return None, + }, + ("dump-disasm", _) => match opt_val { "" => unsafe { OPTIONS.dump_disasm = Some(DumpDisasm::Stdout) }, directory => { diff --git a/yjit/src/utils.rs b/yjit/src/utils.rs index 3a8f0ef590..a883e959a0 100644 --- a/yjit/src/utils.rs +++ b/yjit/src/utils.rs @@ -73,7 +73,7 @@ pub(crate) use offset_of; // Convert a CRuby UTF-8-encoded RSTRING into a Rust string. // This should work fine on ASCII strings and anything else // that is considered legal UTF-8, including embedded nulls. 
-fn ruby_str_to_rust(v: VALUE) -> String { +pub fn ruby_str_to_rust(v: VALUE) -> String { let str_ptr = unsafe { rb_RSTRING_PTR(v) } as *mut u8; let str_len: usize = unsafe { rb_RSTRING_LEN(v) }.try_into().unwrap(); let str_slice: &[u8] = unsafe { slice::from_raw_parts(str_ptr, str_len) }; diff --git a/yjit/src/yjit.rs b/yjit/src/yjit.rs index 94e2d174e6..515fa75ce8 100644 --- a/yjit/src/yjit.rs +++ b/yjit/src/yjit.rs @@ -72,6 +72,13 @@ pub extern "C" fn rb_yjit_init_rust() { println!("YJIT: rb_yjit_init_rust() panicked. Aborting."); std::process::abort(); } + + // Make sure --yjit-perf doesn't append symbols to an old file + if get_option!(perf_map) { + let perf_map = format!("/tmp/perf-{}.map", std::process::id()); + let _ = std::fs::remove_file(&perf_map); + println!("YJIT perf map: {perf_map}"); + } } /// At the moment, we abort in all cases we panic.